In [1]:
text = 'Yesterday the weather was sunny and windy. Today it is sunny and windy too.'

In [2]:
def tokenize(text: str) -> list:
    import re
    try:
        return re.findall('\w+', text.lower())
    except AttributeError:
        return []
    """
    Splits sentences into tokens, converts the tokens into lowercase, removes punctuation
    :param text: the initial text
    :return: a list of lowercased tokens without punctuation
    e.g. text = 'The weather is sunny, the man is happy.'
    --> ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy']
    """

tok_text = tokenize(text)
tok_text

['yesterday',
 'the',
 'weather',
 'was',
 'sunny',
 'and',
 'windy',
 'today',
 'it',
 'is',
 'sunny',
 'and',
 'windy',
 'too']

In [3]:
def remove_stop_words(tokens: list, stop_words: list) -> list:
    if type(tokens) is list and all(type(s) is str for s in tokens):    # check tokens
        if type(stop_words) is list and all(type(s) is str for s in stop_words):    # check stop-words
            return [word for word in tokens if word not in stop_words]
        else:
            return tokens
    else:
        return []

    """
    Removes stop words
    :param tokens: a list of tokens
    :param stop_words: a list of stop words
    :return: a list of tokens without stop words
    e.g. tokens = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy']
    stop_words = ['the', 'is']
    --> ['weather', 'sunny', 'man', 'happy']
    """


stop_words = open('stop_words.txt').read().split()
clean_text = remove_stop_words(tok_text, stop_words)
clean_text

['yesterday', 'weather', 'sunny', 'windy', 'today', 'sunny', 'windy']

In [95]:
def calculate_frequencies(tokens: list) -> dict:
    d = {}
    if type(tokens) is list and all(type(s) is str for s in tokens):    # check tokens
        for word in set(tokens):
            d[word] = tokens.count(word)
        d = dict(sorted(d.items(), key=lambda x: x[1], reverse=True))
    return d
        
    
    
    """
    Calculates frequencies of given tokens
    :param tokens: a list of tokens without stop words
    :return: a dictionary with frequencies
    e.g. tokens = ['weather', 'sunny', 'man', 'happy']
    --> {'weather': 1, 'sunny': 1, 'man': 1, 'happy': 1}
    """

freq_dict = calculate_frequencies(clean_text)
freq_dict

{'windy': 2, 'sunny': 2, 'weather': 1, 'yesterday': 1, 'today': 1}

In [30]:
def get_top_n_words(freq_dict: dict, top_n: int) -> list:
    if type(freq_dict) is dict and all(type(s) is str for s in freq_dict) and type(top_n) is int:    # check freq_dict
        return list(freq_dict.keys())[:top_n]
    return []
    
    """
    Returns the most common words
    :param freq_dict: a dictionary with frequencies
    :param top_n: a number of the most common words to return
    :return: a list of the most common words
    e.g. tokens = ['weather', 'sunny', 'man', 'happy', 'and', 'dog', 'happy']
    top_n = 1
    --> ['happy']
    """

top = get_top_n_words(freq_dict, 3)
top

['sunny', 'windy', 'today']

In [53]:
def get_concordance(tokens: list, word: str, left_context_size: int, right_context_size: int) -> list:
    if type(tokens) is list and all(type(s) is str for s in tokens) and word in tokens:    # check tokens and word
        if left_context_size >= 1 or right_context_size >= 1:
            idx = [i for i, x in enumerate(tokens) if x == word]
            conc = []
            for i in idx:
                conc.append(tokens[i-left_context_size:i+right_context_size+1])
            return conc
    return []
    
    """
    Gets a concordance of a word
    A concordance is a listing of each occurrence of a word in a text,
    presented with the words surrounding it
    :param tokens: a list of tokens
    :param word: a word-base for a concordance
    :param left_context_size: the number of words in the left context
    :param right_context_size: the number of words in the right context
    :return: a concordance
    e.g. tokens = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy',
                    'the', 'dog', 'is', 'happy', 'but', 'the', 'cat', 'is', 'sad']
    word = 'happy'
    left_context_size = 2
    right_context_size = 3
    --> [['man', 'is', 'happy', 'the', 'dog', 'is'], ['dog', 'is', 'happy', 'but', 'the', 'cat']]
    """
conc = get_concordance(tok_text, 'sunny', 3, 1)
conc

[['the', 'weather', 'was', 'sunny', 'and'],
 ['today', 'it', 'is', 'sunny', 'and']]

In [61]:
def get_adjacent_words(tokens: list, word: str, left_n: int, right_n: int) -> list:
    conc = get_concordance(tokens, word, left_n, right_n)
    if left_n == 0:
        return [[elem[-1]] for elem in conc]
    elif right_n == 0:
        return [[elem[0]] for elem in conc]
    else:
        return [[elem[0], elem[-1]] for elem in conc]
    
    """
    Gets adjacent words from the left and right context
    :param tokens: a list of tokens
    :param word: a word-base for the search
    :param left_n: the distance between a word and an adjacent one in the left context
    :param right_n: the distance between a word and an adjacent one in the right context
    :return: a list of adjacent words
    e.g. tokens = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy',
                    'the', 'dog', 'is', 'happy', 'but', 'the', 'cat', 'is', 'sad']
    word = 'happy'
    left_n = 2
    right_n = 3
    --> [['man', 'is'], ['dog, 'cat']]
    """

adj = get_adjacent_words(tok_text, 'sunny', 3, 1)
adj

[['the', 'and'], ['today', 'and']]

In [84]:
def read_from_file(path_to_file: str) -> str:
    """
    Opens the file and reads its content
    :return: the initial text in string format
    """
    with open(path_to_file, 'r', encoding='utf-8') as fs:
        data = fs.read()

    return data


def write_to_file(path_to_file: str, content: list):
    import os
    with open(os.path.join(path_to_file, 'report.txt'), 'w', encoding='utf-8') as fs:
        fs.write('\n'.join([' '.join(k) for k in content]))

write_to_file('', conc)

In [105]:
def sort_concordance(tokens: list, word: str, left_context_size: int, right_context_size: int, left_sort: bool) -> list:
    if type(left_sort) == bool:
        conc = get_concordance(tokens, word, left_context_size, right_context_size)
        if left_sort:
            return sorted(conc, key=lambda x: x[0])
        print(conc[0][-right_context_size])
        return sorted(conc, key=lambda x: x[-right_context_size])
    return []
    """
    Gets a concordance of a word and sorts it by either left or right context
    :param tokens: a list of tokens
    :param word: a word-base for a concordance
    :param left_context_size: the number of words in the left context
    :param right_context_size: the number of words in the right context
    :param left_sort: if True, sort by the left context, False – by the right context
    :return: a concordance
    e.g. tokens = ['the', 'weather', 'is', 'sunny', 'the', 'man', 'is', 'happy',
                    'the', 'dog', 'is', 'happy', 'but', 'the', 'cat', 'is', 'sad']
    word = 'happy'
    left_context_size = 2
    right_context_size = 3
    left_sort = True
    --> [['dog', 'is', 'happy', 'but', 'the', 'cat'], ['man', 'is', 'happy', 'the', 'dog', 'is']]
    """

In [111]:
sort_concordance(tok_text, 'windy', 3, 2, True)

[['is', 'sunny', 'and', 'windy', 'too'],
 ['was', 'sunny', 'and', 'windy', 'today', 'it']]

In [98]:
conc = [['was', 'sunny', 'and'], ['is', 'sunny', 'and']]
sorted(conc, key=lambda x: x[0])

[['is', 'sunny', 'and'], ['was', 'sunny', 'and']]