# Text mining step by step

This is a step-by-step notebook that shows how the module we created earlier works.

## 0. - Loading Libraries

In [62]:
#from typing import  Iterable, List, Set
from typing import Any, Iterable, List, Optional, Set, Tuple

#from vectors import Vector
import math

from itertools import groupby
from operator import itemgetter
import re
import sys
import time


## 1. - Declaring variables

In [63]:
# Type alias: a word embedding is represented as a plain list of floats.
Vector = List[float]

# We want to ignore these characters,
# so that e.g. "U.S.", "U.S", "US_" and "US" are the same word.
# Raw string literal: "\W" inside a normal string is an invalid escape
# sequence and triggers a warning on recent Python versions.
ignore_char_regex = re.compile(r"[\W_]")

# Has to start and end with an alphanumeric character
# (so a valid word is always at least two characters long).
is_valid_word = re.compile(r"^[^\W_].*[^\W_]$")

## 2. - Declaring classes &  functions

### 2.1 - "Vectors" function


In [64]:
def l2_len(v: Vector) -> float:
    """Return the Euclidean (L2) length of ``v``: sqrt of the sum of squares."""
    sum_of_squares = sum(component * component for component in v)
    return math.sqrt(sum_of_squares)

def dot(v1: Vector, v2: Vector) -> float:
    """Return the dot product of two vectors of equal length.

    The equal-length requirement is enforced with an ``assert``.
    """
    assert len(v1) == len(v2)
    return sum(a * b for a, b in zip(v1, v2))

def vec_add(v1: Vector, v2: Vector) -> Vector:
    """Return the element-wise sum ``v1 + v2`` as a new list.

    Both vectors must have the same length (checked with an ``assert``).
    """
    assert len(v1) == len(v2)
    pairs = zip(v1, v2)
    return [a + b for a, b in pairs]

def vec_sub(v1: Vector, v2: Vector) -> Vector:
    """Return the element-wise difference ``v1 - v2`` as a new list.

    Both vectors must have the same length (checked with an ``assert``).
    """
    assert len(v1) == len(v2)
    pairs = zip(v1, v2)
    return [a - b for a, b in pairs]

def vec_normalize(v: Vector) -> Vector:
    """Return a copy of ``v`` scaled to unit L2 length.

    A vector whose length is zero has no direction and cannot be scaled to
    unit length; it is returned unchanged (as a new list) instead of raising
    ``ZeroDivisionError``.
    """
    length = l2_len(v)
    if length == 0:
        return list(v)
    return [x / length for x in v]

def cosine_similarity_normalized(v1: Vector, v2: Vector) -> float:
    """
    Cosine similarity of two vectors that are already unit length.

    Both vectors must have L2-norm equal to 1; under that precondition the
    cosine of the angle between them is simply their dot product, which is
    all this function computes (the inputs are not checked or re-normalized).
    The result ranges from -1 (very different) to 1 (very similar).
    """
    similarity = dot(v1, v2)
    return similarity


### 2.2. -"Word" Class

In [65]:
class Word:
    """One vocabulary entry: its text, its vector and its frequency value."""

    def __init__(self, text: str, vector: Vector, frequency: int) -> None:
        # Plain attribute storage; no validation or copying is performed.
        self.text = text
        self.vector = vector
        self.frequency = frequency

    def __repr__(self) -> str:
        # Show only the first two vector components to keep the output short.
        leading = self.vector[:2]
        vector_preview = ', '.join(str(component) for component in leading)
        return f"{self.text} [{vector_preview}, ...]"

### 2.3. - "Load" Functions

In [66]:
"""
Load the input file (see https://fasttext.cc/docs/en/english-vectors.html)
and do some cleanup.
"""

#from typing import Iterable, List, Set

#from itertools import groupby
#from operator import itemgetter
#import re
#import vectors as v
#from word import Word

def load_words(file_path: str, num_dimensions: int = 300) -> List[Word]:
    """Load the word vectors from ``file_path`` and clean them up.

    Steps, in order:
      1. load every line with ``load_words_raw``;
      2. keep only words whose vector has exactly ``num_dimensions`` entries;
      3. drop words rejected by ``remove_stop_words``;
      4. drop duplicates with ``remove_duplicates``.

    Progress is reported on stdout after loading and after steps 3 and 4.

    Args:
        file_path: Path of the vectors file to read.
        num_dimensions: Required vector length. Defaults to 300, the value
            that was previously hard-coded here.

    Returns:
        The remaining words, in their original file order.
    """
    print(f"Loading {file_path}...")
    words = load_words_raw(file_path)
    print(f"Loaded {len(words)} words.")

    # Discard malformed lines, i.e. anything without the expected vector size.
    words = [w for w in words if len(w.vector) == num_dimensions]

    words = remove_stop_words(words)
    print(f"Removed stop words, {len(words)} remain.")

    words = remove_duplicates(words)
    print(f"Removed duplicates, {len(words)} remain.")

    return words

def load_words_raw(file_path: str) -> List[Word]:
    """Load the file as-is, without doing any validation or cleanup.

    Each non-blank line is split on whitespace: the first token is the word
    text and the remaining tokens are parsed as floats and normalized with
    ``vec_normalize``. Blank lines are skipped (previously they raised
    ``IndexError``).

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        One ``Word`` per non-blank line, in file order. ``frequency`` is the
        1-based position of the word among the loaded words.

    Raises:
        ValueError: If a token after the first one is not a valid float.
    """
    def parse_line(line: str, frequency: int) -> Word:
        tokens = line.split()
        word = tokens[0]
        vector = vec_normalize([float(x) for x in tokens[1:]])
        return Word(word, vector, frequency)

    words: List[Word] = []
    with open(file_path, encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue
            # Words are sorted from the most common to the least common ones,
            # so the running position doubles as a frequency rank.
            words.append(parse_line(line, len(words) + 1))
    return words

def iter_len(iter: Iterable[complex]) -> int:
    """Count the items of an iterable by consuming it (works for iterators)."""
    count = 0
    for _ in iter:
        count += 1
    return count

def most_common_dimension(words: List[Word]) -> int:
    """
    Print how many vectors exist for each vector length and return the
    length that occurs most often.

    Needed because there is a line in the input file which is missing a word
    (search -0.0739, -0.135, 0.0584), so not every vector has the same size.
    """
    # groupby only merges adjacent equal values, hence the sort.
    ordered_lengths = sorted(len(word.vector) for word in words)
    dimensions = []
    for dim, group in groupby(ordered_lengths):
        dimensions.append((dim, iter_len(group)))

    print("Dimensions:")
    for dim, num_vectors in dimensions:
        print(f"{num_vectors} {dim}-dimensional vectors")

    # Rank by number of vectors, largest first, and take the winner.
    ranked = sorted(dimensions, key=itemgetter(1), reverse=True)
    return ranked[0][0]


def remove_duplicates(words: List[Word]) -> List[Word]:
    """Keep only the first word for every canonical spelling.

    The canonical spelling is the word text with every character matched by
    ``ignore_char_regex`` removed. The original ordering is preserved.
    """
    seen: Set[str] = set()
    kept: List[Word] = []
    for candidate in words:
        key = ignore_char_regex.sub("", candidate.text)
        if key in seen:
            continue
        seen.add(key)
        kept.append(candidate)
    return kept

def remove_stop_words(words: List[Word]) -> List[Word]:
    """Return the words that are longer than one character and whose text
    matches ``is_valid_word``; the input order is preserved."""
    kept: List[Word] = []
    for candidate in words:
        text = candidate.text
        if len(text) <= 1:
            continue
        if not is_valid_word.match(text):
            continue
        kept.append(candidate)
    return kept




### 2.4 "cosinesimilarity10words" FUNCTION


In [67]:
######
# 1) # Loading required libraries
######

#from typing import Any, Iterable, List, Optional, Set, Tuple

#from load import load_words
#import math
#import vectors as v
#from vectors import Vector
#from word import Word

# Timing info for most_similar (100k words):
# Original version: 7.3s
# Normalized vectors: 3.4s

In [68]:
######
# 2) # Declaring functions
######

# 2.1
def most_similar(base_vector: Vector, words: List[Word]) -> List[Tuple[float, Word]]:
    """Rank every word by cosine similarity to ``base_vector``.

    Returns a list of ``(similarity, word)`` pairs covering all of ``words``,
    sorted so that the most similar word (similarity closest to 1) comes
    first. No truncation is done here; callers slice the result themselves.
    """
    scored = []
    for w in words:
        scored.append((cosine_similarity_normalized(base_vector, w.vector), w))
    # Largest similarity first.
    scored.sort(key=itemgetter(0), reverse=True)
    return scored

In [69]:
# 2.2

def print_most_similar(words: List[Word], text: str, count: int = 10) -> None:
    """Print the words most similar to ``text``.

    Looks ``text`` up in ``words`` (exact match). When it is not found an
    "Unknown word" message is printed and nothing else happens. Otherwise a
    header line and a comma-separated list of related words are printed.
    Entries equal to the base word ignoring case are left out.

    Args:
        words: Vocabulary to search and to rank.
        text: Text of the base word.
        count: Maximum number of related words to print (default 10).
    """
    base_word = find_word(text, words)

    if base_word is None:
        print(f"Unknown word: {text}")
        return
    print(f"\n\tWords related to {base_word.text}:")
    # Lower-case the base word once instead of once per candidate.
    base_lower = base_word.text.lower()
    related = [
        word.text
        for (_, word) in most_similar(base_word.vector, words)
        if word.text.lower() != base_lower
    ]
    print(', '.join(related[:count]))

In [70]:
# 2.4

def find_word(text: str, words: List[Word]) -> Optional[Word]:
    """Return the first word whose text equals ``text`` exactly
    (case-sensitive), or ``None`` when there is no such word."""
    matches = (w for w in words if text == w.text)
    return next(matches, None)

In [71]:
# 2.5

def closest_analogies(
    left2: str, left1: str, right2: str, words: List[Word]
) -> List[Tuple[float, Word]]:
    """Suggest completions for the analogy "left2-left1 is like right2-?".

    The vector ``left1 - left2 + right2`` is compared against every word and
    the 10 most similar candidates are taken; redundant candidates are then
    removed from those 10. Returns ``(similarity, word)`` pairs, best first,
    or an empty list when any of the three input words is not in ``words``.
    """
    word_left1 = find_word(left1, words)
    word_left2 = find_word(left2, words)
    word_right2 = find_word(right2, words)
    if not (word_left1 and word_left2 and word_right2):
        return []

    offset = vec_sub(word_left1.vector, word_left2.vector)
    target = vec_add(offset, word_right2.vector)
    candidates = most_similar(target, words)[:10]

    def is_redundant(word: str) -> bool:
        """
        Sometimes the two left vectors are so close the answer is e.g.
        "shirt-clothing is like phone-phones". Skip 'phones' and get the next
        suggestion, which might be more interesting.
        """
        lowered = word.lower()
        for given in (left1, left2, right2):
            if given.lower() in lowered:
                return True
        return False

    return [(dist, w) for (dist, w) in candidates if not is_redundant(w.text)]

In [72]:
#2.6

def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None:
    """Print "left2-left1 is like right2-X", where X is the text of the best
    suggestion from ``closest_analogies`` or "?" when there is none."""
    analogies = closest_analogies(left2, left1, right2, words)
    # Only the top suggestion is shown; its similarity score is not printed.
    answer = analogies[0][1].text if analogies else "?"
    print(f"\t{left2}-{left1} is like {right2}-{answer}\n")

## 4. - Executing code

### Testing code for functions previously loaded

In [73]:


# >>>> remove stop words FUNCTION

# Run "smoke tests" on import - 
# remove_stop_words: one-character texts and texts that start or end with a
# non-alphanumeric character are dropped; repeated texts are NOT merged here.
assert [
    kept.text
    for kept in remove_stop_words(
        [Word(text, [], 1) for text in ('a', 'ab', '-ab', 'ab_', 'a.', '.a', 'ab')]
    )
] == ['ab', 'ab']
# remove_duplicates: texts that only differ in ignored characters collapse
# onto the first occurrence.
assert [
    kept.text
    for kept in remove_duplicates(
        [Word(text, [], 1) for text in ('a.b', '-a-b', 'ab_+', '.abc...')]
    )
] == ['a.b', '.abc...']



This is the test script from the original source, used to see how accurate the predictions are on a few examples.
It does not need any input argument; just run it. To pass a target word as an argument, it is better to run the script from the command prompt:

**cosinesimilarity10words.py + wordTarget**



In [75]:
######
# 3) # Load vec file
######
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock, which can be adjusted.
start_time = time.perf_counter()

#O# words = load_words('data/words.vec')
words = load_words('../data/wordvectors/words_test2.vec') #Running the biggest wordvector provided by fasttext
print("--- %s seconds ---" % (time.perf_counter() - start_time))

Loading ../data/wordvectors/words_test2.vec...
Loaded 120221 words.
Removed stop words, 118458 remain.
Removed duplicates, 116380 remain.
--- 29.106871843338013 seconds ---


### Getting 10 similar words to...

In [77]:
# Print the 10 most similar words to SUGAR!
# Timed with perf_counter(), the monotonic clock meant for elapsed time.
start_time = time.perf_counter()
print_most_similar(words, "sugar")# str(sys.argv[1]))
print("\n--- %s seconds ---" % (time.perf_counter() - start_time))


	Words related to sugar:
sugars, sucrose, syrup, glucose, molasses, fructose, sugary, sugarcane, carbohydrates, cocoa


In [78]:
# Print the 10 most similar words to DIABETES!
# Timed with perf_counter(), the monotonic clock meant for elapsed time.
start_time = time.perf_counter()
print_most_similar(words, "diabetes")# str(sys.argv[1]))
print("\n--- %s seconds ---" % (time.perf_counter() - start_time))


	Words related to diabetes:
diabetic, diabetics, hypertension, mellitus, obesity, insulin, hypoglycemia, asthma, Diabetic, disease

--- 5.120102643966675 seconds ---
