# TAHLR Week 8: Feature Engineering and Syntactic Similarity

Code notebook for TAHLR course at ISAW (Fall 2023) based on Albrecht et al. 2022 (Blueprints) Ch. 5

In [None]:
# Imports

import pandas as pd
import numpy as np
import spacy
from pprint import pprint
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

# spacy.cli.download('en_core_web_sm') # Download if necessary

In [None]:
# Make a list of sentences

# Toy corpus: the opening clauses of "A Tale of Two Cities".
# Only the first sentence starts with a capital letter.
sentences = [
    "It was the best of times",
    "it was the worst of times",
    "it was the age of wisdom",
    "it was the age of foolishness",
]


In [None]:
# Tokenize sentences

# str.split() already returns the list of whitespace-separated tokens
tokenized_sentences = [sentence.split() for sentence in sentences]
pprint(tokenized_sentences)

In [None]:
# Get vocabulary
# Sort the unique tokens: the iteration order of a set of strings can differ
# between interpreter runs, which would reshuffle the one-hot columns every
# time the kernel is restarted.
vocabulary = sorted({w for s in tokenized_sentences for w in s})
pprint(vocabulary)

In [None]:
# Show dataframe (column 0: word, column 1: its position in the vocabulary)

pd.DataFrame([[word, position] for position, word in enumerate(vocabulary)])

In [None]:
# Vectorize documents

def onehot_encode(tokenized_sentence, vocab=None):
    """One-hot encode a tokenized sentence against a vocabulary.

    Parameters
    ----------
    tokenized_sentence : iterable of str
        Tokens of a single document.
    vocab : sequence of str, optional
        Vocabulary that defines the vector positions. Defaults to the
        notebook-level ``vocabulary`` so existing one-argument calls keep
        working.

    Returns
    -------
    list of int
        One entry per vocabulary word: 1 if the word occurs in the sentence,
        otherwise 0. Tokens that are not in the vocabulary are ignored.
    """
    if vocab is None:
        vocab = vocabulary
    # A set gives constant-time membership tests instead of re-scanning the
    # token list once per vocabulary word.
    tokens = set(tokenized_sentence)
    return [1 if w in tokens else 0 for w in vocab]

# One vector per sentence, in the same order as `sentences`
onehot = list(map(onehot_encode, tokenized_sentences))

for sentence, oh in zip(sentences, onehot):
    print("%s: %s" % (oh, sentence))

In [None]:
# Deal with OOV (out-of-vocabulary) tokens: the encoder only walks over
# `vocabulary`, so words that are not in it have no column and are dropped.

onehot_encode("John likes to watch movies. Mary likes movies too.".split())

In [None]:
# Show OHE (one-hot encoding) matrix: one row per sentence, one column per
# vocabulary word

pd.DataFrame(onehot, columns=vocabulary)

In [None]:
# Calculating "bitwise" similarity

# AND the two vectors position by position; the sum counts the shared words
sim = [left & right for left, right in zip(onehot[0], onehot[1])]
sum(sim)

In [None]:
# Calculating similarity, dot product

np.asarray(onehot[0]) @ np.asarray(onehot[1])

In [None]:
# Calculating similarity matrix

np.asarray(onehot) @ np.asarray(onehot).T

In [None]:
# One-hot encoding with scikit-learn

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# Learn the labels (here: tokens) and return one 0/1 row per sentence
mlb.fit_transform(tokenized_sentences)

## Blueprint: Using scikit-learn's CountVectorizer

In [None]:
# Bag of words: count how often each word occurs in each document

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()


In [None]:
# Extend the toy corpus with two unrelated sentences
more_sentences = [
    *sentences,
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

In [None]:
# Fit sentences: learn the vocabulary from the texts (no vectors are
# produced in this step)

cv.fit(more_sentences)

In [None]:
# Show params (the configuration the vectorizer was created with)

pprint(cv.get_params())

In [None]:
# Show "vocabulary": the feature names learned by cv.fit()

print(cv.get_feature_names_out())

In [None]:
# Transform sentences into a document-term matrix (`dt`) using the
# vocabulary learned by cv.fit()

dt = cv.transform(more_sentences)
dt

In [None]:
# Make DTM (document-term matrix) readable: densify the small matrix and
# label the columns with the feature names

pd.DataFrame(dt.toarray(), columns=cv.get_feature_names_out())

## Blueprint: Calculating similarities

In [None]:
# Calculate cosine similarity

from sklearn.metrics.pairwise import cosine_similarity

# Print the two sentences being compared, then their similarity
for sentence in more_sentences[:2]:
    print(sentence)

cosine_similarity(dt[0], dt[1])

In [None]:
# Similarity matrix

# With a single argument cosine_similarity compares the rows with each other
pd.DataFrame(cosine_similarity(dt))

In [None]:
# TF-IDF models
# Re-weight the raw counts in `dt`. The columns are still those of the
# CountVectorizer, hence cv.get_feature_names_out() for the labels.

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
tfidf_dt = tfidf.fit_transform(dt)
pd.DataFrame(tfidf_dt.toarray(), columns=cv.get_feature_names_out())

In [None]:
# Show similarity matrix

# With a single argument cosine_similarity compares the rows with each other
pd.DataFrame(cosine_similarity(tfidf_dt))

## Blueprint: Reducing feature dimensions

In [None]:
# Get data from remote location
# mkdir -p creates the target directory if it is missing. curl follows
# redirects (-L), keeps the remote file name (-J/-O) and writes the download
# into ../data/blueprints (--output-dir; NOTE(review): needs a reasonably
# recent curl - confirm on the target machine).

!mkdir -p ../data/blueprints
!curl -LJO https://github.com/blueprints-for-text-analytics-python/blueprints-text/raw/master/data/abcnews/abcnews-date-text.csv.gz --output-dir ../data/blueprints

In [None]:
# Read data (pandas decompresses the .gz file; publish_date is parsed as a
# date column while loading)

headlines = pd.read_csv("../data/blueprints/abcnews-date-text.csv.gz", parse_dates=["publish_date"])
print(len(headlines))
headlines.head()

In [None]:
# TF-IDF vectorization
# NB: `tfidf` and `dt` are rebound here; from now on they refer to the
# headline corpus, not to the toy sentences used earlier.

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
dt = tfidf.fit_transform(headlines["headline_text"])

In [None]:
# "Show" matrix: only the summary repr is displayed, not the values
dt

In [None]:
# The number of rows was expected, but the number of columns (the vocabulary) is really large, with almost 100,000 words. Doing the math shows that naive dense storage would need 1,103,663 * 95,878 elements at 8 bytes per float, i.e. roughly 788 GiB (about 847 GB) of RAM. This shows the incredible effectiveness of sparse matrices, as the real memory used is “only” 56,010,856 bytes (roughly 0.056 GB; found via dt.data.nbytes). It’s still a lot, but it’s manageable.

In [None]:
# Stopwords

from spacy.lang.en.stop_words import STOP_WORDS as stopwords
# Keep only the purely alphabetic entries; `stopwords` is a list from here on
stopwords = list(filter(str.isalpha, stopwords))
print(len(stopwords))

# Refit the vectorizer, this time ignoring the stop words
tfidf = TfidfVectorizer(stop_words=list(stopwords))
dt = tfidf.fit_transform(headlines["headline_text"])
dt

In [None]:
# Minimum frequency: min_df=2 drops every term that occurs in fewer than
# two headlines

tfidf = TfidfVectorizer(stop_words=stopwords, min_df=2)
dt = tfidf.fit_transform(headlines["headline_text"])
dt

## Blueprint: Improving Features by Making Them More Specific

In [None]:
# Performing linguistic analysis with spaCy

import spacy

nlp = spacy.load("en_core_web_sm")
# Coarse part-of-speech tags kept by lemmatize_nouns_adjectives_verbs().
# Despite the name, the list also includes proper nouns (PROPN) and
# adverbs (ADV).
nouns_adjectives_verbs = ["NOUN", "PROPN", "ADJ", "ADV", "VERB"]

def lemmatize(text):
    """Return the lemma of every token in `text`, joined by single spaces."""
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

def lemmatize_nouns_adjectives_verbs(text):
    """Return the space-joined lemmas of the tokens whose part-of-speech tag
    is listed in `nouns_adjectives_verbs`; all other tokens are dropped."""
    kept = []
    for token in nlp(text):
        if token.pos_ in nouns_adjectives_verbs:
            kept.append(token.lemma_)
    return " ".join(kept)

ROWS = 100000

# Lemmatizing is slow, so continue with a reproducible random subset.
# Cap the sample size at the frame length: DataFrame.sample raises a
# ValueError when asked for more rows than exist (sampling without
# replacement), e.g. when a smaller corpus is loaded.
# NB: `headlines` is rebound to the sample; every later cell works on it.
headlines = headlines.sample(min(ROWS, len(headlines)), random_state=42)

# Two derived text columns: all lemmas, and lemmas of content words only
headlines['lemmas'] = headlines['headline_text'].progress_apply(lemmatize)
headlines['nav'] = headlines['headline_text'].progress_apply(lemmatize_nouns_adjectives_verbs)

## Blueprint: Using Lemmas Instead of Words for Vectorizing Documents

In [None]:
# TF-IDF vectorization after lemmatization
# .map(str) converts every cell to a string before vectorizing
# (presumably a guard against non-string values - confirm if needed)

tfidf = TfidfVectorizer(stop_words=stopwords)
dt = tfidf.fit_transform(headlines["lemmas"].map(str))
dt

## Blueprint: Adding context via n-grams



In [None]:
# TF-IDF vectorization with n-grams
# ngram_range=(1,2) adds word pairs (bigrams) to the single words

tfidf = TfidfVectorizer(ngram_range=(1,2),
        stop_words=stopwords)
dt = tfidf.fit_transform(headlines["nav"].map(str))
dt

In [None]:
# Show matrix
# Only densify a handful of rows. `dt` has one row per headline and one
# column per uni-/bigram, so calling toarray() on the whole matrix would try
# to allocate rows x columns floats and can exhaust the available memory.

pd.DataFrame(dt[:5].toarray(), columns=tfidf.get_feature_names_out())

In [None]:
# Using custom stopwords

# NB: There are "test" headlines in the corpus
# Guard the addition and rebind instead of using `+=`, so that re-running
# this cell neither appends duplicates nor mutates the list object that
# earlier vectorizers were given.
if 'test' not in stopwords:
    stopwords = stopwords + ['test']
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1,2), min_df=2,
                        norm='l2')
dt = tfidf.fit_transform(headlines["headline_text"])

## Blueprint: Finding Most Similar Headlines to a Made-up Headline

In [None]:
# Make up new headline; transform it with the already fitted vectorizer so
# that it is expressed in the same columns as `dt`

made_up = tfidf.transform(["what is happening in sydney"])

In [None]:
# Get similarity of this made-up headline to all other headlines
# (the result has a single row with one similarity value per headline)

sim = cosine_similarity(made_up, dt)

In [None]:
# Return the position of the most similar headline, i.e. the one with the
# highest similarity

np.argmax(sim)

In [None]:
# Show that row. iloc is positional, which matches the row order of `dt`
# because `dt` was built from this same frame.

headlines.iloc[np.argmax(sim)]

In [None]:
# Another example

# Same three steps as before: vectorize, compare, look up the best match
made_up = tfidf.transform(["what is happening at the sydney opera house"])
sim = cosine_similarity(made_up, dt)
headlines.iloc[sim.argmax()]

## Blueprint: Finding the Two Most Similar Documents in a Large Corpus (Much More Difficult)

In [None]:
# You might think that finding the most similar documents in the corpus is as easy as calculating the cosine_similarity between all documents. However, this is not possible in a single step: for the full corpus the similarity matrix would have 1,103,663 × 1,103,663 = 1,218,072,017,569 elements, and more than one trillion elements do not fit in the RAM of even the most advanced computers. By computing the matrix in batches, however, it is perfectly possible to perform the necessary matrix multiplications without having to wait for ages.

In [None]:
# What we are starting with: the current document-term matrix

dt

In [None]:
# A process to batch and progressively calculate the similarity matrix

%%time
batch = 10000
max_sim = 0.0
max_a = None
max_b = None
for a in range(0, dt.shape[0], batch):
    for b in range(0, a+batch, batch):
        print(a, b)
        r = np.dot(dt[a:a+batch], np.transpose(dt[b:b+batch]))
        # eliminate identical vectors
        # by setting their similarity to 0 which gets sorted out
        r[r > 0.9999] = 0
        sim = r.max()
        if sim > max_sim:
            # argmax returns a single value which we have to
            # map to the two dimensions
            (max_a, max_b) = np.unravel_index(np.argmax(r), r.shape)
            # adjust offsets in corpus (this is a submatrix)
            max_a += a
            max_b += b
            max_sim = sim

In [None]:
# Get the batched similarity best result

# Print both members of the most similar pair
for position in (max_a, max_b):
    print(headlines.iloc[position])

## Blueprint: Finding Related Words

In [None]:
# TF-IDF for limited vocabulary
# min_df=100 keeps only the terms that occur in at least 100 headlines

tfidf_word = TfidfVectorizer(stop_words=stopwords, min_df=100)
dt_word = tfidf_word.fit_transform(headlines["headline_text"])
dt_word.shape

In [None]:
# Get similarity matrix
# Transposing makes every row a word, so the words are compared by the
# headlines they occur in.

# NB: With ~1000 results we can do this in memory
r = cosine_similarity(dt_word.T, dt_word.T)
# zero the diagonal so a word is never reported as most similar to itself
np.fill_diagonal(r, 0)

In [None]:
# Prep data: word labels and the side length of the similarity matrix

voc = tfidf_word.get_feature_names_out()
size = r.shape[0] # r is square (words x words), so one side length suffices
print(size)

In [None]:
# Explore matrix: word-by-word similarities and the matrix dimensions

print(r)
print()
print(r.shape)

In [None]:
# Explore matrix, 2: the similarity rows of the first two words

print(r[0])
print(r[1])

In [None]:
# Explore flattened matrix
# Row i of r starts at offset i * size in the flattened array, so the two
# slices below are the first entries of r[0] and r[1]. `size` is used instead
# of a hard-coded number because the vocabulary size depends on the data.

print(list(r.flatten()[:3]))
print(list(r.flatten()[size:size+3]))

In [None]:
# Get top 40 results by position
# argsort is ascending, so it is reversed before keeping the first 40; the
# values are positions in the flattened matrix.

argsorts = np.argsort(r.flatten())[::-1][:40]
argsorts

In [None]:
# Look at dividing flattened matrix by original size
# (the whole-number part of position / size is the row, i.e. the first word)

print(argsorts[0])
print(argsorts[0] / size)
voc[argsorts[0] // size]

In [None]:
# Look at dividing flattened matrix by original size, 2
# (the remainder of position / size is the column, i.e. the second word)

print(int(argsorts[0] % size))
voc[int(argsorts[0] % size)]

In [None]:
# Go through max args and print related words

for index in np.argsort(r.flatten())[::-1][0:40]:
    # split the flat position into (row, column) = (first word, second word)
    a, b = divmod(index, size)
    if a > b:  # avoid repetitions
        print('"%s" related to "%s"' % (voc[a], voc[b]))