1. Find the **most similar sentences or documents in your dataset using word count, TF-IDF, and word embeddings** as your vectorization techniques. If the computation is slow, **you may subsample** down to only a few thousand rows. (2 pts)



In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load the spaCy `en_core_web_md` pipeline; the resulting `nlp` callable is
# used further down to look up a vector for each vocabulary term.
import en_core_web_md
import spacy
from scipy.spatial.distance import cosine
nlp = en_core_web_md.load()

In [14]:
# Preprocess the titles (lowercasing, tokenization, stopword removal, lemmatization)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Module-level NLTK helpers used by preprocessing().
lemmatizer = WordNetLemmatizer()

# English stopwords held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

def preprocessing(titles):
    """Normalize raw titles into space-joined strings of cleaned tokens.

    Each title is lower-cased, tokenized with ``word_tokenize``, filtered
    against ``stop_words``, lemmatized with ``lemmatizer`` and stripped of
    leading/trailing punctuation. Tokens that end up empty are discarded.

    Parameters
    ----------
    titles : iterable
        Raw titles. Entries that are not strings (for example NaN read from
        a CSV) are treated as empty documents so that the output stays
        aligned, row for row, with the input.

    Returns
    -------
    list of str
        One cleaned string per input title, in input order.
    """
    filtered_titles = []
    for title in titles:
        if not isinstance(title, str):
            # Keep a placeholder so output positions still match input rows.
            filtered_titles.append("")
            continue

        tokens = word_tokenize(title.lower())
        cleaned = (
            lemmatizer.lemmatize(tok).strip(string.punctuation)
            for tok in tokens
            if tok not in stop_words
        )
        # Pure-punctuation tokens become "" once stripped; dropping them
        # avoids stray spaces in the joined string.
        filtered_titles.append(" ".join(tok for tok in cleaned if tok))
    return filtered_titles

In [15]:
# Read the dataset CSV into a DataFrame.
DATASET_PATH = "reddit_worldnews_start_to_2016-11-22.csv"
data = pd.read_csv(DATASET_PATH)

## TF-IDF weighted Word2Vec

In [16]:
# TF-IDF vectorizer
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean every title before vectorizing.
filtered_corpus = preprocessing(data["title"])

# Unigrams only, alphabetic tokens of at least three letters, ignoring terms
# that occur in more than 40% of the titles. The vocabulary is capped at
# 2000 features to keep the later steps computationally manageable.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_df=0.4,
    max_features=2000,
)

# Learn the vocabulary and compute the TF-IDF representation of the corpus.
vector = vectorizer.fit_transform(filtered_corpus)

In [17]:
# TF-IDF matrix as a dense DataFrame: one row per title, one column per
# vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

tfidf_matrix = pd.DataFrame(vector.toarray(), columns=feature_names)
tfidf_matrix.head()

Unnamed: 0,abbas,abbott,abducted,abe,able,abortion,abroad,abu,abuse,accept,...,yemeni,yet,york,young,youth,youtube,zealand,zika,zimbabwe,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Look up an embedding for every vocabulary term (the TF-IDF column labels),
# keeping the same order as the columns of tfidf_matrix.
word2vec = [
    np.array(nlp(term).vector)
    for term in tfidf_matrix.columns
]

In [19]:
# For each title, multiply each word's TF-IDF weight by that word's embedding
# vector and sum over all words: (titles x terms) @ (terms x embedding dims).
# The product is taken on the sparse TF-IDF matrix `vector`, which holds the
# same values as tfidf_matrix, so the many zero entries are skipped instead
# of being multiplied out densely.
# The result is not yet normalized by each title's total TF-IDF weight.
unweighted_matrix = pd.DataFrame(vector @ np.array(word2vec))
unweighted_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.911301,0.486986,0.47612,0.289969,0.054064,-0.871087,-0.268528,0.244177,0.637909,3.76067,...,0.146789,-0.250062,-0.508864,0.121688,0.368278,0.258315,-0.64747,-0.213734,0.223185,0.038268
1,-0.322963,-0.390803,-0.14191,0.469323,0.105719,0.086175,-0.485631,0.41103,0.093058,3.28992,...,0.568587,0.480326,0.255286,0.011377,-0.005456,-0.603722,-0.348473,0.06443,-0.196244,-0.361122
2,-0.392676,0.21415,0.293005,-0.26418,0.723595,-0.40028,-0.585914,0.070288,-0.411179,2.270911,...,0.122963,0.365254,-0.285185,-0.456324,0.554745,-0.655569,-0.201145,0.353117,-0.321552,0.659523
3,-0.495491,0.646836,-0.114876,-0.209703,-0.494729,-0.815203,-0.022069,-0.126512,0.217766,6.272454,...,-0.395573,0.316181,-0.125221,-0.427649,0.13686,-0.184962,-0.489139,-0.181235,0.021316,0.072185
4,-0.520796,0.04037,0.592133,0.061717,0.587142,0.026757,-0.25465,0.443506,-0.203645,4.090255,...,-0.18759,0.022927,0.390571,-0.07209,-0.125543,0.185563,0.015684,0.147251,-0.223659,0.559055


In [22]:
# Normalize each title's summed vector by that title's total TF-IDF weight,
# giving a TF-IDF-weighted average of its word embeddings.
tfidf_row_totals = tfidf_matrix.sum(axis=1)

# Titles with a zero total produce NaN on division; those rows are set to 0.
final_w2v = unweighted_matrix.div(tfidf_row_totals, axis=0).fillna(0)
final_w2v.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.464902,0.248437,0.242894,0.147928,0.027581,-0.444387,-0.13699,0.124567,0.325431,1.918514,...,0.074885,-0.12757,-0.259598,0.062079,0.187878,0.13178,-0.330308,-0.109037,0.113859,0.019522
1,-0.188874,-0.228548,-0.082991,0.274468,0.061826,0.050396,-0.284005,0.240377,0.054422,1.923997,...,0.332518,0.280902,0.149295,0.006654,-0.003191,-0.353066,-0.203792,0.03768,-0.114766,-0.21119
2,-0.197422,0.107666,0.147312,-0.132819,0.363795,-0.201245,-0.294575,0.035338,-0.206725,1.141724,...,0.061821,0.183635,-0.14338,-0.229422,0.278904,-0.329594,-0.101128,0.177533,-0.161663,0.331582
3,-0.222322,0.290229,-0.051544,-0.094092,-0.22198,-0.365773,-0.009902,-0.056765,0.097709,2.814384,...,-0.17749,0.141867,-0.056186,-0.191882,0.061407,-0.08299,-0.219471,-0.081318,0.009564,0.032389
4,-0.301452,0.023367,0.342744,0.035723,0.339855,0.015488,-0.147399,0.256714,-0.117876,2.367558,...,-0.108582,0.013271,0.226074,-0.041728,-0.072668,0.107409,0.009078,0.085233,-0.12946,0.323597


In [23]:
# Check the dimensions of the title-embedding matrix (rows, columns).
final_w2v.shape

(509236, 300)

## PCA

In [26]:
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 90% of the
# variance in the title embeddings.
pca = PCA(n_components=0.9)

# Fit on the embedding matrix and project every title onto the components.
pca_features = pca.fit_transform(final_w2v.to_numpy())

In [30]:
# Display the PCA-projected features (NumPy abbreviates large arrays).
pca_features

array([[ 0.42105335,  0.54087878, -0.44685801, ...,  0.07144829,
         0.13298015, -0.06865743],
       [ 0.09923412, -0.80610939,  0.17282577, ...,  0.14860976,
        -0.18306629, -0.04259061],
       [ 1.831494  , -0.26640716,  0.20388979, ...,  0.17550519,
        -0.0083703 ,  0.07863244],
       ...,
       [ 0.22043437, -0.09160902,  0.22031299, ..., -0.16984756,
        -0.06932973, -0.04581706],
       [ 0.35408339,  0.5881052 ,  0.4024719 , ..., -0.07567547,
         0.11220458,  0.18305044],
       [ 0.83183711,  0.58135148, -0.86403248, ..., -0.06194569,
         0.09968691, -0.07630058]])