1. Find the **most similar sentences or documents in your dataset using word count, TF-IDF, and word-embeddings** as your vectorization techniques. If the computation is slow, **you may subsample** for only a few thousand rows. (2 pts)



In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("reddit_worldnews_start_to_2016-11-22.csv")

In [4]:
data.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [5]:
len(data["title"])

509236

## Preprocessing

In [30]:
small_sample = data.sample(n = 5000)

In [31]:
small_sample = small_sample.reset_index(drop=True)

In [32]:
titles = small_sample["title"].values

In [33]:
titles

array(['Spanish gang steals tonne of hashish at midnight on New Year s Eve',
       'Typhoon Haiyan was the strongest storm to ever make landfall, 4th strongest ever recorded.',
       'Lonely Planet Challenged by an Innovative Travel Startup -- OurExplorer',
       ...,
       'Six Italian seismologists and one government official will be tried for the manslaughter of those who died in an earthquake that struck the city of L Aquila on 6 April 2009.',
       'Syria suffers  deadlist month . Syria has suffered its deadliest month so far in the two-year conflict, with more than 6,000 people killed in March, a third of them civilians',
       'The Internet Indians'], dtype=object)

In [34]:
# Process the reviews (tokenizing, lemmatization, removing stopwords)
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
filtered_titles = []

for title in titles:
    title = title.lower()
    token_list = word_tokenize(title) # Tokenize
    filtered_token = [t for t in token_list if not t in stop_words] # Remove stopwords
    for i in range(len(filtered_token)):
        filtered_token[i] = lemmatizer.lemmatize(filtered_token[i]) # Lemmatization
    filtered_titles.append(" ".join(filtered_token))

In [35]:
filtered_titles

['spanish gang steal tonne hashish midnight new year eve',
 'typhoon haiyan strongest storm ever make landfall , 4th strongest ever recorded .',
 'lonely planet challenged innovative travel startup -- ourexplorer',
 'picture young saudi female model posing front pavilion luxurious car company local show went viral social medium , prompting punitive action authority .',
 'colombia legalize commercial sale medical marijuana',
 'north korea blow nuke cooling tower ( photo )',
 'italy crisis : president napolitano agrees stand',
 'airbnb drop almost half berlin listing city crack short-term rental imposing fine €100,000 property owner rent vacation home without permit',
 'brazil fan stunned lopsided world cup defeat ; turn sarcastic',
 'look like somebody notice . . . . dennis nedry wrong . . . .',
 'pope francis turkey promote religious tolerance',
 'russian threatening crimean tatar opposed russian rule',
 'ukraine : tour barricade interview people , english',
 'danish premier defy eu to

## Word Count

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# vectorize the corpus
vector = vectorizer.fit_transform(filtered_titles)

In [37]:
# load vectorized corpus into Pandas dataframe
corpus_df = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names())

In [38]:
from scipy.spatial.distance import cosine
import numpy as np

pairs = {}
for i in range(len(small_sample["title"]) - 1):
    pairs[small_sample["title"][i] + " VS. " + small_sample["title"][i+1]] = \
    1 - cosine(corpus_df.iloc[i], corpus_df.iloc[i + 1])

for j in pairs.keys():
    if np.isnan(pairs[j]):
        pairs[j] = 0

In [39]:
# Find two titles that are the most "similar" based upon cosine similarity
import operator
max(pairs.items(), key=operator.itemgetter(1))

('U.S.-led airstrikes target Islamic State leaders in Iraq: report VS. Baghdad s National Museum of Iraq re-opens in rebuke to Islamic State',
 0.35355339059327373)

## TF-IDF

In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

vectorizer = TfidfVectorizer(ngram_range=(1,2))

In [108]:
vector = vectorizer.fit_transform(filtered_titles)

In [109]:
corpus_df = pd.DataFrame(vector.toarray(), columns=vectorizer.get_feature_names())

In [110]:
from scipy.spatial.distance import cosine
import numpy as np

pairs = {}
for i in range(len(small_sample["title"]) - 1):
    pairs[small_sample["title"][i] + " VS. " + small_sample["title"][i+1]] = \
    1 - cosine(corpus_df.iloc[i], corpus_df.iloc[i + 1])

for j in pairs.keys():
    if np.isnan(pairs[j]):
        pairs[j] = 0

In [111]:
# Find two titles that are the most "similar" based upon cosine similarity
import operator
max(pairs.items(), key=operator.itemgetter(1))

('Pussy Riot attacked with whips by Cossack militia at Sochi Olympics VS. Putin: Gays, lesbians welcome in Sochi for Olympics',
 0.21582148635893628)