# _Exploration: January 10, 2020_

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# import libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import random
import os

# Matplotlib
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

## _Load in Data_

In [10]:
# read the verified-account tweets; id_str is parsed as a string
verified = pd.read_json("json-data/verified_train.json", orient="split", dtype={"id_str": str})

# read the IRA tweets the same way, then downsample (fixed seed) to len(verified) rows
ira = pd.read_json("json-data/ira_train.json", orient="split", dtype={"id_str": str})
ira = ira.sample(n=len(verified), random_state=5)

# stack both sources into one frame
df = pd.concat([verified, ira])

In [11]:
# replace the "label" column with one indicator column per distinct label value
df = pd.get_dummies(data=df, columns=["label"])

In [12]:
# keep only the columns of interest, then draw a reproducible 50% sample to experiment on
sample = df.loc[
    :, ["id_str", "screen_name", "created_at", "full_text", "label_real"]
].sample(frac=0.50, random_state=5)

In [13]:
# renumber the sampled rows 0..n-1 and discard the old index
sample = sample.reset_index(drop=True)

## _Building tf-idf document vectors_

- some words occur commonly across all documents, so raw counts overstate their importance
    - give more weight to words that are exclusive to only a few documents
- can help with stopwords, search and recommender systems
- also helps with performance
- Term frequency-inverse document frequency (TF-IDF)
    - the weight is proportional to the term's frequency within the document
    - and is an inverse function of the number of documents in which the term occurs
    - higher weight --> the term is more important in relaying information about the document (more exclusive to that document)

In [15]:
# TF-IDF vectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# fit a default vectorizer on the sampled tweets and build the
# document-term matrix in a single fit_transform call
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample["full_text"])

# shape of the resulting matrix
print(tfidf_matrix.shape)

(168883, 221228)


### _Cosine similarity_

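- the dot product of two $n$-dimensional vectors: $v \cdot w = \sum_{i=1}^{n} v_i w_i$
    - magnitude: $\lVert v \rVert = \sqrt{\sum_{i=1}^{n} v_i^2}$
- cosine similarity is the dot product divided by the product of the magnitudes: $\cos(\theta) = \frac{v \cdot w}{\lVert v \rVert \, \lVert w \rVert}$
- the value is bounded between -1 and 1
    - NLP almost always uses non-negative weights, so the value falls between 0 and 1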

In [16]:
# two small example vectors
A = np.array([1, 3])
B = np.array([-2, 2])

# dot product: 1 * (-2) + 3 * 2
dot_prod = A.dot(B)

# show the result
print(dot_prod)

4


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# five short example documents
corpus = [
    'The sun is the largest celestial body in the solar system',
    'The solar system consists of the sun and eight revolving planets',
    'Ra was the Egyptian Sun God',
    'The Pyramids were the pinnacle of Egyptian architecture',
    'The quick brown fox jumps over the lazy dog',
]

# vectorize the example corpus with a fresh TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# pairwise cosine similarity between all documents
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)

[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]


## _Building a plot line based recommender_

- Text preprocessing
- Generate tf-idf vectors
- generate cosine similarity matrix
- recommender function
    - take movie title, cosine similarity matrix and indices series as arguments (in this notebook the tweet text plays the role of the movie title)
    - extract pairwise cosine similarity scores for the movie
    - sort the scores in descending order
    - output titles corresponding to the highest scores
    - ignore the highest similarity score (it is the item compared with itself)
- linear_kernel function
    - magnitude of each tf-idf vector is 1 (`TfidfVectorizer` L2-normalizes every row by default)
    - so the cosine score between two tf-idf vectors is simply their dot product
    - can significantly improve computation time

In [19]:
# linear_kernel computes plain dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# similarity matrix for the same TF-IDF vectors, this time via dot products
# (with a single argument the matrix is compared against itself)
cosine_sim = linear_kernel(tfidf_matrix)
print(cosine_sim)

[[1.         0.36413198 0.18314713 0.18435251 0.16336438]
 [0.36413198 1.         0.15054075 0.21704584 0.11203887]
 [0.18314713 0.15054075 1.         0.21318602 0.07763512]
 [0.18435251 0.21704584 0.21318602 1.         0.12960089]
 [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]


In [45]:
# draw 10,000 tweets (fixed seed) into a one-column frame with a fresh 0..n-1 index
small_subset = (
    sample["full_text"]
    .sample(n=10000, random_state=5)
    .to_frame()
    .reset_index(drop=True)
)

# fit a new TF-IDF vectorizer on the subset and build its document-term matrix
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(small_subset["full_text"])

In [46]:
import time

# Time only the similarity computation: the clock is stopped before the matrix
# is printed, so output formatting does not inflate the measurement.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations.
start = time.perf_counter()

# compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# stop the clock
elapsed = time.perf_counter() - start

# print cosine similarity matrix
print(cosine_sim)

# print time taken
print("Time taken: %s seconds" % elapsed)

[[1.         0.01767314 0.00474097 ... 0.         0.03413309 0.02301195]
 [0.01767314 1.         0.00450859 ... 0.         0.02515418 0.05672451]
 [0.00474097 0.00450859 1.         ... 0.         0.00674782 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.03413309 0.02515418 0.00674782 ... 0.         1.         0.03275291]
 [0.02301195 0.05672451 0.         ... 0.         0.03275291 1.        ]]
Time taken: 3.372642755508423 seconds


In [47]:
# Time only the similarity computation: the clock is stopped before the matrix
# is printed, so output formatting does not inflate the measurement.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations.
start = time.perf_counter()

# compute the similarity matrix as plain dot products
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# stop the clock
elapsed = time.perf_counter() - start

# print cosine similarity matrix
print(cosine_sim)

# print time taken
print("Time taken: %s seconds" % elapsed)

[[1.         0.01767314 0.00474097 ... 0.         0.03413309 0.02301195]
 [0.01767314 1.         0.00450859 ... 0.         0.02515418 0.05672451]
 [0.00474097 0.00450859 1.         ... 0.         0.00674782 0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.03413309 0.02515418 0.00674782 ... 0.         1.         0.03275291]
 [0.02301195 0.05672451 0.         ... 0.         0.03275291 1.        ]]
Time taken: 3.24194598197937 seconds


In [48]:
# Map tweet text -> row position in small_subset.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# already unique), so it never removed repeated tweet texts from the index.
# De-duplicate on the index labels instead, keeping the first occurrence, so
# that indices[text] always yields a single position rather than a Series.
indices = pd.Series(small_subset.index, index=small_subset["full_text"])
indices = indices[~indices.index.duplicated(keep="first")]

In [52]:
def get_recommendations(tweet, cosine_sim, indices, top_n=10, tweets=None):
    """
    Return the tweets most similar to an input tweet.

    Parameters
    ----------
    tweet : str
        Text of the query tweet; must be a label in ``indices``.
    cosine_sim : array-like
        Pairwise similarity matrix; row ``i`` holds the scores of tweet ``i``
        against every tweet.
    indices : pandas.Series
        Maps tweet text to its row position in ``cosine_sim``. If the same
        text appears more than once, the first position is used.
    top_n : int, default 10
        Maximum number of recommendations to return.
    tweets : pandas.Series, optional
        Tweet texts aligned positionally with ``cosine_sim``. Defaults to
        ``small_subset["full_text"]`` from the notebook namespace.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` tweets ordered from most to least similar. The query
        tweet itself is never included.

    Raises
    ------
    KeyError
        If ``tweet`` is not a label in ``indices``.
    """
    if tweets is None:
        tweets = small_subset["full_text"]

    # A text that occurs several times in the lookup index comes back as a
    # Series of positions instead of a scalar; ravel handles both cases.
    idx = int(np.ravel(indices[tweet])[0])

    # similarity of the query tweet against every tweet
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Descending order; the stable sort keeps tied scores in row order.
    order = np.argsort(-scores, kind="stable")

    # Drop the query tweet by position rather than assuming it sorts first:
    # with tied scores (duplicate tweets, or tweets whose vector is all zeros)
    # it is not guaranteed to be the top entry.
    order = order[order != idx][:top_n]

    return tweets.iloc[order]

In [53]:
# pick one tweet from small_subset (fixed seed) and display its text
single_tweet = small_subset["full_text"].sample(n=1, random_state=5)
single_tweet.iloc[0]

'What I said on the phone call with the Ukrainian President is “perfectly” stated. There is no reason to call witnesses to analyze my words and meaning. This is just another Democrat Hoax that I have had to live with from the day I got elected (and before!). Disgraceful!'

In [55]:
# TF-IDF again, this time dropping English stop words
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(small_subset["full_text"])

# pairwise dot products of the TF-IDF rows
# (with a single argument the matrix is compared against itself)
cosine_sim = linear_kernel(tfidf_matrix)

# look up the tweets most similar to the chosen one
recommends = get_recommendations(single_tweet.iloc[0], cosine_sim, indices)

In [56]:
# show the query tweet's text
print(single_tweet.iat[0])

What I said on the phone call with the Ukrainian President is “perfectly” stated. There is no reason to call witnesses to analyze my words and meaning. This is just another Democrat Hoax that I have had to live with from the day I got elected (and before!). Disgraceful!


In [57]:
# print each recommended tweet, separated by a blank line
for recommend in recommends:
    print(recommend, "\n")

#FukushimaAgain The Ukrainian president doesn`t care about anything but money and chocolate!! I'm lost for words! 

you've got the phone and the number #happy 

@KatysMyHabibi @katystumblrr meaning I need more space on my phone to do stories cause I'm a secret hoarder. 

Live fast, live long, always get the foke last word Live hard, live strong, we got the password We opening up the streets, we turning... 

Just do that 

No, I just can’t get over you. 

Another day, another #Senate Democrat #SCOTUS Fact Check. https://t.co/HLzfWQZIfc 

To many Americans, Memorial Day has lost its meaning https://t.co/wHoEfxF8sM https://t.co/T8bYQeBL77 

@AVAETC was on the phone with @tylerperry . Said he had to get off to go tweet with Ava. #ARRAY 

#Chernobyl2015 #FukushimaAgain Ukrainian president is a fool!! Tosh! 

