# Intro to Cosine Similarity

In [None]:
import pickle
import numpy as np
import matplotlib.pyplot as plt

<font color = 'green'> __Cosine similarity:__

## Application: Similarity of website visitors

<font color = 'green'> __Sparse vector:__ 

In [None]:
# Load the visitor data: a dictionary mapping visitor ID -> list of website areas visited.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open msweb.pkl if it comes from a trusted source.
with open('msweb.pkl', 'rb') as fp:
    msweb = pickle.load(fp)

This is anonymized data of visitors to `microsoft.com` collected over one week in February 1998. Each key in the dictionary is an ID assigned to a visitor; each value is a list of website areas that user visited during the one-week period.

Each of these lists can be regarded as a vector in a high-dimensional space.

In [None]:
# Preview a handful of visitors instead of dumping the entire dictionary into the output.
dict(list(msweb.items())[:5])

In [None]:
# Collect every distinct website area that appears in any visitor's list.
areas = set()
for visited in msweb.values():
    areas.update(visited)
len(areas)

In [None]:
# Average number of areas listed per visitor.
visit_counts = [len(visited) for visited in msweb.values()]
np.average(visit_counts)

The vectors are quite sparse: the average user visited about 3 website areas out of the almost 300 areas available. So, if we compare entry-by-entry, most vectors will look pretty similar, because almost all entries for both vectors will be 0.

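Cosine similarity ignores every coordinate where both vectors are 0: the dot product only picks up coordinates where both vectors are nonzero, and each magnitude depends only on that vector's own nonzero entries. This makes it good for handling sparse vectors in a high-dimensional space.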

In [None]:
def cos_sim(id1, id2, visits=None):
    """Cosine similarity between two visitors' 0/1 "areas visited" vectors.

    Each visitor is treated as a binary vector over website areas, so the dot
    product is the number of areas both visited and each squared magnitude is
    the number of distinct areas that visitor visited.

    Parameters
    ----------
    id1, id2 : hashable
        Keys into ``visits`` identifying the two visitors.
    visits : mapping, optional
        Maps visitor ID -> iterable of visited areas. Defaults to the
        notebook-level ``msweb`` dictionary.

    Returns
    -------
    float
        Similarity between 0 and 1; 0.0 if either visitor has no areas.
    """
    if visits is None:
        visits = msweb
    # Use sets for the magnitudes as well as the dot product, so a repeated
    # area in a list cannot inflate the denominator.
    areas1, areas2 = set(visits[id1]), set(visits[id2])
    if not areas1 or not areas2:
        # A zero vector has no direction; report "no similarity" instead of 0/0.
        return 0.0
    shared = len(areas1 & areas2)
    # A single sqrt of the product keeps identical visitors at exactly 1.0;
    # sqrt(n) * sqrt(n) can round to slightly less than n and push the ratio above 1.
    return shared / np.sqrt(len(areas1) * len(areas2))

In [None]:
# Distribution of similarities between visitor '10190' and every visitor in the data.
similarity_bins = np.linspace(0, 1, 21)
plt.hist([cos_sim('10190', visitor) for visitor in msweb], bins = similarity_bins)

In [None]:
# Similarity of every visitor to visitor '10190', computed once and sorted ascending.
sorted_sims = sorted(cos_sim('10190', k) for k in msweb)
# Plot against rank: once sorted, the values no longer line up with any
# particular visitor, so pairing them with visitor ID numbers on the x-axis
# would put each point at an unrelated ID.
plt.plot(range(len(sorted_sims)), sorted_sims, ',')

<font color = 'green'> __Example extension:__ <font color = 'red'> 

## Application: Similarity of documents

In [None]:
import json
import string
import random
from sklearn.feature_extraction import stop_words
from nltk import SnowballStemmer

<font color = 'green'> __Bag of words:__

<font color = 'green'> __Stop words:__

<font color = 'green'> __Stemming:__

In [None]:
# @realDonaldTrump tweets, not including retweets/likes, from 1/21/2017 to 3/30/2020
# Read explicitly as UTF-8 (the standard JSON encoding) so non-ASCII tweet text
# loads the same way regardless of the platform's default locale encoding.
with open('trump_tweets.json', encoding='utf-8') as fp:
    tweets = json.load(fp)

In [None]:
# Distinct values of the 'source' field, i.e. the clients the tweets were posted from.
{tweet['source'] for tweet in tweets}

Can we tell the difference, from a data mining perspective, between tweets from different devices? Let's look at iPhone vs. Android.

In [None]:
# Split the tweets by posting client; tweets from any other source are left out of both lists.
iphone_tweets = [tweet for tweet in tweets if tweet['source'] == 'Twitter for iPhone']
android_tweets = [tweet for tweet in tweets if tweet['source'] == 'Twitter for Android']

In [None]:
# A somewhat modified version of vectorize() from HW5
def vectorize(text, stop, stemmer):
    """Convert a text into a bag-of-words count vector.

    The text is lower-cased, ASCII punctuation is removed, and the result is
    split on whitespace. Stop words are dropped, each remaining word is
    stemmed once, and the stems are counted.

    Parameters
    ----------
    text : str
        The raw text to vectorize.
    stop : container of str
        Words to ignore; must support ``in``. Presumably all lower-case
        (verify against the stop list actually passed in).
    stemmer : object
        Anything with a ``stem(word)`` method that returns a string.

    Returns
    -------
    dict
        Maps each stem to the number of times it occurs in ``text``.
    """
    # Lower-case up front so capitalized stop words (e.g. at the start of a
    # sentence) are recognized before stemming can alter them.
    # Note: string.punctuation is ASCII-only, so characters such as curly
    # quotes are not removed here.
    cleaned = ''.join(ch for ch in text.lower() if ch not in string.punctuation)
    counts = {}
    for token in cleaned.split():
        if token in stop:
            continue
        # Stem exactly once; feeding a stem back into the stemmer can shorten
        # it further and split what should be a single count.
        stem = stemmer.stem(token)
        # Also skip words whose stem coincides with a stop word.
        if stem in stop:
            continue
        counts[stem] = counts.get(stem, 0) + 1
    return counts

One problem: an individual tweet doesn't contain enough words to be a good basis for comparison. Let's pool the contents of a random sample.

In [None]:
# Draw the same number of tweets from each device (iPhone first, then Android).
sample_size = 50
iphone_sample = random.sample(iphone_tweets, sample_size)
android_sample = random.sample(android_tweets, sample_size)

In [None]:
# Pool each sample into one space-separated string.
iphone_text = ' '.join(tweet['text'] for tweet in iphone_sample)
android_text = ' '.join(tweet['text'] for tweet in android_sample)

In [None]:
# Word-count vector for the pooled iPhone sample.
english_stemmer = SnowballStemmer("english")
vec = vectorize(iphone_text, stop_words.ENGLISH_STOP_WORDS, english_stemmer)

In [None]:
# Words in the iPhone vector, ordered from most to least frequent.
sorted(vec, key = vec.get, reverse = True)

<font color = 'green'> __What data cleaning should we consider?__ </font> <font color = 'red'> **Looking toward the end of the list, we see some links, which should be removed. Other candidates: keywords such as RT (although what's the potential problem here?), hashtags, and possibly numbers.** </font>

In [None]:
# Word-count vector for the pooled Android sample.
english_stemmer = SnowballStemmer("english")
vec_a = vectorize(android_text, stop_words.ENGLISH_STOP_WORDS, english_stemmer)

In [None]:
# Words in the Android vector, ordered from most to least frequent.
sorted(vec_a, key = vec_a.get, reverse = True)

Can we notice any differences between the words that appear most often in each sample?

In [None]:
def dot_product(u, v):
    """Dot product of two sparse vectors stored as {key: value} dictionaries.

    Only keys present in both ``u`` and ``v`` contribute; a key missing from
    either dictionary acts as a zero entry.
    """
    return sum(weight * v[word] for word, weight in u.items() if word in v)

def magnitude(u):
    """Euclidean length of a sparse vector stored as a {key: value} dictionary.

    This is the square root of the vector's dot product with itself, i.e. of
    the sum of its squared entries.
    """
    sum_of_squares = sum(entry * entry for entry in u.values())
    return np.sqrt(sum_of_squares)

def cosine_similarity(u,v):
    """Cosine of the angle between two sparse vectors stored as dictionaries.

    Computed as dot_product(u, v) divided by the product of the two
    magnitudes. If either vector has zero magnitude (for example an empty
    dictionary) the angle is undefined, and 0.0 is returned instead of
    dividing by zero.
    """
    norm_product = magnitude(u) * magnitude(v)
    if norm_product == 0:
        return 0.0
    return dot_product(u, v) / norm_product

In [None]:
# Vectorize both pooled samples with the same stop list and stemmer, then compare them.
english_stop = stop_words.ENGLISH_STOP_WORDS
english_stemmer = SnowballStemmer("english")
iphone_vec = vectorize(iphone_text, english_stop, english_stemmer)
android_vec = vectorize(android_text, english_stop, english_stemmer)
cosine_similarity(iphone_vec, android_vec)

In [None]:
# Two independent 50-tweet samples per device, so same-device and cross-device
# similarities can be compared. Draw order: iPhone 1, Android 1, iPhone 2, Android 2.
english_stop = stop_words.ENGLISH_STOP_WORDS
english_stemmer = SnowballStemmer("english")

iphone_sample_1, android_sample_1 = random.sample(iphone_tweets, 50), random.sample(android_tweets, 50)
iphone_sample_2, android_sample_2 = random.sample(iphone_tweets, 50), random.sample(android_tweets, 50)

# Pool each sample into a single string.
iphone_text_1 = ' '.join(tweet['text'] for tweet in iphone_sample_1)
android_text_1 = ' '.join(tweet['text'] for tweet in android_sample_1)
iphone_text_2 = ' '.join(tweet['text'] for tweet in iphone_sample_2)
android_text_2 = ' '.join(tweet['text'] for tweet in android_sample_2)

# One word-count vector per pooled sample.
ivec_1 = vectorize(iphone_text_1, english_stop, english_stemmer)
ivec_2 = vectorize(iphone_text_2, english_stop, english_stemmer)
avec_1 = vectorize(android_text_1, english_stop, english_stemmer)
avec_2 = vectorize(android_text_2, english_stop, english_stemmer)

In [None]:
# Printed in order: iPhone 1 vs. Android 1, iPhone 2 vs. Android 2,
# iPhone 1 vs. iPhone 2, Android 1 vs. Android 2.
vector_pairs = [(ivec_1, avec_1), (ivec_2, avec_2), (ivec_1, ivec_2), (avec_1, avec_2)]
for left_vec, right_vec in vector_pairs:
    print(cosine_similarity(left_vec, right_vec))

In [None]:
# Repeat the two-samples-per-device experiment many times, to see how the
# similarity scores are distributed instead of relying on a single draw.
n_trials = 200
sample_size = 50
english_stop = stop_words.ENGLISH_STOP_WORDS
# Built once and reused for every sample.
# NOTE(review): assumes the stemmer keeps no state between stem() calls.
english_stemmer = SnowballStemmer("english")

def pooled_sample_vector(tweet_pool):
    """Word-count vector of the pooled text of one random sample from tweet_pool."""
    sampled = random.sample(tweet_pool, sample_size)
    pooled_text = ' '.join(tweet['text'] for tweet in sampled)
    return vectorize(pooled_text, english_stop, english_stemmer)

# Columns 0 and 1: iPhone vs. Android (first and second pair of samples);
# column 2: iPhone vs. iPhone; column 3: Android vs. Android.
similarities = np.zeros((n_trials, 4))
for i in range(n_trials):
    # Samples are drawn in the order iPhone, Android, iPhone, Android.
    ivec_1 = pooled_sample_vector(iphone_tweets)
    avec_1 = pooled_sample_vector(android_tweets)
    ivec_2 = pooled_sample_vector(iphone_tweets)
    avec_2 = pooled_sample_vector(android_tweets)

    similarities[i, 0] = cosine_similarity(ivec_1, avec_1)
    similarities[i, 1] = cosine_similarity(ivec_2, avec_2)
    similarities[i, 2] = cosine_similarity(ivec_1, ivec_2)
    similarities[i, 3] = cosine_similarity(avec_1, avec_2)

In [None]:
# Same bin edges on every panel so the three histograms are directly comparable.
similarity_bins = np.arange(0.2, 0.875, 0.025)
panels = [
    (np.concatenate((similarities[:,0], similarities[:,1])), 'Similarities: iPhone vs. Android'),
    (similarities[:,2], 'Similarities: iPhone vs. iPhone'),
    (similarities[:,3], 'Similarities: Android vs. Android'),
]

fig, (ax1, ax2, ax3) = plt.subplots(3,1, figsize = (6, 12))
for ax, (values, title) in zip((ax1, ax2, ax3), panels):
    ax.hist(values, bins = similarity_bins, edgecolor = 'black')
    ax.set_title(title)
plt.show()

**Conclusions?**