In [106]:
#Import relevant packages
import numpy as np
import pandas as pd
import requests

In [60]:
#Read in relevant text and csv files, and convert the stop words to a list
def _read_text(path):
    '''
    Returns the full contents of a UTF-8 text file as a single string.
    Arguments: path = Path to the text file
    '''
    # The with-block closes the file handle as soon as the text has been read
    with open(path, 'r', encoding = 'UTF-8') as handle:
        return handle.read()

aljz = _read_text('../Data/aljazeera-khashoggi.txt')
bbc = _read_text('../Data/bbc-khashoggi.txt')
bbt = _read_text('../Data/breitbart-khashoggi.txt')
cnn = _read_text('../Data/cnn-khashoggi.txt')
fox = _read_text('../Data/fox-khashoggi.txt')
# Build the list in one step so stop_words is never briefly bound to a data frame
stop_words = pd.read_csv('../Data/stop_words.csv')['word'].tolist()

In [61]:
#Now to use a lot of functions from the trigonometry-of-vectors workbook
#First, a function to tokenize each text file
def tokenize(text=None, stopwords=None):
    
    '''
    Converts a long string into all lowercase, removes periods, and splits the string into individual substrings
    on whitespace. Removes any stop words, then returns the remaining substrings as a list in their original order.
    Note that only periods are stripped; any other punctuation stays attached to its word.
    Arguments: text = A string or text. None or an empty string returns an empty list.
               stopwords = An optional collection of words to drop. Defaults to the notebook-level stop_words list.
    '''
    
    if not text:
        return []
    if stopwords is None:
        stopwords = stop_words
    # A set gives constant-time membership tests instead of scanning the whole list for every word
    excluded = set(stopwords)
    cleaned = text.lower().replace('.','')
    return [word for word in cleaned.split() if word not in excluded]

In [62]:
#Next, create a list of files, tokenize each, convert each file into a dictionary, and then into a document term matrix
#Note: despite its name, this list holds the raw article strings; tokenizing happens later, inside convert_text_to_dtm
articles_tokenized = [aljz, bbc, bbt, cnn, fox]

def convert_text_to_dtm(text):

    '''
    Builds a document term matrix (DTM) for a single text. The text is tokenized first; each distinct
    token becomes a column name and the column value is the number of times that token occurs.
    Returns the DTM as a pandas data frame.
    Arguments: text = A string or text
    '''
    
    counts = {}
    for token in tokenize(text):
        counts[token] = counts.get(token, 0) + 1
    # Wrap every count in a one-element list so pandas lays the dictionary out as a single row
    return pd.DataFrame({token: [total] for token, total in counts.items()})

def gen_DTM(texts=None):
    
    '''
    Converts each string in a list of strings into a document term matrix (DTM), then stacks them into a larger matrix.
    Fills in any missing values with 0, and returns the DTM as a multi-row pandas data frame.
    Arguments: texts = A list of strings or texts. None or an empty list returns an empty data frame.
    '''
    
    if texts is None:
        texts = []
    
    # DataFrame.append was removed in pandas 2.0, so build every per-text frame first and row bind them once
    frames = [convert_text_to_dtm(text) for text in texts]
    if not frames:
        return pd.DataFrame() # pd.concat raises a ValueError when given nothing to concatenate
    
    DTM = pd.concat(frames, ignore_index=True, sort=True) # Row bind
    return DTM.fillna(0) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)

In [63]:
#Next, build the document term matrix, then pull out each row (1 row = 1 article) as its own vector
DTM = gen_DTM(articles_tokenized)
aljz_vec, bbc_vec, bbt_vec, cnn_vec, fox_vec = (DTM.iloc[row].values for row in range(5))

In [90]:
#The directions said not to just copy the cosine calculation function. I hope this is an acceptable reworking. 
    
def cosine(a,b):
    
    '''
    Returns the cosine of the angle between two vectors: their dot product divided by the product of their lengths.
    Arguments: a = a vector
               b = a vector
    '''
    # A vector's length is the square root of its dot product with itself
    length_a = np.sqrt(np.dot(a,a))
    length_b = np.sqrt(np.dot(b,b))
    return np.dot(a,b)/(length_a*length_b)

In [91]:
#Cosine similarity between the Al-Jazeera and BBC articles
cosine(aljz_vec, bbc_vec)

0.6362638437556203

In [92]:
#Cosine similarity between the Al-Jazeera and Breitbart articles
cosine(aljz_vec, bbt_vec)

0.5252670250259308

In [93]:
#Cosine similarity between the Al-Jazeera and CNN articles
cosine(aljz_vec, cnn_vec)

0.5001486244449631

In [94]:
#Cosine similarity between the Al-Jazeera and Fox articles
cosine(aljz_vec, fox_vec)

0.6038829965072295

In [95]:
#Cosine similarity between the BBC and Breitbart articles
cosine(bbc_vec, bbt_vec)

0.5335823775905203

In [96]:
#Cosine similarity between the BBC and CNN articles
cosine(bbc_vec, cnn_vec)

0.4701307151063014

In [97]:
#Cosine similarity between the BBC and Fox articles
cosine(bbc_vec, fox_vec)

0.5747346493651461

In [98]:
#Cosine similarity between the Breitbart and CNN articles
cosine(bbt_vec, cnn_vec)

0.3250974994371181

In [99]:
#Cosine similarity between the Breitbart and Fox articles
cosine(bbt_vec, fox_vec)

0.4955390278235966

In [100]:
#Cosine similarity between the CNN and Fox articles
cosine(cnn_vec, fox_vec)

0.49054506610222515

In [101]:
#Just for fun, I am going to find the mean similarity for each text. This one averages Al-Jazeera against the other four
sum(cosine(aljz_vec, other) for other in (bbc_vec, bbt_vec, cnn_vec, fox_vec))/4

0.566390622433436

In [102]:
#Mean similarity of the BBC article against the other four
sum(cosine(bbc_vec, other) for other in (aljz_vec, bbt_vec, cnn_vec, fox_vec))/4

0.553677896454397

In [103]:
#Mean similarity of the Breitbart article against the other four
sum(cosine(bbt_vec, other) for other in (bbc_vec, aljz_vec, cnn_vec, fox_vec))/4

0.46987148246929145

In [104]:
#Mean similarity of the CNN article against the other four
sum(cosine(cnn_vec, other) for other in (bbc_vec, bbt_vec, aljz_vec, fox_vec))/4

0.44648047627265197

In [105]:
#Mean similarity of the Fox article against the other four
sum(cosine(fox_vec, other) for other in (bbc_vec, bbt_vec, cnn_vec, aljz_vec))/4

0.5411754349495493

#### Observations:

- Al-Jazeera is most similar to the BBC, and most dissimilar to CNN.

- The BBC is most similar to Al-Jazeera, and most dissimilar to CNN.

- Breitbart is most similar to the BBC, and most dissimilar to CNN.

- CNN is most similar to Al-Jazeera, and most dissimilar to Breitbart.

- Fox is most similar to Al-Jazeera, and most dissimilar to CNN.

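The cosine values range from a low of 0.325 between Breitbart and CNN to a high of 0.636 between Al-Jazeera and the BBC. The fact that CNN is the least similar of all of these texts (it is the most dissimilar article for each of the other four outlets and has the lowest mean similarity) suggests that the CNN reporting either 1) contains information that the other four do not, or 2) is missing information contained in the other four. I also calculated the mean similarity for each text. Al-Jazeera, the BBC, and Fox all have a mean similarity of about 0.55 (0.54 to 0.57), while Breitbart and CNN have a mean similarity of about 0.45 (0.45 to 0.47). I predict that expanding the list of stop words would decrease the cosine values. Adding more stop words would likely reduce the number of words in common between texts, which would decrease the dot product of the two vectors. Removing words also shortens each vector, so the cosine only decreases if the dot product shrinks proportionally more than the product of the vector lengths; I expect that to be the case here, because the words most likely to be added as stop words are common ones that appear in every article.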