In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
def _read_lines(path, errors=None):
    """Return every line of the text file at ``path`` as a list of strings.

    ``errors`` is passed straight to ``open``; use ``'ignore'`` for files
    holding symbols that the default encoding cannot decode. Raises
    ``ValueError`` when the file has no lines at all, because the article
    text is taken from the first line.
    """
    with open(path, 'r', errors=errors) as f:
        lines = f.readlines()
    if not lines:
        raise ValueError('{} is empty; expected the article text on its first line'.format(path))
    return lines

#read each article; the .. is the folder one step outward from the current directory
alj = _read_lines('../Data/aljazeera-khashoggi.txt')
bbc = _read_lines('../Data/bbc-khashoggi.txt')
#ignore decoding errors for these two files: the default settings fail on them, some sort of strange symbol must be present
bbt = _read_lines('../Data/breitbart-khashoggi.txt', errors='ignore')
cnn = _read_lines('../Data/cnn-khashoggi.txt')
fox = _read_lines('../Data/fox-khashoggi.txt', errors='ignore')
#each article is written along the first line of its file, so keep only line [0]
texts = [lines[0] for lines in (alj, bbc, bbt, cnn, fox)]

#stop words are written a single word per row in the csv file, so take the first
#field of every row; blank rows come back as empty lists and are skipped
with open('../Data/stop_words.csv', newline='') as stop:
    stopwords = [row[0] for row in csv.reader(stop) if row]

In [4]:
def tokenize(text=None, stop_words=None):
    """Split ``text`` into lower-case word tokens with stop words removed.

    Parameters
    ----------
    text : str or None
        The raw text. ``None`` or an empty string gives an empty list.
    stop_words : iterable of str or None
        Words to drop from the result. When ``None`` the notebook-level
        ``stopwords`` list is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if not text:
        return []
    if stop_words is None:
        #fall back to the stop words loaded from the csv file
        stop_words = stopwords
    #a set makes every membership check constant time instead of a scan of the whole list
    excluded = set(stop_words)
    #remove irrelevant grammar symbols in a single pass; ? is deliberately kept
    #since it may connect with implications in the writing
    strip_table = str.maketrans('', '', ".,()!;/'")
    #make all letters lower case, strip the symbols, then split along remaining whitespace
    words = text.lower().translate(strip_table).split()
    #remove things that are included in the stop words list
    return [word for word in words if word not in excluded]
def convert_text_to_dtm(txt):
    """Convert one text into a single-row document-term matrix.

    The text is tokenized with ``tokenize`` and every remaining word becomes
    a column holding the number of times that word was used.

    Returns a ``pd.DataFrame`` with exactly one row (index 0). A text with no
    remaining tokens still yields one row (with no columns), so that a
    caller stacking these frames keeps one row per text.
    """
    counts = {}
    for word in tokenize(txt):
        counts[word] = counts.get(word, 0) + 1
    #an explicit one-element index keeps the single row even when counts is empty
    return pd.DataFrame(counts, index=[0])
def gen_DTM(texts=None):
    """Generate the document-term matrix for a collection of texts.

    Parameters
    ----------
    texts : iterable of str or None
        The texts to count. ``None`` is treated as no texts.

    Returns
    -------
    pd.DataFrame
        One row per text, in input order, and one column per word (columns
        sorted). A word missing from a text is counted as 0. With no texts
        an empty DataFrame is returned.
    """
    if texts is None:
        texts = []
    rows = []
    for text in texts:
        entry = convert_text_to_dtm(text)
        if len(entry) == 0:
            #keep a placeholder row so that row i always belongs to text i
            entry = pd.DataFrame(index=[0])
        rows.append(entry)
    if not rows:
        #pd.concat refuses an empty list
        return pd.DataFrame()
    #row bind everything in one call (DataFrame.append was removed in pandas 2.0)
    #and sort the columns so the layout does not depend on word order in the texts
    DTM = pd.concat(rows, ignore_index=True).sort_index(axis=1)
    #fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM.fillna(0)
#cosine function that checks the degree of similarity between two vectors (the vectors being rows of the DTM)
def cosine(a,b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    This is the dot product of the two vectors divided by the product of
    their lengths. ``a`` and ``b`` must have the same number of elements.
    When either vector has zero length the cosine is undefined and 0.0 is
    returned instead of dividing by zero.
    """
    norm_product = np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))
    if norm_product == 0:
        return 0.0
    return np.dot(a,b)/norm_product

In [14]:
dtm = gen_DTM(texts)
#pull each article's row out of the DTM as an array, in the order the articles
#were loaded, so that every pair can be compared directly
alj, bbc, bbt, cnn, fox = (dtm.iloc[row].values for row in range(5))

In [17]:
#similarity between the Breitbart and Fox articles
cosine(bbt,fox)

0.5157347164850866

In [8]:
#similarity between the CNN and Fox articles
cosine(cnn, fox)

0.5137691572405918

In [9]:
#similarity between the Breitbart and CNN articles
cosine(bbt,cnn)

0.3527237380899392

It appears that the degree of similarity between Breitbart and Fox is comparable to the degree of similarity between CNN and Fox. Despite that, a direct comparison of Breitbart and CNN shows a much lower degree of similarity in terms of their respective DTMs.

In [26]:
#similarity between the Fox and BBC articles
cosine(fox, bbc)

0.5993412925976285

In [27]:
#similarity between the Fox and Al Jazeera articles
cosine(fox, alj)

0.6337697128800454

In [21]:
#similarity between the Al Jazeera and BBC articles
cosine(alj, bbc)

0.6603550928778185

Fox has a higher degree of similarity with both Al Jazeera and BBC than with the other two outlets but, unlike the previous case, Al Jazeera and BBC show an even higher degree of similarity with one another.

In [20]:
#similarity between the Al Jazeera and Breitbart articles
cosine(alj, bbt)

0.541587378994062

In [23]:
#similarity between the Al Jazeera and CNN articles
cosine(alj, cnn)

0.5246192538797578

In [10]:
#similarity between the BBC and Breitbart articles
cosine(bbc,bbt)

0.5568714820991048

In [11]:
#similarity between the BBC and CNN articles
cosine(bbc,cnn)

0.4979090203443717

Aside from BBC and CNN having the second-lowest degree of similarity of any pair of articles, the rest of these do not appear to be that significant.