In [4]:
import pandas as pd
import numpy as np
import os

In [5]:
# names of the story files, kept in one list so they can be loaded in a loop
news = ['aljazeera-khashoggi.txt', 'bbc-khashoggi.txt', 'breitbart-khashoggi.txt',
        'cnn-khashoggi.txt', 'fox-khashoggi.txt']

# The Data folder sits in the parent of the current working directory.
# Build its path explicitly instead of calling os.chdir(): changing the working
# directory made this cell climb one folder higher on every re-run, after which
# the files could no longer be found.
path_parent = os.path.dirname(os.getcwd())
data_dir = os.path.join(path_parent, "Data")

# one entry per story file: the list of lines read from that file
all_stories = []

for file in news:
    # the with-block closes each file as soon as its lines have been read
    with open(os.path.join(data_dir, file), mode='rt', encoding='UTF-8') as open_file:
        all_stories.append(open_file.readlines())

# table of stop words; its 'word' column is read by a later cell
stop_words = pd.read_csv(os.path.join(data_dir, "stop_words.csv"))

In [6]:
# cosine similarity helper used to compare the word-count vectors of two stories
def cosine(v1,v2):
    '''
    Cosine of the angle between two vectors: their dot product divided by
    the product of the two vector magnitudes.
    '''
    dot_product = np.dot(v1, v2)
    # the magnitude of a vector is the square root of its dot product with itself
    magnitude_v1 = np.sqrt(np.dot(v1, v1))
    magnitude_v2 = np.sqrt(np.dot(v2, v2))
    return dot_product / (magnitude_v1 * magnitude_v2)

In [7]:
def tokenize(text=None):
    '''
    Lower-cases the text, removes a fixed set of punctuation marks and splits
    the result on whitespace into a list of individual words.

    text: the raw text as a string; None (the default) is treated as empty text.

    Returns the list of words in order of appearance, or an empty list when
    there is no text.
    '''
    if text is None:
        return []

    # Marks are removed one after another, in this order.
    # The backslash-comma pair is written with an escaped backslash because a
    # bare backslash before a comma is an invalid escape sequence that newer
    # Python versions warn about.
    # NOTE(review): only the opening curly quote is removed; a closing curly
    # quote stays attached to its word -- confirm whether that is intended.
    marks_to_remove = ('.', '(', ')', '"', "'", '?', '!', '\\,', '“', '-', ',')

    text = text.lower()
    for mark in marks_to_remove:
        text = text.replace(mark, '')
    return text.split()


def convert_text_to_dtm(txt):
    '''
    Builds the document term matrix of a single text: a DataFrame with one
    column per distinct word returned by tokenize, holding that word's count.
    '''
    counts = dict()
    # tally how often each tokenized word occurs
    for token in tokenize(txt):
        counts[token] = counts.get(token, 0) + 1
    # each count is wrapped in a list so the DataFrame gets a single row
    return pd.DataFrame({token: [count] for token, count in counts.items()})


def build_DTM(stories=None):
    '''
    Generate a document term matrix from several texts.

    stories: an iterable of story texts (strings); None (the default) is
    treated as no stories.

    Returns a DataFrame in which each story contributes the row produced by
    convert_text_to_dtm and each column is a word. Words that do not occur in
    a story are filled with 0. With no stories an empty DataFrame is returned.
    '''
    if stories is None:
        stories = []

    # convert_text_to_dtm tokenizes a story and returns its word counts as a frame
    entries = [convert_text_to_dtm(story) for story in stories]
    if not entries:
        # pd.concat raises on an empty list, so return the empty matrix directly
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0. Concatenating all entries in
    # one call replaces it and avoids re-copying the growing frame per story.
    DTM = pd.concat(entries, ignore_index=True, sort=True)

    # a word missing from a story shows up as NaN after the concat; count it as 0
    return DTM.fillna(0)


In [8]:
# build the DTM from the first line of every story in all_stories
DTM = build_DTM([story_lines[0] for story_lines in all_stories])
# (number of stories, number of unique words)
DTM.shape

(5, 909)

In [9]:
# map each news source to its row of values in the DTM for easier reference
dt2 = {
    source: DTM.iloc[position].values
    for position, source in enumerate(['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox'])
}

# create an empty dataframe with one row and one column per news source,
# i.e. a table covering every combination of two sources
stories = dt2.keys()
cos_table = pd.DataFrame(index=stories, columns=stories)

cos_table

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,,,,,
bbc,,,,,
breitbart,,,,,
cnn,,,,,
fox,,,,,


In [10]:
def build_cos_table(table=None, dt=None):
    '''
    Fills a table with the cosine of every pair of vectors stored in a dictionary.

    table: a DataFrame whose row labels and column labels are the dictionary
        keys, in the same order. It is filled in place. When omitted, an empty
        table is built from the keys of dt.
    dt: a dictionary mapping each label to its vector of values. When omitted,
        it is treated as empty.

    For every pair of positions (i, j) the cosine of the vectors belonging to
    row labels i and j is written to both [i, j] and [j, i]; the diagonal
    holds the cosine of each vector with itself. Every pair is covered no
    matter how many labels the table has.

    Returns the filled table. Raises KeyError when a row label is not a key of dt.
    '''
    if dt is None:
        dt = {}
    if table is None:
        labels = list(dt.keys())
        table = pd.DataFrame(columns=labels, index=labels)

    size = len(table.columns)
    for i in range(size):
        row_vector = dt[table.index[i]]
        # start at i: the pairs below the diagonal are filled by the mirrored write
        for j in range(i, size):
            similarity = cosine(row_vector, dt[table.index[j]])
            table.iloc[i, j] = similarity
            table.iloc[j, i] = similarity
    return table

In [11]:
# fill the empty table with the cosine of every pair of news sources
cos_table = build_cos_table(table=cos_table, dt=dt2)
cos_table

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.868733,0.831982,0.736701,0.839099
bbc,0.868733,1.0,0.897022,0.744041,0.887035
breitbart,0.831982,0.897022,1.0,0.67868,0.867276
cnn,0.736701,0.744041,0.67868,1.0,0.737826
fox,0.839099,0.887035,0.867276,0.737826,1.0


In [12]:
# copy the 'word' column of the stop word table into a plain list
stop = list(stop_words['word'])

In [13]:
# remove every column whose word appears in the list of stop words
# NOTE: this overwrites DTM, so the earlier cells that read DTM see the
# reduced matrix if they are re-run after this cell
DTM = DTM.drop(columns=[column for column in DTM.columns if column in stop])
DTM.shape

(5, 684)

In [14]:
# rebuild the dictionary and the dataframe from the updated DTM so the new
# cosine table can be compared with the earlier one
dt_post = {
    source: DTM.iloc[position].values
    for position, source in enumerate(['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox'])
}

cos_table_post = build_cos_table(pd.DataFrame(index=stories, columns=stories), dt_post)
cos_table_post

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.675688,0.565313,0.532856,0.670649
bbc,0.675688,1.0,0.574092,0.503919,0.624625
breitbart,0.565313,0.574092,1.0,0.357928,0.533551
cnn,0.532856,0.503919,0.357928,1.0,0.521914
fox,0.670649,0.624625,0.533551,0.521914,1.0


## Observations 



In the initial matrix of cosine similarity results, calculated before the stop words were removed, the news stories are all relatively similar to one another, with most similarity scores falling between roughly .7 and .9. Even the most disparate pair, Breitbart and CNN, with a cosine similarity of .67, appears to have a great deal of overlap.

When the stop words are removed from the DTM, the disparity between the sources is heightened significantly. The similarity between Breitbart and CNN drops from .67 to .35. Interestingly, the stories from Aljazeera, BBC, and Fox each have similarity scores with the other four sources that range from about .50 to .67, while the scores for CNN and Breitbart both fall between .35 and .57.

These overall trends suggest that, especially once stop words are removed, the former three sources wrote pieces that were broadly more similar to one another, at least for this news event, whereas Breitbart and CNN seem to have brought a more distinctive perspective to their coverage.