In [144]:
# Imports
import pandas as pd
import numpy as np

In [145]:
# functions used

def tokenize(text=None, exclude=None):
    '''
    This function simplifies a string and returns a list of strings such that it can be quantitatively compared with other strings.

    Input:
    text - String to be simplified and split. None is treated as a text with no words.
    exclude - Optional collection of words to drop from the result. When it is not given,
              the notebook-level `stopwords` collection is used.

    Output:
    text_list - A list of the remaining words, as lowercase strings, in their original order
    '''

    # The signature defaults to None, so a missing text yields no words
    # instead of failing on None.lower()
    if text is None:
        return []

    if exclude is None:
        exclude = stopwords  # notebook-level stop word collection

    # Build the set once so every membership test below is a hash lookup
    # rather than a scan over the whole stop word collection
    excluded_words = set(exclude)

    # Turns all characters to lowercase and removes punctuation and possessives.
    # The order is significant: "'s" has to be removed before the bare apostrophe.
    text = text.lower()
    for fragment in ('.', ',', '"', "'s", "'", '-', '(', ')', '[', ']',
                     '{', '}', '“', '”', '—', '?', ';'):
        text = text.replace(fragment, '')

    # Splits the string into words and removes the excluded (stop) words
    return [word for word in text.split() if word not in excluded_words]

In [146]:
def convert_text_to_dtm(txt):
    '''
    Builds the document term matrix for a single text.

    Input:
    txt - The text of one article; it is handed to tokenize() to obtain the words

    Outputs:
    DTM - Document Term Matrix - A DataFrame with one column per word returned by tokenize(),
          holding the number of times that word was returned
    '''
    counts = {}
    for token in tokenize(txt):
        counts[token] = counts.get(token, 0) + 1

    # Each count is wrapped in a one-element list so pandas lays the counts out as a single row
    return pd.DataFrame({token: [total] for token, total in counts.items()})

In [147]:
# A function which generates a matrix of document terms
def gen_DTM(texts=None):
    '''
    Generate a document term matrix

    Input:
    texts - Iterable of texts, one per article; each one is passed to convert_text_to_dtm().
            None is treated as an empty collection.

    Outputs:
    DTM - Document Term Matrix - The frames returned by convert_text_to_dtm() stacked row-wise
          with a fresh 0..n-1 index and sorted columns. A word that is missing from a text
          gets a count of 0 for that text. An empty DataFrame is returned when there are no texts.
    '''
    if texts is None:
        texts = []

    # Build every row first and row-bind once. DataFrame.append was removed in pandas 2.0,
    # and growing a frame inside a loop re-copies all earlier rows on every iteration.
    rows = [convert_text_to_dtm(text) for text in texts]
    if not rows:
        return pd.DataFrame()  # pd.concat raises ValueError on an empty list

    DTM = pd.concat(rows, ignore_index=True, sort=True) # Row bind

    # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM.fillna(0)

In [148]:
# Calculates the cosine similarity between two vectors
def cosine(a,b):
    '''
    Takes two vectors and determines their cosine similarity.

    The result lies between -1 and 1; for vectors without negative entries
    (such as word counts) it lies between 0 and 1.

    Input:
    a, b - Both vectors (ie arrays of numbers) of the same length

    Output:
    cos - The cosine of the angle between a and b, i.e. their dot product divided by
          the product of their lengths. 0.0 is returned when either vector has zero length.

    '''
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    lengths = np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))

    # An all-zero vector has no direction; return 0.0 rather than dividing by zero,
    # which would produce nan together with a RuntimeWarning
    if lengths == 0:
        return 0.0

    cos = np.dot(a,b) / lengths
    return cos

In [149]:
# Creates an nxn matrix of how similar a set of articles are
def simMatrix(df):
    '''
    Creates the matrix of pairwise cosine similarities between the rows of a dataframe

    Input:
    df - A dataframe where all entries are numerical; each row is treated as one vector

    Output:
    df_Sim - A float dataframe whose index and columns are both the index values of df.
             Entry (row, col) is the cosine similarity between those two rows of df,
             or 0.0 when either of the two rows is all zeros.

    '''
    labels = df.index.values
    vectors = df.values.astype(float)

    # All pairwise dot products at once; the diagonal holds each row's squared length
    dots = np.dot(vectors, vectors.T)
    lengths = np.sqrt(np.diag(dots))
    scale = np.outer(lengths, lengths)

    # Divide only where the scale is non-zero so that all-zero rows give 0.0 instead of nan
    sims = np.divide(dots, scale, out=np.zeros_like(dots), where=scale != 0)

    df_Sim = pd.DataFrame(sims, index=labels, columns=labels)
    return df_Sim

# Import, Clean, and Preprocess the Data

In [150]:
# Imports the news stories
def read_article(path):
    '''
    Reads a UTF-8 text file and returns its whole contents as one string.

    Input:
    path - Location of the text file

    Output:
    The contents of the file
    '''
    # The with-block closes the file handle as soon as the text has been read
    with open(path, "r", encoding = "UTF-8") as handle:
        return handle.read()

aj = read_article("../Data/aljazeera-khashoggi.txt")
bbc = read_article("../Data/bbc-khashoggi.txt")
bart = read_article("../Data/breitbart-khashoggi.txt")
cnn = read_article("../Data/cnn-khashoggi.txt")
fox = read_article("../Data/fox-khashoggi.txt")

In [151]:
# Reads the stop word table from its CSV file
stop_words_path = "../Data/stop_words.csv"
stop_words = pd.read_csv(stop_words_path)

In [152]:
# Extracts the stop words as a set of strings. tokenize() only tests membership
# against this collection, and a set lookup avoids scanning an array for every word.
stopwords = set(stop_words["word"])

# Or you can set stopwords to this more minimal list
#stopwords = ['on', 'to', 'go', 'at', 'the','that','of','was', 'and', 'by']

In [153]:
# Builds the document term matrix: word counts for each of the five articles
articles = [aj, bbc, bart, cnn, fox]
df = gen_DTM(texts=articles)

In [154]:
# Labels each row with its news outlet; the names are listed in the order the articles were passed to gen_DTM
outlet_names = ["aljazeera", "bbc", "breitbart", "cnn", "fox"]
df = df.rename(index=dict(enumerate(outlet_names)))

# Create the Cosine Similarity Matrix

In [155]:
# Displays the pairwise cosine similarity between the rows (articles) of the document term matrix
simMatrix(df)

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.734133,0.658305,0.603997,0.693933
bbc,0.734133,1.0,0.706049,0.66703,0.717462
breitbart,0.658305,0.706049,1.0,0.550967,0.64952
cnn,0.603997,0.66703,0.550967,1.0,0.646122
fox,0.693933,0.717462,0.64952,0.646122,1.0


# Discussion

**Answer:** BBC and Aljazeera have the most similar reporting, with a cosine similarity of about 0.73 between their articles. This makes sense, as they are the two non-United States based news organizations in this list. The BBC also reports fairly similarly to Fox (about 0.72) and, to a lesser degree, to CNN (about 0.67, or roughly two-thirds). The most different reporting comes from CNN and Breitbart: their similarity to each other is the lowest in the table (about 0.55), and each scores only about 0.60 to 0.71 against the other news organizations. These results were computed after removing every word in the full `stopwords` data.

When using the more minimal set of stopwords, the similarity between all news sites rises, as expected. In particular, Fox and BBC are now almost as related as Fox and Aljazeera.