# Comparing Similarities Among News Outlets Of The Khashoggi Murder

# Importing Important Packages

In [1]:
import numpy as np
import pandas as pd

# Reading The Files

In [48]:
# Reading the files
# Use the ../ to indicate that computer has to search for Data folder
alj = open("../Data/aljazeera-khashoggi.txt", 'r', encoding = "UTF-8").read()
bbc = open("../Data/bbc-khashoggi.txt", "r", encoding = "UTF-8").read()
breit = open("../Data/breitbart-khashoggi.txt", "r", encoding = "UTF-8").read()
cnn = open("../Data/cnn-khashoggi.txt", "r", encoding = "UTF-8").read()
fox = open("../Data/fox-khashoggi.txt", "r", encoding = "UTF-8").read()
stopWords = pd.read_csv("../Data/stop_words.csv", encoding = "UTF-8")
stopWordsList = stopWords["word"].values

# If you wanted to use a smaller set of stopwords
# Words common to the English Language
#stopWords = ['on', 'to', 'go', 'at', 'the','that','of','was', 'and', 'by']

# Necessary Functions/Methods

In [126]:
def tokenize(text=None):
    '''
    Tokenizes a text
    @param text the specified text
    @return a list of words from the text
    '''
    text = text.lower()
    
    # Eradicate all the non-characters
    text = text.replace('.','')
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("-", "")
    text = text.replace("[", "")
    text = text.replace("]", "")
    
    # Eradicate punctuation marks too...
    text = text.replace("?", "")
    text = text.replace(",", "")
    text = text.replace("!", "")
    text = text.replace("\"", "")
    text = text.replace("’s", "")
    text = text.replace("’", "")
    text = text.replace("\'", "")
    text = text.replace("{", "")
    text = text.replace("}", "")
    text = text.replace("\n", "")
    text = text.replace("“", "")
    text = text.replace("”", "")
    text = text.replace("—", "")
    text = text.replace(";", "")
    
    return text
#     #Then split by spaces
#     text_list = text.split()
#     text_list2 = [word for word in text_list if word not in stopWordsList]
#     return text_list2


def tokenizeToList(text):
    '''
    Tokenizes the text into a list
    @param text the cleaned, parsed text
    @return the list of tokenized, cleaned, parsed words
    '''    
    #Then split by spaces
    text_list = text.split()
    text_list2 = [word for word in text_list if word not in stopWordsList]
    return text_list2


In [127]:
# Tokenize the words, but don't convert them to lists yet
aljWords = tokenize(alj)
bbcWords = tokenize(bbc)
breitWords = tokenize(breit)
cnnWords = tokenize(cnn)
foxWords = tokenize(fox)

We now have a bunch of words from each text file. Let's now convert these texts into a document term matrix.

In [128]:

def toDictionary(txt):
    '''
    Converts text into a document term matrix.
    @param txt the text
    @return the document term matrix, to be used in a dataframe
    '''    
    # Spawn a dictionary
    d = dict()
    
    # Tokenize (toList) the text
    #for word in tokenize(txt):
    for word in tokenizeToList(txt):
        
        # if the word is in the dictionary, give it a 1
        if word in d:
            d[word][0] += 1
        
        # Otherwise, give it a value of 1. First time encountering this word
        else:
            d[word] = [1]
    return pd.DataFrame(d)


def toDTM(texts = None):
    '''
        Converts a list of texts into a document term matrix
        @param texts a list of words
        @return the document term matrix
    '''    
    # Spawn a data frame
    DTM = pd.DataFrame()
    
    # For every word in the list of words
    for text in texts:
        # convert to a dictionary
        entry = toDictionary(text)
        # Append to the data frame document term matrix
        DTM = DTM.append(pd.DataFrame(entry), ignore_index = True, sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

# Name things the Java way, like a BOSS!
DTM = toDTM([aljWords, bbcWords, breitWords, cnnWords, foxWords])

Now let us find some disimilarities and similarities among the 5 different news reporter.

In [131]:

def cosine(a,b):
    '''
    Calculates how related (or unrelated) to vectors are
    by calculating the cosine of the angle between them
    @param a a matrix/vector/column
    @param b another matrix/vector/column
    @return the cosine of the angle between the 2 vectors
    '''
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))  )
    return round(cos, 3)

# Comparing The Different Newspaper Reports

In [117]:
# Let's loop through all the different rows
# Or, through all the 5 different news reporter
reporterList = ["Aljazaar", "BBC", "Breitbart", "CNN", "Fox"]
for i in range(5):
    for j in range(i + 1, 5):
        
        print("i = ", reporterList[i], ", j = ", reporterList[j], ", and cosine:", round(cosine(DTM.iloc[i,:], DTM.iloc[j, :]), 3))
        #counter = counter + 1

i =  Aljazaar , j =  BBC , and cosine: 0.679
i =  Aljazaar , j =  Breitbart , and cosine: 0.588
i =  Aljazaar , j =  CNN , and cosine: 0.533
i =  Aljazaar , j =  Fox , and cosine: 0.681
i =  BBC , j =  Breitbart , and cosine: 0.606
i =  BBC , j =  CNN , and cosine: 0.504
i =  BBC , j =  Fox , and cosine: 0.653
i =  Breitbart , j =  CNN , and cosine: 0.404
i =  Breitbart , j =  Fox , and cosine: 0.576
i =  CNN , j =  Fox , and cosine: 0.547


Let's create something like a correlation matrix

In [132]:


def toCorrelationMatrix(df):
    '''
    Transforms a dataframe into a correlation matrix (in the form of dataframe)
    @param df the dataframe
    @return the new simulated correlated matrix (in the form of dataframe)
    '''
    reporterList = ["Aljazaar", "BBC", "Breitbart", "CNN", "Fox"]
    # Spawn an empty data frame as a correlation matrix
    correlationFrame = pd.DataFrame() 
    for i in range(len(reporterList)):
        column = []
        for j in range(len(reporterList)):
            
            # The cosine
            column.append(cosine(df.iloc[i, :], df.iloc[j, :]))
        # And when you are done with completing 1 full column of correlation matrix
        # add it to the dataframe
        correlationFrame[reporterList[i]] = column
    correlationFrame = correlationFrame.rename(index = 
                    {
                        0 : "Aljazaar",
                        1 : "BBC",
                        2 : "Breitbart",
                        3 : "CNN",
                        4 : "Fox"
                   }
                )
    return correlationFrame
        
correlationMatrix = toCorrelationMatrix(DTM)
correlationMatrix    

Unnamed: 0,Aljazaar,BBC,Breitbart,CNN,Fox
Aljazaar,1.0,0.679,0.588,0.533,0.681
BBC,0.679,1.0,0.606,0.504,0.653
Breitbart,0.588,0.606,1.0,0.404,0.576
CNN,0.533,0.504,0.404,1.0,0.547
Fox,0.681,0.653,0.576,0.547,1.0


# Analysis

The max cosine (most similar) occurs between Aljazaar and Fox when discussing the Khashoggi murder (cosine = 0.679). BBC comes to a close second in terms of being similar with Aljazaar. So not surprisingly, BBC and Fox are quite similar to each other when compared to other news outlets. The minimum cosine (most dissimilar) occured between Breitbart and CNN (cosine = 0.404).

If we had used a smaller set of `stopWords`, then the similarities among all the newspaper reports will increase.