In [115]:
import numpy as np 
import pandas as pd

In [117]:
#Loading in Data 
cnn = open("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/cnn-khashoggi.txt", "r")
aljazeera = open("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/aljazeera-khashoggi.txt", "r")
bbc = open("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/bbc-khashoggi.txt", "r")
breitbart = open("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/breitbart-khashoggi.txt", "r")
fox = open("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/fox-khashoggi.txt", "r")
stop_words = pd.read_csv("/Users/lawandyaseen/Desktop/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/stop_words.csv")

In [118]:
#converting the TXT files into long strings
cnn_as_string = cnn.read()
aljazeera_as_string = aljazeera.read()
bbc_as_string = bbc.read()
breitbart_as_string = breitbart.read()
fox_as_string = fox.read()
stop_words = stop_words["word"].to_list()#converting dataframe into list based on column name

In [119]:
JK_articles = [] #empty list 
JK_articles.append(cnn_as_string) #adding each article's string into a single list 
JK_articles.append(aljazeera_as_string)
JK_articles.append(bbc_as_string)
JK_articles.append(fox_as_string)
JK_articles.append(breitbart_as_string)

In [132]:
#function to clean up the article content 
def tokenize(text = None):
    '''
    Removes any unneeded symbols from strings and removes words from stopwords.csv file provided
    
    Arguments 
    --------
    text: string 
    
    Return
    -------
    list with words that are not in stopwords or contain any symbols 
    '''
    text = text.lower() #converting all letters to lowercase 
    text = text.replace('.','') #following are removing any miscellanous values or signs that were used in the articles
    text = text.replace('!','')
    text = text.replace('?','')
    text = text.replace('-','')
    text = text.replace('[','')
    text = text.replace(']','')
    text = text.replace('(','')
    text = text.replace(')','')
    text = text.replace(',','')
    text = text.replace('"','')
    text = text.replace('”','')
    text = text.replace(':','')
    text = text.replace('/','')
    text = text.replace('1','')
    text = text.replace('2','')
    text = text.replace('3','')
    text = text.replace('4','')
    text = text.replace('5','')
    text = text.replace('6','')
    text = text.replace('7','')
    text = text.replace('8','')
    text = text.replace('9','')
    text = text.replace('“','')
    text = text.replace('0','')
    text = text.replace('$','')  
    text = text.replace('£','')
    text = text.replace('—','')
    text = text.split() #divides the strings into a list so that stop words can be removed in next step
    final_list = [word for word in text if word not in stop_words] #checking for commonly used "filler" words
    return final_list

In [133]:
#function to turn it into DTM matrix
def convert_text_to_dtm(vals):
    '''
    Takes string and converts into a document term matrix 
    
    Arguments 
    --------
    vals: string 
    
    Return
    -------
    dataframe with unique words and count of usage of word in article 
    '''
    word_count = dict()
    for word in tokenize(vals):
        if word in word_count:
            word_count[word][0] += 1
        else:
            word_count[word] = [1]
    return pd.DataFrame(word_count)

In [136]:
def create_DTM(strings =None):
    '''
    Creating a DTM for each article and merging each DTM together 
    
    Arguments 
    --------
    strings: list of strings
    
    Return
    -------
    dataframe with unique words and count of usage of word in each article, each row is different article 
    '''
    DTM = pd.DataFrame()
    for string in strings:
        val = convert_text_to_dtm(string)
        DTM = DTM.append(pd.DataFrame(val),ignore_index=True,sort=True)
    
    DTM.fillna(0, inplace=True) #NA if word is not used in article
    return DTM

In [125]:
def cosine(x,y):
    '''
    Calculating cosine similarity given two arrays  
    
    Arguments 
    --------
    x: Array 1 
    y: Array 2
    
    Return
    -------
    Value between 0-1 indicating the cosine of the angle between the two arrays
    '''
    cos = np.dot(x,y)/(np.sqrt(np.dot(x,x)) * np.sqrt(np.dot(y,y))) #using cosine equation as defined in class
    return cos

In [138]:
#creating DTM for Jamal Khashoggi articles
DTM_results = create_DTM(JK_articles)

In [139]:
#pulling the results from the DTM for each individual articles 
cnn_results = DTM_results.iloc[0].values
aljazeera_results = DTM_results.iloc[1].values
bbc_results = DTM_results.iloc[2].values
breitbart_results = DTM_results.iloc[3].values
fox_results = DTM_results.iloc[4].values

#### Comparisons of Interest 
The following cosine calculations are done to compare the publications' similarity in reporting on Erdogan's response to the murder of Jamal Khashoggi. The comparsions chosen were based on ideological and political points of interest. 

The first comparison was between Breitbart and CNN - two publications with very different political leanings.

In [140]:
cosine(breitbart_results, cnn_results)

0.5223686656653737

The second comparison is between Fox and Breitbart, who are relatively more ideologically aligned than the previous comparison. There is a slight increase in substantive similarity between the articles.

In [141]:
cosine(fox_results,breitbart_results)

0.5526344598374455

The third comparison looked at the two publications that are not based in the United States. Al-Jazeera is based in Qatar while BBC is a British publication. The content within the reporting is more similar than the comparison of the previous American reporting. 

In [142]:
cosine(aljazeera_results,bbc_results)

0.6785708770792225

Lastly, as a reference, Al-Jazeera and CNN were compared. The reporting between these two articles had nearly the same level of content similarity than Breitbart and CNN. 

In [143]:
cosine(aljazeera_results, cnn_results)

0.5321942095223737

Overall, of the chosen comparsions, Al-Jazeera and BBC had the most similar reporting content. However, there was a level of similarity among all of the comparsions. That may be changed if some of the more common words that are not considered fillers were removed from the DTM, like Jamal Khashoggi's name, or the world leaders involved in the story like Mohammed Bin Salman, Donald Trump and Recip Erdogan.  