In [2]:
# Import modules
import numpy as np
import pandas as pd
import requests

In [3]:
# Read Al Jazeera article as a string
with open("aljazeera-khashoggi.txt") as aljazeera_text:
    aj = aljazeera_text.read()    

In [5]:
# Read BBC article as a string
with open("bbc-khashoggi.txt") as bbc_text:
    bbc = bbc_text.read()  

In [6]:
# Read Breitbart article as a string
with open("breitbart-khashoggi.txt") as breitbart_text:
    bb = breitbart_text.read()  

In [7]:
# Read CNN article as a string
with open("cnn-khashoggi.txt") as cnn_text:
    cnn = cnn_text.read()  

In [8]:
# Read Fox article as a string
with open("fox-khashoggi.txt") as fox_text:
    fox = fox_text.read()  

In [12]:
# Open stop_words
stop_words = pd.read_csv("stop_words.csv")

In [17]:
def tokenize(text=None):
    '''
    This is a function used to split strings that were formatted in sentences into a list of individual words
    '''
    text = text.lower()
    text = text.replace('-','')
    text = text.replace('(','')
    text = text.replace(')','')
    text = text.replace('"','')
    text = text.replace(',','')
    text = text.replace('[','')
    text = text.replace(']','')
    text = text.replace('?','')
    text = text.replace('.','')
    text_list = text.split()
    return text_list

['saudi',
 'officials',
 'planned',
 'the',
 'savage',
 'murder',
 'of',
 'saudi',
 'writer',
 'jamal',
 'khashoggi',
 'days',
 'before',
 'his',
 'death',
 'turkish',
 'president',
 'recep',
 'tayyip',
 'erdogan',
 'said',
 'tuesday',
 'erdogan',
 'revealed',
 'the',
 'details',
 'of',
 'the',
 'country’s',
 'investigation',
 'into',
 'khashoggi’s',
 'killing',
 'after',
 'he',
 'walked',
 'into',
 'the',
 'saudi',
 'consulate',
 'in',
 'istanbul',
 'on',
 'oct',
 '2',
 'contradicting',
 'saudi',
 'arabia’s',
 'explanation',
 'that',
 'the',
 'writer',
 'was',
 'killed',
 'in',
 'a',
 '“fistfight”',
 'erdogan',
 'fell',
 'short',
 'of',
 'blaming',
 'saudi',
 'crown',
 'prince',
 'mohammed',
 'bin',
 'salman',
 'and',
 'made',
 'no',
 'mention',
 'of',
 'whether',
 'a',
 'tape',
 'exists',
 'of',
 "khashoggi's",
 'killing',
 'however',
 'he',
 'kept',
 'the',
 'pressure',
 'on',
 'the',
 'kingdom',
 'with',
 'his',
 'demands',
 'for',
 'punishment',
 'of',
 'all',
 'the',
 'people',
 

In [18]:
def convert_text_to_dtm(txt):
    '''
    This function is used to convert a list of strings into a single row document term matrix
    '''
    d = dict()
    for word in tokenize(txt):
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

In [24]:
def gen_DTM(texts=None):
    '''
    This function takes a list of list of strings and individually converts them into separate document term matrices.
    After they're converted into matrices, they're row-binded and 0s are inserted as missing values.
    '''
    DTM = pd.DataFrame()
    for text in texts:
        entry = convert_text_to_dtm(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

In [26]:
full_DTM = gen_DTM([aj, bbc, bb, cnn, fox])

In [40]:
# Remove columns from DTM that have entries in stop_words
full_dtm_cols = list(full_DTM.columns)
for i in range(len(full_dtm_cols)):
    if full_dtm_cols[i] in stop_words.values:
        full_DTM = full_DTM.drop([full_dtm_cols[i]], axis=1)

In [42]:
# Remaining words in the full document term matrix
full_DTM.columns.values

array(['$50bn', "'where", '1', '11', '12', '15', '15member', '18', '1:08',
       '2', '28', '2r', '4:30', '55mile', '59', '59yearold',
       '90kilometer', '9:50', 'abdulaziz', 'abdulaziz”', 'absent',
       'accident', 'accidentally', 'accidentallyerdogan', 'account',
       'accounts', 'account”', 'accusation', 'accusing', 'acknowledged',
       'added', 'adding', 'addition', 'address', 'addressing', 'adel',
       'admitted', 'admitting', 'advance', 'agency', 'ago', 'aides', 'ak',
       'akp', 'al', 'alerted', 'alibi', 'alive', 'aljubeir', 'allegation',
       'allegedly', 'allowed', 'ally', 'alqahtani', 'amounted', 'ankara',
       'ankaraamong', 'announced', 'anonymous', 'answered', 'answering',
       'answers', 'anticipated', 'appeared', 'appearing', 'applause',
       'arabia', "arabia's", 'arabian', 'arabia’s', 'arabic', 'arguing',
       'arrested', 'arrests', 'arrival', 'arrived', 'arrives',
       'assassination', 'attempt', 'attempts', 'attendees', 'attributed',
       

In [45]:
# Create numpy arrays for each of the rows in the dictionary term matrix to determine similarity

# Al Jazeera
aj_row = full_DTM.iloc[0].values

# BBC row
bbc_row = full_DTM.iloc[1].values

# Breitbart row
bb_row = full_DTM.iloc[2].values

# CNN row
cnn_row = full_DTM.iloc[3].values

# Fox row
fox_row = full_DTM.iloc[4].values

In [46]:
# Create cosine function
def cosine(a,b):
    '''
    This function returns the cosine of the angle of two vectors, in this case numpy arrays from the DTM.
    This function will return a value of cosine for quadrant I, which will be somewhere between 0 and 1.
    The closer the value is to 1, the more similar the words will be.
    '''
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b)))
    return cos

In [49]:
# Test similarity between Al Jazeera and BBC
cosine(aj_row, bbc_row).round(3)

0.678

In [50]:
# Test similarity between Al Jazeera and Breitbart
cosine(aj_row, bb_row).round(3)

0.555

In [51]:
# Test similarity between Al Jazeera and CNN
cosine(aj_row, cnn_row).round(3)

0.532

In [52]:
# Test similarity between Al Jazeera and Fox
cosine(aj_row, fox_row).round(3)

0.66

Based on these numbers, the article from Al Jazeera was most similar to the article from the BBC. This could make sense as Al Jazeera and BBC are both based outside the United States while the others are not.

Now, let's test the similarity between the BBC articles and the articles from the United States based sources to see if this holds up.

In [54]:
# Test similarity between BBC and Breitbart
cosine(bbc_row, bb_row).round(3)

0.564

In [55]:
# Test similarity between BBC and CNN
cosine(bbc_row, cnn_row).round(3)

0.504

In [56]:
# Test similarity between BBC and Fox
cosine(bbc_row, fox_row).round(3)

0.606

Seeing as these numbers are all closer to 0 than 0.678, this hypothesis seems to hold up. Now, we can see if this is also true for the United States based sources.

In [57]:
# Test similarity between Breitbart and CNN
cosine(bb_row, cnn_row).round(3)

0.353

In [58]:
# Test similarity between Breitbart and Fox
cosine(bb_row, fox_row).round(3)

0.519

In [59]:
# Test similarity between CNN and Fox
cosine(cnn_row, fox_row).round(3)

0.514

Based on these numbers, it seems as though the United States based sources are actually more similar to the articles from Al Jazeera and BBC than they are to other articles from United States based sources! So, we can not say that the differences between all of them occur due to United States centric perspective in the articles.