In [275]:
import os
import numpy as np
import pandas as pd

## Load the Data

In [276]:
# Load the txt files
aljazeera = open("../Data/aljazeera-khashoggi.txt", 'r')
bbc = open("../Data/bbc-khashoggi.txt", 'r')
breitbart = open("../Data/breitbart-khashoggi.txt", 'r')
cnn = open("../Data/cnn-khashoggi.txt", 'r')
fox = open("../Data/fox-khashoggi.txt", 'r')

In [277]:
# Pull the content from each article and save them as individual objects of type string
aljazeera_content = aljazeera.read()
bbc_content = bbc.read()
breitbart_content = breitbart.read()
cnn_content = cnn.read()
fox_content = fox.read()

In [278]:
# Now that we've pulled the content we want, close the txt files
aljazeera.close()
bbc.close()
breitbart.close()
cnn.close()
fox.close()

In [279]:
# Load stopwords and turn them into a list we can use to exclude them
stopwords = pd.read_csv("../Data/stop_words.csv")
stopwords_list = stopwords.word.tolist()

## Build the functions

In [280]:
# Build a function that turns each article into a list of its words
def tokenize(text=None):
    """Takes as an input a collection of text as a string, removes punctuation and capitalization, and returns a list of the individual words within the text."""
    #Clean the text, turning all letters into lower case and removing punctuation
    text = text.lower()
    text = text.replace('.','')
    text = text.replace(",", "")
    text = text.replace("\"", "")
    text = text.replace("'s", "")
    text = text.replace("'", "")
    text = text.replace("-", "")
    text = text.replace("“", "")
    text = text.replace("”", "")
    text = text.replace("—", "")
    text = text.replace("?", "")
    text = text.replace(";", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace("[", "")
    text = text.replace("]", "")
    text = text.replace("{", "")
    text = text.replace("}", "")
    # Split the now cleaned string into a list of individual words
    text_list = text.split()
    # Remove stop words from the list
    text_list2 = [word for word in text_list if word not in stopwords_list]
    return text_list2

In [281]:
# Using the above function, build a new function that converts a text into a Document Term Matrix
def convert_text_to_dtm(txt):
    """Takes as an input a string and returns a document term matrix containing the number of times each word appears in the text."""
    d = dict()
    for word in tokenize(txt):
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

In [282]:
def gen_DTM(texts=None):
    """Takes as an input a list of texts and returns a document term matrix where each row represents the individual text collection."""
    DTM = pd.DataFrame()
    for text in texts:
        entry = convert_text_to_dtm(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

## Run the Analysis

In [283]:
# Generate the Document Term Matrix for all 5 articles
DTM = gen_DTM([aljazeera_content, bbc_content, breitbart_content, cnn_content, fox_content])

In [284]:
# View the DTM
DTM

Unnamed: 0,$50bn,1,11,12,15,15member,18,1:08,2,28,...,white,widely,withheld,woods,world,worse,writer,yalova,yelova,£385bn
0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [285]:
# Save the values of each row of the DTM as a numpy array so that we can compare vectors 
aljazeera=DTM.iloc[0].values
bbc=DTM.iloc[1].values
breitbart=DTM.iloc[2].values
cnn=DTM.iloc[3].values
fox=DTM.iloc[4].values 

In [286]:
# Build a function that uses the dot product to calculate the cosine similarity of two arrays
def cosine(a,b):
    'This function takes in two series of values and calculates the cosine.'
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))  )
    return cos

In [287]:
# Calculate and print the cosine similarity of each array as a preliminary result
print(cosine(aljazeera,bbc))
print(cosine(aljazeera,breitbart))
print(cosine(aljazeera,cnn))
print(cosine(aljazeera,fox))
print(cosine(bbc,breitbart))
print(cosine(bbc,cnn))
print(cosine (bbc,fox))
print(cosine (breitbart,cnn))
print(cosine(breitbart,fox))
print(cosine(cnn,fox))

0.704951931053164
0.5881235927091389
0.5904843154184692
0.7132817551762246
0.5881765519910633
0.556742328214592
0.6661164572454468
0.3748292282438769
0.5502071308388471
0.5621407236219644


In [288]:
# To make the results easier to read place them in a data frame
# Create an empty dataframe to populate with the cosine values
cosine_df = pd.DataFrame(index=DTM.index.values, columns = DTM.index.values)

In [289]:
# Each row and column of the dataframe will correspond to an outlet, and each value in the dataframe will be the corresponding cosine similarity of the intersecting row and column
# Itterate through the rows of the cosine dataframe and fill in each value with the cosine that corresponds to the row/column pair
for outlet, cos in cosine_df.iterrows():
    for i, f in enumerate(cos):
        cosine_df.loc[outlet,i] = cosine(DTM.iloc[outlet],DTM.iloc[i])

In [290]:
# Set the index and column names to the outlets they correspond to
cosine_df.index = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']
cosine_df.columns = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']

In [291]:
# View the dataframe
cosine_df

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.704952,0.588124,0.590484,0.713282
bbc,0.704952,1.0,0.588177,0.556742,0.666116
breitbart,0.588124,0.588177,1.0,0.374829,0.550207
cnn,0.590484,0.556742,0.374829,1.0,0.562141
fox,0.713282,0.666116,0.550207,0.562141,1.0


In [292]:
# What happens if we only use a portion of the stopwords?
# Use only the first hundred stopwords in the list
stopwords_list = stopwords_list[0:100]

In [293]:
# Repeat the steps used to create the cosine matrix above, this time using the subsetted list of stopwords
DTM2 = gen_DTM([aljazeera_content, bbc_content, breitbart_content, cnn_content, fox_content])
cosine_df2 = pd.DataFrame(index=DTM2.index.values, columns = DTM2.index.values)
for outlet, val in cosine_df2.iterrows():
    for i, entry in enumerate(val):
        cosine_df2.loc[outlet,i] = cosine(DTM2.iloc[outlet],DTM2.iloc[i])
cosine_df2.index = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']
cosine_df2.columns = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']

In [294]:
# The cosine similarities go up when we use only the first 100 stopwords
cosine_df2

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.868758,0.831885,0.759642,0.839637
bbc,0.868758,1.0,0.894565,0.758841,0.890772
breitbart,0.831885,0.894565,1.0,0.679702,0.86858
cnn,0.759642,0.758841,0.679702,1.0,0.7442
fox,0.839637,0.890772,0.86858,0.7442,1.0


In [295]:
# What if instead of just slicing the stopwords list we create a new one with only the most common words?
# Change the stopwords to be only the 15 most common words in the English language
stopwords_list = ['the', 'of', 'and', 'a', 'to', 'in', 'is', 'you', 'that', 'it', 'he', 'was', 'for', 'on', 'are']

In [296]:
# Repeat the steps above to create a cosine matrix now only using the new list of stopwords 
DTM3 = gen_DTM([aljazeera_content, bbc_content, breitbart_content, cnn_content, fox_content])
cosine_df3 = pd.DataFrame(index=DTM3.index.values, columns = DTM3.index.values)
for outlet, val in cosine_df3.iterrows():
    for i, entry in enumerate(val):
        cosine_df3.loc[outlet,i] = cosine(DTM3.iloc[outlet],DTM3.iloc[i])
cosine_df3.index = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']
cosine_df3.columns = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']

In [297]:
# The similarity level goes down!
cosine_df3

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.686816,0.580407,0.56097,0.659787
bbc,0.686816,1.0,0.607498,0.539242,0.62512
breitbart,0.580407,0.607498,1.0,0.380036,0.54145
cnn,0.56097,0.539242,0.380036,1.0,0.51922
fox,0.659787,0.62512,0.54145,0.51922,1.0


## Interpret Results

In [298]:
# the cosine matrix with all of the stop words
cosine_df

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.704952,0.588124,0.590484,0.713282
bbc,0.704952,1.0,0.588177,0.556742,0.666116
breitbart,0.588124,0.588177,1.0,0.374829,0.550207
cnn,0.590484,0.556742,0.374829,1.0,0.562141
fox,0.713282,0.666116,0.550207,0.562141,1.0


In [299]:
# the cosine matrix with a subset of the stop words
cosine_df2

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.868758,0.831885,0.759642,0.839637
bbc,0.868758,1.0,0.894565,0.758841,0.890772
breitbart,0.831885,0.894565,1.0,0.679702,0.86858
cnn,0.759642,0.758841,0.679702,1.0,0.7442
fox,0.839637,0.890772,0.86858,0.7442,1.0


In [300]:
# The cosine matrix with only the most common English words
cosine_df3

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.686816,0.580407,0.56097,0.659787
bbc,0.686816,1.0,0.607498,0.539242,0.62512
breitbart,0.580407,0.607498,1.0,0.380036,0.54145
cnn,0.56097,0.539242,0.380036,1.0,0.51922
fox,0.659787,0.62512,0.54145,0.51922,1.0


In the original analysis, using the full list of stopwords, the biggest difference is between CNN and Breitbart. This was as to be expected, considering how different the political leanings of each outlet is. Somewhat more surpsingly, the most similar stories are the ones published by Fox and Al Jazeera, with Al Jazeera and BBC not far behind. Al Jazeera and BBC being similar makes sense, considering that both are foreign outlets without some of the polarization common in the American media landscape. However I would not have expected Fox and Al Jazeera to be similar, which leads me to wonder whether there is a noticeable difference between the way Fox reports on domestic issues and the way they report on foreign affairs. 

As expected, when we use only the first hundred stopwords, the level of similarity shoots up significantly among all outlets. Breitbart and BBC suddenly become the most similar. From this we can extrapolate that not including enough stopwords can seriously skew the results, increasinly similarity where it doesn't exist. A more interesting result however, occurs when we cut the number of stop words further and use only the 15 most common words in the English language. The level of similarity actually goes down almost across the board, and the increases are so minor that they're basically negligible. In this analysis, Al Jazeera and BBC are now the most similar. This is perhaps because the stories are so similar (they cover the same aspect of the same event) that when we cut out more than just the most common words we inflate the value of topical words, which will be shared across the whole group. This would imply that you don't have to just cut out enough common words, you also have to cut out the *right* words, and cutting out too many words might artificially increase the level of similarity. 