In [1]:
import pandas as pd 
import numpy as np

In [2]:
#read in the csv and txt files 
stop_words = pd.read_csv("stop_words.csv")
a = open("aljazeera-khashoggi.txt","r").read()
bbc = open("bbc-khashoggi.txt", "r").read()
b = open("breitbart-khashoggi.txt","r").read()
cnn = open("cnn-khashoggi.txt","r").read()
fox = open("fox-khashoggi.txt","r").read()

In [3]:
#convert the stop_words to lise 
commonwords = stop_words['word'].to_list()

In [4]:
def tokenize(text):
    """
    This is a function that tokenizes a string and removes the punctuations and common words 
    
    Arg: 
        It takes a string 
    
    return: 
        It returns a dataframe that displays the frequency of words in the string 
    """
    # lower the cases and replace the punctuations with space for splitting 
    text = text.lower()
    text = text.replace(':','')
    text = text.replace(',','')
    text = text.replace('"','')
    text = text.replace('“','')
    text = text.replace('”','')
    text = text.replace('.','')
    text = text.replace('—','')
    text = text.replace('\'s','') 
    text = text.replace('?','')
    text = text.replace("\'re",'')
    text = text.replace("\'ve",'')
    text = text.replace("’s",'')
    text = text.replace('(','')
    text = text.replace(')','')
    text = text.replace('[','')
    text = text.replace(']','')
    text = text.replace("’",'')
    text = text.replace("'",'')
    
    # tokenize the string 
    text = text.split()  
    
    # create an empty list 
    text_list = []  
    for i in text:
        # remove words in the commonwords list
        if i not in commonwords:    
            text_list.append(i)
   
    # create an empty dictionary 
    d = dict()  
    # count the occurence for every word in the text_list 
    for word in text_list:   
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    
    
    # return the dataframe 
    return pd.DataFrame(d)

In [5]:
def DTM(text):
    """
    This is a function that generates a document-term matrix 
    
    Arg: 
        It takes in a list of strings(texts)
        
    return:
        It returns a dataframes that shows the frequency of words occured in the collection of strings 
    """
    # create an empty dataframe 
    DTM = pd.DataFrame()
    
    for i in text:
        entry = tokenize(i)
        # row bind the dataframes for every element in the list of strings 
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True)
    
    # fill in null values with 0 
    DTM.fillna(0, inplace=True) 
    
    # return the document-term matrix 
    return DTM

In [6]:
dtm = DTM([a, bbc, b, cnn, fox])
dtm

Unnamed: 0,$50bn,-,1,108,11,12,15,15-member,18,2,...,white,widely,withheld,woods,world,worse,writer,yalova,yelova,£385bn
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0


In [7]:
def cos(a,b):
    """
    This is a function that calculate the cosine 
    
    Args:
        It takes two vectors 
        
    Return:
        It returns a float number that shows the cosine value between the two vectors 
    """
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b)))
    return cos 

In [8]:
# create a dataframe for the cosine matrix 
cosine = pd.DataFrame(index = dtm.index.values, columns = dtm.index.values)

In [9]:
# assign the cosine values to the cosine matrix 
for row in range(len(dtm)):
    for column in range(len(dtm)):
        cosine.loc[row, column] = cos(dtm.loc[row], dtm.loc[column])

In [10]:
# rename the rows and columns 
cosine.rename(index = {0:'aljazeera', 1:'bbc', 2:'breitbart', 3:'cnn', 4:'fox'},
              columns = {0:'aljazeera', 1:'bbc', 2:'breitbart', 3:'cnn', 4:'fox'},
              inplace = True)

In [11]:
cosine

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.703503,0.60288,0.59022,0.718604
bbc,0.703503,1.0,0.620849,0.556356,0.692329
breitbart,0.60288,0.620849,1.0,0.416867,0.578242
cnn,0.59022,0.556356,0.416867,1.0,0.595963
fox,0.718604,0.692329,0.578242,0.595963,1.0


### From the cosine matrix, Fox and Alijazeera report the news in a most similar way, compared to other three news outlet, because they have a cosine of 0.7186. All the news sites report the news in a somewaht similar way, except CNN. The cosine for CNN with each of the other four news sites is never higher than 0.6. Aljazeera reports the news in a similar way as BBC and Fox do. BBC reports the news in a similar way as Aljazeera and Fox do. Breitbart reports the news in a similar way as BBC does. CNN reports the news in a dissimilar way as the other four do, especially Breitbart. Fox reports the news in a similar was as Aljazeera do. 

In [12]:
import random 

In [13]:
# change the commonwords list 
# randomly choose 100 words from the previous commonwords list 

commonwords = random.sample(commonwords, 100)

In [14]:
# Repeat the steps before
# create the document-term matrix 
dtm2 = DTM([a, bbc, b, cnn, fox])

# create an empty dataframe for the cosine matrix 
cosine2 = pd.DataFrame(index = dtm2.index.values, columns = dtm2.index.values)

# fill in the values 
for row in range(len(dtm2)):
    for column in range(len(dtm2)):
        cosine2.loc[row, column] = cos(dtm2.loc[row], dtm2.loc[column])

In [16]:
# rename the rows and columns 
cosine2.rename(index = {0:'aljazeera', 1:'bbc', 2:'breitbart', 3:'cnn', 4:'fox'},
              columns = {0:'aljazeera', 1:'bbc', 2:'breitbart', 3:'cnn', 4:'fox'},
              inplace = True)

In [17]:
cosine2

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.873516,0.843891,0.731514,0.856142
bbc,0.873516,1.0,0.899525,0.752182,0.894091
breitbart,0.843891,0.899525,1.0,0.688279,0.871039
cnn,0.731514,0.752182,0.688279,1.0,0.757763
fox,0.856142,0.894091,0.871039,0.757763,1.0


#### When I change the number of words I remove, reducing it to 100, Aljazeera and BBC report the news in a most similar way. So, it changes from aljazeera-fox to aljazeera-bbc. Plus, CNN reports more similiarly compared to other four news sites, under current criterion. Rougly speaking, when I cut the commonwords list down to 100 in size, all the news sites report the news in a similar way, given that the cosine among them is around 0.7. But, it is still true the CNN reports the news in a most dissimilar than any other four, even though the absolute value for the cosine increases. 