## Coding Discussion 4
### Abigail Paterson
#### 11/7/2021

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

### Read in the data

In [2]:
# change working directory
os.chdir("C:\\Users\\arpat\\OneDrive\\Documents\\McCourt 2021\\data sci\\coding_discussions_ppol564_fall2021\\04_coding_discussion")

In [3]:
#get files
alj_file = open("Data/aljazeera-khashoggi.txt", mode = "rt", encoding = "UTF-8")
aljazeera = alj_file.read()
alj_file.close()

bbc_file = open("Data/bbc-khashoggi.txt", mode = "rt", encoding = "UTF-8")
bbc = bbc_file.read()
bbc_file.close()

brbt_file = open("Data/breitbart-khashoggi.txt", mode = "rt", encoding = "UTF-8")
breitbart = brbt_file.read()
brbt_file.close()

cnn_file = open("Data/cnn-khashoggi.txt", mode = "rt", encoding = "UTF-8")
cnn = cnn_file.read()
cnn_file.close()

fox_file = open("Data/fox-khashoggi.txt", mode = "rt", encoding = "UTF-8")
fox = fox_file.read()
fox_file.close()

stop_words = pd.read_csv("Data/stop_words.csv")
stop_list = stop_words["word"].to_list()

In [4]:
#create list of files
files = [aljazeera, bbc, breitbart, cnn, fox]
text_names = ['aljazeera','bbc','breitbart','cnn','fox']

### Write Methods

In [5]:
#tokenize the list of words, as well as removing stop words
def tokenize(text=None):
    
    ''' This function breaks up a string of multiple words into a list of 
    individual strings for words. This function will also remove the stop words
    Args:
        text (str): a text string
    Returns:
        str: the simplified text string without capitalization or punctuation
    '''
    #make text lowercase
    text.lower()
    
    #remove punctuation
    text = (text
        .replace('.','') 
        .replace('"','') 
        .replace('“','')
        .replace('(','') 
        .replace(')','') 
        .replace(',','')
        .replace('-','') 
        .replace('—','') 
        .replace('\'','')
        .replace('[','') 
        .replace(']','') 
        .replace('?','') 
        .replace('!','') 
        .replace('0','') 
        .replace('1','') 
        .replace('2','')
        .replace('3','') 
        .replace('4','') 
        .replace('5','')
        .replace('6','') 
        .replace('7','') 
        .replace('8','')
        .replace('9','') 
        .replace('10','') 
        )
    
    
    #split into individual words
    text_list = text.split()
    
    #remove stop words
    text_list_stop =  [word for word in text_list if word not in stop_words]
    
    return text_list_stop
    

In [6]:
# test tokenization
cnn_token = tokenize(cnn)
cnn_token[1:10]

['Turkey', 'CNN', 'Jamal', 'Khashoggi', 'died', 'as', 'a', 'result', 'of']

In [7]:
#turn file into a document term matrix
def convert_text_to_dtm(text):
    """
    This function converts text into a document term matrix
    
    Arguments
    ---------
    txt: list of strings
        A list containing individual words
    
    Return
    ------
    matrix: pandas DataFrame
        A DataFrame for the document term matrix
    """
    #create an empty dictionary to count every instance of a word
    word_count = dict()
    for word in tokenize(text):
        #if word is already in dictionary, increase count
        if word in word_count:
            word_count[word][0] += 1
        else:
            #if not, put word in dictionary 
            word_count[word] = [1]
            
    #make a dataframe out of the dict
    dtm = pd.DataFrame(word_count)
    return dtm

In [8]:
#test convert
cnn_dtm = convert_text_to_dtm(cnn)
cnn_dtm

Unnamed: 0,Istanbul,Turkey,CNN,Jamal,Khashoggi,died,as,a,result,of,...,were,scouted,but,noted,later,Khashoggis,body,yet,foundRead,More
0,4,3,1,1,5,2,2,14,2,6,...,1,1,1,1,1,1,1,1,1,1


In [9]:
#turn all files into a DTM
def gen_DTM(texts):
    """
    This function converts a list of texts into a document term matrix
    
    Arguments
    ---------
    texts: list of strings
        A list containing the file texts
    
    Return
    ------
    matrix: pandas DataFrame
        A DataFrame for the document term matrix
    """
    
    #empty datafram
    dtmatrix = pd.DataFrame()
    
    #combine dtms for all texts
    for text in texts:
        entry = convert_text_to_dtm(text)
        dtmatrix = dtmatrix.append(entry, ignore_index = True, sort = True)
    
    #fill in NA values
    dtmatrix.fillna(0, inplace = True)

    return dtmatrix

In [10]:
#test general DTM
DTM = gen_DTM(files)
DTM

Unnamed: 0,$bn,:,A,AK,AKP,Abdulaziz,Abdulaziz”,According,Addressing,Adel,...,woods,working,world,worse,would,writer,yearold,yet,your,£bn
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,3.0,1.0
2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0


In [11]:
#find the cosine similarity
def cosine(a, b):
    """
    This function finds the cosine of two texts
    
    Arguments
    ---------
    a,b: Two arrays of integers
        Arrays containing the number of instances of each word
    
    Return
    ------
    cos: Integer
        A number representing how similar two arrays are
    """
    cos = round(np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))), 3)
    return cos

In [12]:
def cos_matrix(texts,text_names):
    ''' This function returns a matrix (data frame) of the cosine similarities between a set of strings
    Args:
        texts (list): a list containing text string objects
        text_names (list): a list of strings that correspond to the names of the objects in "texts"
    Returns:
        df: A matric of similarities between the input texts
    '''
    #create dataframe to hold cosines
    cosines = pd.DataFrame(columns=text_names)
    
    #counter 
    i = 0
    
    #nested loop through the texts to find cosine between each of them
    for t in texts:
        
        #second counter
        j = 0
        
        for t_inner in texts:
            dtm = gen_DTM([t, t_inner])
            a = dtm.iloc[0].values
            b = dtm.iloc[1].values
            
            #calculate the cosin
            cos = cosine(a,b)
            
            #asign value to correct location
            cosines.loc[text_names[i], text_names[j]] = cos
            
            j+=1
        i+=1
        
    return cosines
       

### Text Analyis

In [13]:
# show cosine similarity between texts
cos_matrix(files, text_names)

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.856,0.816,0.733,0.827
bbc,0.856,1.0,0.887,0.742,0.879
breitbart,0.816,0.887,1.0,0.684,0.862
cnn,0.733,0.742,0.684,1.0,0.732
fox,0.827,0.879,0.862,0.732,1.0


This shows the cosine similarities between the text strings. Values closer to 1 indicate more similar texts. The most similar texts are bbc and breitbart, additionally bbc and fox are very similar as well. This means that that those two articles have the most similar reporting on Khashoggi's murder. The least similar are cnn and fox, so these sites reported on the case very differently. Because this was just one event, there is a possibility that the words inherently related to the case are driving up the similarities, so we will add some of the most relevant words to our stop words list.

In [14]:
# Add some additional stopwords, which are likely to appear in all articles
stop_list.append('jamal')
stop_list.append('khashoggi')
stop_list.append('khashoggis')
stop_list.append('president')
stop_list.append('recip')
stop_list.append('tayyip')
stop_list.append('erdogan')
stop_list.append('turkey')
stop_list.append('turkish')
stop_list.append('istanbul')

stop_words = stop_list

# Rerun the similarity matrix
cos_matrix(files,text_names)

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.661,0.547,0.521,0.656
bbc,0.661,1.0,0.553,0.492,0.604
breitbart,0.547,0.553,1.0,0.347,0.53
cnn,0.521,0.492,0.347,1.0,0.512
fox,0.656,0.604,0.53,0.512,1.0


Removing some of the most common words relating to the story has lowered all of the similarities significantly. Now the most similar articles are Fox news and Aljazeera. The similarity between the BBC and breitbart has lowered all the way from 0.887 to 0.553. The least similar articles have changed as well, with the removed words, now breitbart and CNN are the least similar with a 0.347 similarity. Overall, Fox news is quite similar to all of the other articles, with all of its similarities over 50%. This means that Fox news' coverage is very similar to all the other outlets.