In [1]:
# Import relevant packages
import pandas as pd
import numpy as np
import os

In [2]:
# Set working directory
# The data folder defaults to the original author's local path. To run the
# notebook on another machine, set the KHASHOGGI_DATA_DIR environment variable
# to the folder holding the article .txt files and stop_words.csv.
DATA_DIR = os.environ.get(
    'KHASHOGGI_DATA_DIR',
    '/Users/nikhilaiyer/Documents/GRAD SCHOOL/ppol564/coding_discussions_ppol564_fall2021/04_coding_discussion/Data',
)
os.chdir(DATA_DIR)

In [3]:
# Create a function to read in necessary files
def file_reader(og_file, encoding=None):
    '''
    This function reads a news article file and returns its full contents.
    
    Arguments
    ---------
    og_file (str): Path to the text file holding the news article
    encoding (str, optional): Text encoding passed to open(); None (the default)
        keeps the platform default encoding
    
    Return
    ------
    Str: The complete text of the file
    '''
    # The context manager closes the file handle as soon as the text is read
    with open(og_file, encoding=encoding) as article:
        return article.read()

In [4]:
# Read every article (plus the stop word list) from the working directory
story_files = [
    "aljazeera-khashoggi.txt",
    "bbc-khashoggi.txt",
    "breitbart-khashoggi.txt",
    "cnn-khashoggi.txt",
    "fox-khashoggi.txt",
]
# all_stories keeps the articles in the order above; the individual names point at the same strings
aljazeera, bbc, breitbart, cnn, fox = all_stories = [file_reader(path) for path in story_files]
# Stop words come from the 'word' column of the provided CSV
stop_words = pd.read_csv('stop_words.csv')['word'].tolist()

In [5]:
# Remove punctuation and digits from the text, then turn it into a list of words without the stop words
def tokenize(text):
    '''
    This function normalizes an article and breaks it into its remaining words.

    The text is lower-cased, a fixed set of punctuation marks and the digits 0-9
    are deleted, curly double quotes are replaced by spaces, and the result is
    split on whitespace. Tokens found in the module-level `stop_words` list are
    then dropped.
    
    Arguments
    ---------
    text (str): Raw article text
    
    Return
    ------
    List: The remaining words, in their original order
    '''
    # Characters deleted outright (the text on either side is joined together)
    dropped = "!@#$*()-+=[]:;'\",.?0123456789"
    # Curly double quotes become a space, so they also separate words
    spaced = "”“"
    char_map = {ord(ch): None for ch in dropped}
    char_map.update({ord(ch): " " for ch in spaced})
    cleaned = text.lower().translate(char_map)
    return [token for token in cleaned.split() if token not in stop_words]

In [6]:
def dtm(text):
    '''
    This function tokenizes one news article and counts how often each remaining word appears.
    
    Arguments
    ---------
    text (str): Raw text of the news article (it is passed through tokenize() here)
    
    Return
    ------
    DataFrame: A single-row data frame with one column per distinct word, holding that word's count
    '''
    counts = {}
    for token in tokenize(text):
        counts[token] = counts.get(token, 0) + 1
    # Wrap every count in a one-element list so each word becomes a one-row column
    return pd.DataFrame({token: [total] for token, total in counts.items()})


In [7]:
def combo_dtm(all_texts):
    '''
    This function builds the word-count data frame of every article with dtm() and stacks them into one data frame.
    
    Arguments
    ---------
    all_texts (iterable of str): The raw text of each news article
    
    Return
    ------
    DataFrame: One row per data frame produced by dtm(), in input order, and one column per word
    found in any article (columns sorted); words an article does not contain are counted as 0
    '''
    per_article = [dtm(text) for text in all_texts]
    if not per_article:
        # pd.concat raises on an empty list; keep returning an empty frame instead
        return pd.DataFrame()
    # One concat replaces the repeated DataFrame.append calls (append was removed in pandas 2.0)
    full_dtm = pd.concat(per_article, ignore_index=True, sort=True)
    # A word missing from an article shows up as NaN after the outer join on columns
    return full_dtm.fillna(0)

In [9]:
# A full data frame of all the news articles and their word count frequencies
# (built from the articles in the order they appear in `all_stories`)
combo_word_bank = combo_dtm(all_stories)
combo_word_bank

Unnamed: 0,abdulaziz,absent,accident,accidentally,accidentallyerdogan,account,accounts,accusation,accusing,acknowledged,...,withheld,woods,world,worse,writer,yalova,yearold,yelova,£bn,—
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,3.0


In [10]:
def cosine(a, b):
    '''
    This function calculates the cosine similarity of two word-count vectors.
    
    Arguments
    ---------
    Array (a): Word counts of article A
    Array (b): Word counts of article B (same length as a)
    
    Return
    ------
    float: Dot product of a and b divided by the product of their lengths.
    There is no guard for an all-zero vector, which makes the denominator 0.
    '''
    numerator = np.dot(a, b)
    length_a = np.sqrt(np.dot(a, a))
    length_b = np.sqrt(np.dot(b, b))
    return numerator / (length_a * length_b)

In [16]:
# Empty square matrix with one row and one column per news outlet, ready to be filled with cosine values
outlet_labels = ['Aljazeera', 'BBC', 'Breitbart', 'CNN', 'Fox']
blank_cos_matrix = pd.DataFrame(index = outlet_labels, columns = outlet_labels)

In [17]:
def cr_matrix(matrix, word_bank):
    '''
    This function creates the cosine matrix of all the articles compared to one another.

    The matrix passed in is left untouched: the values are written into a copy, so
    the same blank matrix can be reused for several comparisons without one result
    overwriting another.
    
    Arguments
    ---------
    Data Frame (matrix): Template matrix whose shape and labels are used for the result
    Data Frame (word bank): Word frequency counts; row i is compared for matrix row i
    and matrix column i, so its rows must be in the same order as the matrix labels
    
    Return
    ------
    Data Frame (matrix): A new matrix filled with cosine values rounded to 4 decimals
    '''
    filled = matrix.copy()
    for i in range(len(filled.index)):
        # The row vector is the same for every column, so fetch it once per row
        text_1 = word_bank.iloc[i].values
        for c in range(len(filled.columns)):
            text_2 = word_bank.iloc[c].values
            filled.iloc[i, c] = round(cosine(text_1, text_2), 4)
    return filled

In [18]:
# Creating a cosine matrix of the news articles word frequencies
# A copy of the blank matrix is handed over so this result is its own object
# and is not changed when the blank matrix is used again further down
cos_matrix_1 = cr_matrix(blank_cos_matrix.copy(), combo_word_bank)
cos_matrix_1

Unnamed: 0,Aljazeera,BBC,Breitbart,CNN,Fox
Aljazeera,1.0,0.6796,0.5877,0.533,0.6825
BBC,0.6796,1.0,0.5833,0.5029,0.6298
Breitbart,0.5877,0.5833,1.0,0.3684,0.5489
CNN,0.533,0.5029,0.3684,1.0,0.5188
Fox,0.6825,0.6298,0.5489,0.5188,1.0


##### Initial Cosine Matrix Analysis
The cosine matrix shows which articles are most similar: the closer a value is to 1, the more alike the word frequencies of the two articles are, and a value of 1 (as on the diagonal, where each article is compared with itself) means the two word-count vectors point in exactly the same direction. The most similar articles are Fox and Aljazeera at 0.6825, followed by BBC and Aljazeera at 0.6796. Interestingly, Aljazeera and BBC are the only primarily international news sources on this list (CNN has an international branch, but it is arguably not its focus). The least similar articles are CNN and Breitbart at 0.3684, which is to be expected given that these two outlets lean toward opposite ends of the political spectrum in how they report on any type of news.

In [14]:
# Based on the top common words that are most likely repeated in all articles, creating a list of extra stop words and adding them to the stop words list
extra_words = ["saudi", "erdogan", "jamal", "khashoggi", "khashoggis", "turkish", "turkey", "istanbul", "arabia", "bin", "saudis", "mohammed"]
# Only add the words that are not in the list yet, so re-running this cell does not pile up duplicates
stop_words.extend([word for word in extra_words if word not in stop_words])

In [19]:
# Creating a new data frame of frequencies and passing that in to create a new cosine comparison matrix
combo_word_bank_extra = combo_dtm(all_stories)
# A copy of the blank matrix is handed over so this result is its own object
# and does not share storage with any other matrix built from the same blank
cos_matrix_2 = cr_matrix(blank_cos_matrix.copy(), combo_word_bank_extra)
cos_matrix_2

Unnamed: 0,Aljazeera,BBC,Breitbart,CNN,Fox
Aljazeera,1.0,0.4633,0.4133,0.2391,0.4752
BBC,0.4633,1.0,0.4422,0.2489,0.4281
Breitbart,0.4133,0.4422,1.0,0.1378,0.4286
CNN,0.2391,0.2489,0.1378,1.0,0.2237
Fox,0.4752,0.4281,0.4286,0.2237,1.0


##### Updated Cosine Matrix Analysis
For this matrix, we removed extra words from the frequency counts. The words selected were highly used ones that were most likely to appear in every article (names and locations) - essentially the fact-based reporting words. After doing this, we still see a similar pattern to before, with CNN and Breitbart being the most different (0.1378) and Aljazeera, BBC, and Fox being the most similar to one another. This further suggests that the outlets differ in the actual opinions expressed in their reporting, not just in their reporting style or in the fact-based information being presented.