# Coding Discussion 4
### Colette Yeager

In [170]:
from collections import Counter

import numpy as np
import pandas as pd

In [171]:
# Create a function for reading in the data
def read_data(file, encoding="utf-8"):
    """
    This function reads in a text file and returns its full contents.
    
    Arguments
    ---------
    file: str
        A file path name
    encoding: str
        Text encoding used to decode the file (default "utf-8"). Passing it
        explicitly keeps non-ASCII characters such as curly quotes and em
        dashes from being decoded with a platform-dependent default.
    
    Return
    ------
    content: str
        A string for the data from the file
    
    Raises
    ------
    OSError
        If the file cannot be opened
    UnicodeDecodeError
        If the file is not valid text in the given encoding
    """
    
    # The context manager closes the file even if reading fails
    with open(file, encoding=encoding) as f:
        content = f.read()
    return content

In [172]:
# Read in files
# Folder that holds the five article texts and the stop-word list.
# This is an absolute path on the author's machine: edit this one line to run elsewhere.
DATA_DIR = "/Users/coletteyeager/coding_discussions_ppol564_fall2021/04_coding_discussion/Data"

aljazeera = read_data(f"{DATA_DIR}/aljazeera-khashoggi.txt")
bbc = read_data(f"{DATA_DIR}/bbc-khashoggi.txt")
breitbart = read_data(f"{DATA_DIR}/breitbart-khashoggi.txt")
cnn = read_data(f"{DATA_DIR}/cnn-khashoggi.txt")
fox = read_data(f"{DATA_DIR}/fox-khashoggi.txt")
stop_words = pd.read_csv(f"{DATA_DIR}/stop_words.csv")

# Order matters: row i of the document term matrix built from `files` is article i here
files = [aljazeera, bbc, breitbart, cnn, fox]

#### Without removing common words

In [173]:
# Standardize list of words
def tokenize(text):
    """
    Split raw text into lowercase word tokens with selected punctuation,
    digits and currency symbols stripped out.
    
    Arguments
    ---------
    text: str
        A string from the text file
    
    Return
    ------
    text_list: list of strings
        The whitespace-separated tokens left after cleaning
    """
    # Characters that are deleted outright, so the characters on either side are joined
    deleted = ".,()[]?$£" + "’'“”" + '"' + "0123456789"
    # Characters that become a space, so the words on either side stay separate
    spaced = "-:—"
    
    # One translation table applies every substitution in a single pass
    table = str.maketrans(spaced, " " * len(spaced), deleted)
    text_list = text.lower().translate(table).split()
    return text_list

In [174]:
# Turn file into a document term matrix
def convert_text_to_dtm(text):
    """
    This function converts one text into a single-row document term matrix
    
    Arguments
    ---------
    text: str
        The raw text of one document; it is split into words with tokenize()
    
    Return
    ------
    matrix: pandas DataFrame
        A one-row DataFrame with one column per distinct word, holding the
        number of times that word occurs. Columns appear in order of first
        occurrence. A text with no words gives an empty DataFrame.
    """
    # Count every occurrence of each word; Counter keeps first-seen order
    counts = Counter(tokenize(text))
    
    # Turn into DataFrame: each word maps to a one-element list so the result has one row
    matrix = pd.DataFrame({word: [count] for word, count in counts.items()})
    return matrix

In [175]:
# Turn all files into a document term matrix
def gen_DTM(texts):
    """
    This function converts a list of texts into a document term matrix
    
    Arguments
    ---------
    texts: list of strings
        A list containing the file texts
    
    Return
    ------
    matrix: pandas DataFrame
        A DataFrame for the document term matrix: one row per text (in the
        order given), one column per distinct word (sorted), and 0 where a
        word does not occur in a text. An empty list gives an empty DataFrame.
    """
    # Build one single-row matrix per text
    frames = [convert_text_to_dtm(text) for text in texts]
    
    # pd.concat raises on an empty list, so keep the empty-input result explicit
    if not frames:
        return pd.DataFrame()
    
    # Combine all rows in one step. DataFrame.append was removed in pandas 2.0,
    # and appending inside a loop re-copies the whole frame on every iteration.
    matrix = pd.concat(frames, ignore_index = True, sort = True)
    
    # Fill in missing values: a word absent from a text has a count of 0
    return matrix.fillna(0)

In [176]:
# Create matrix: one row per article (in the order of `files`), one column per distinct word
D = gen_DTM(files)
D

Unnamed: 0,a,abdulaziz,able,about,absent,accident,accidentally,accidentallyerdogan,according,account,...,working,world,worse,would,writer,yalova,year,yelova,yet,your
0,11,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,0.0,0.0,2,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,1.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,3.0
2,11,2.0,0.0,2,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,14,0.0,1.0,1,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,14,0.0,0.0,4,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0


In [177]:
def cosine(a, b):
    """
    This function finds the cosine similarity of two word-count vectors
    
    Arguments
    ---------
    a,b: Two equal-length arrays of numbers
        Arrays containing the number of instances of each word
    
    Return
    ------
    cos: float
        The cosine of the angle between a and b, rounded to 3 decimal places.
        If either array is all zeros the cosine is undefined and 0.0 is
        returned instead of NaN.
    """
    # Product of the two Euclidean lengths
    norm_product = np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))
    
    # A zero-length vector would give 0/0 (NaN plus a RuntimeWarning)
    if norm_product == 0:
        return 0.0
    
    cos = round(np.dot(a,b)/norm_product, 3)
    return cos

In [178]:
# Get list of names, in the same order as the rows of D
names = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']

# Create a dictionary for D values: each name maps to its row of word counts
text_dict = {name: D.iloc[row].values for row, name in enumerate(names)}

# Create empty DataFrame
correlation = pd.DataFrame(columns = names, index = names)

# Put cosine values in for the DataFrame values.
# A single .loc[row, column] assignment writes into `correlation` itself; the chained
# form .iloc[i][j] can write to a temporary copy and leave the table unchanged.
for row_name in names:
    for col_name in names:
        correlation.loc[row_name, col_name] = cosine(text_dict[row_name], text_dict[col_name])

In [179]:
# Pairwise cosine similarity between the articles, with common words still included
correlation

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.87,0.839,0.734,0.844
bbc,0.87,1.0,0.9,0.743,0.889
breitbart,0.839,0.9,1.0,0.686,0.87
cnn,0.734,0.743,0.686,1.0,0.74
fox,0.844,0.889,0.87,0.74,1.0


#### Remove common words

In [180]:
# Collect the stop words into a set for fast membership tests.
# A new name is used so `stop_words` stays the DataFrame that was read in,
# which lets this cell be re-run without failing.
stop_word_set = set(stop_words['word'])

# Create list of columns to remove from D
exclude = [col for col in D.columns if col in stop_word_set]

# Drop common columns (D itself is left unchanged)
D_dropped = D.drop(columns = exclude)

In [181]:
# Document term matrix after the stop-word columns have been dropped
D_dropped

Unnamed: 0,abdulaziz,absent,accident,accidentally,accidentallyerdogan,account,accounts,accusation,accusing,acknowledged,...,weeks,white,widely,withheld,woods,world,worse,writer,yalova,yelova
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0


In [182]:
# Get list of names, in the same order as the rows of D_dropped
names_dropped = ['aljazeera', 'bbc', 'breitbart', 'cnn', 'fox']

# Create a dictionary for D_dropped values: each name maps to its row of word counts
text_dict_dropped = {name: D_dropped.iloc[row].values for row, name in enumerate(names_dropped)}

# Create empty DataFrame
correlation_dropped = pd.DataFrame(columns = names_dropped, index = names_dropped)

# Put cosine values in for the DataFrame values.
# A single .loc[row, column] assignment writes into `correlation_dropped` itself; the
# chained form .iloc[i][j] can write to a temporary copy and leave the table unchanged.
for row_name in names_dropped:
    for col_name in names_dropped:
        correlation_dropped.loc[row_name, col_name] = cosine(text_dict_dropped[row_name], text_dict_dropped[col_name])

In [183]:
# Look at new correlation table: pairwise cosine similarity with stop words removed
correlation_dropped

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,1.0,0.678,0.597,0.532,0.695
bbc,0.678,1.0,0.596,0.501,0.645
breitbart,0.597,0.596,1.0,0.381,0.556
cnn,0.532,0.501,0.381,1.0,0.535
fox,0.695,0.645,0.556,0.535,1.0


In [184]:
# Compare the two tables: element-wise drop in cosine similarity once stop words are removed
# (a positive value means the pair scored higher with stop words included)
correlation - correlation_dropped

Unnamed: 0,aljazeera,bbc,breitbart,cnn,fox
aljazeera,0.0,0.192,0.242,0.202,0.149
bbc,0.192,0.0,0.304,0.242,0.244
breitbart,0.242,0.304,0.0,0.305,0.314
cnn,0.202,0.242,0.305,0.0,0.205
fox,0.149,0.244,0.314,0.205,0.0


#### Discussion

Initially, all of the news reports appeared to be very similar to each other - except for the Breitbart-CNN pair, every pair of sources had a cosine similarity score above 0.7, which is very high. BBC had the highest cosine similarity scores - 0.9 with Breitbart, 0.889 with Fox, and 0.87 with Aljazeera. Breitbart and Fox were also very similar, at 0.87, which suggests that the three most closely linked were BBC, Breitbart, and Fox. CNN and Breitbart were the least similar, at 0.686, though this is still relatively high.

Once the common words were removed, every score dropped below 0.7, with most of them below 0.6. The most similar sources became Aljazeera, BBC, and Fox, with all of their pairwise cosine similarity scores above 0.6. Breitbart's scores dropped a lot, with its lowest similarity score still being with CNN, at 0.381. This is a very dramatic drop from 0.686. BBC and Breitbart's similarity changed a lot as well, from 0.9 to 0.596.

In general, Breitbart had the biggest changes, with decreases of more than 0.3 against BBC, CNN, and Fox, whereas most of the other pairs dropped by closer to 0.2. This suggests that much of Breitbart's apparent similarity to the other sources came from common words. Aljazeera, on the other hand, had the smallest changes - only a difference of 0.149 with Fox and 0.192 with BBC.