# Coding Discussion 04
### Ella Zhang

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in the data
def _read_text(path):
    """Return the full contents of the text file at `path`, closing the handle afterwards."""
    # NOTE(review): the platform default encoding is used, as before; if the
    # articles are UTF-8, passing encoding='utf-8' would make this portable - confirm.
    with open(path, 'r') as handle:
        return handle.read()

alj = _read_text('../Data/aljazeera-khashoggi.txt')
bbc = _read_text('../Data/bbc-khashoggi.txt')
bre = _read_text('../Data/breitbart-khashoggi.txt')
cnn = _read_text('../Data/cnn-khashoggi.txt')
fox = _read_text('../Data/fox-khashoggi.txt')
stop_words = pd.read_csv('../Data/stop_words.csv')

# Convert the `word` column of the stopword table into a plain list
stopwords = stop_words.word.to_list()

In [3]:
def tokenize(text=None, stopword_list=None):
    """
    Lower-case a string, strip punctuation, split it on whitespace and drop stopwords.

    Args:
        text (str): the string to tokenize. ``None`` or an empty string yields
            an empty list.
        stopword_list (iterable of str, optional): words to exclude. When omitted,
            the module-level ``stopwords`` list is looked up at call time, so
            rebinding that global between calls changes the result.

    Returns:
        list: the remaining words, in their original order
    """

    if not text:
        return []

    if stopword_list is None:
        stopword_list = stopwords
    # A set gives constant-time membership tests instead of scanning a list per word
    excluded = set(stopword_list)

    # Clean up the string. The fragments are removed in this exact order:
    # the contractions ('s, 're, 've, ’s) must go before the bare apostrophe,
    # otherwise their trailing letters would be left glued to the word.
    # Note that '-' is deleted (not replaced by a space), so hyphenated
    # words are fused into a single token.
    text = text.lower()
    fragments = (
        '.', ',', '-', ':', '?',
        '(', ')', '[', ']',
        '"', '“', '”',
        "'s", "'re", "'ve", "’s", "'",
    )
    for fragment in fragments:
        text = text.replace(fragment, '')

    # Separate words on whitespace and remove stopwords
    return [word for word in text.split() if word not in excluded]

In [4]:
def convert_text_to_dtm(txt):
    '''
    Build a one-row document term matrix for a single text.

    Args:
        txt (str): the text to tokenize and count

    Returns:
        DataFrame: one column per distinct token (in order of first appearance),
        holding the number of times that token occurs in the text
    '''

    # Tally how often each token produced by tokenize() occurs
    tallies = {}
    for token in tokenize(txt):
        tallies[token] = tallies.get(token, 0) + 1

    # Wrap every count in a one-element list so each token becomes a column
    return pd.DataFrame({token: [count] for token, count in tallies.items()})

In [5]:
def gen_DTM(texts=None):
    '''
    Generate a document term matrix from several texts.

    Args:
        texts (list): a list of strings that needs to be converted.
            ``None`` or an empty list yields an empty DataFrame.

    Returns:
        DataFrame: one row per text (in input order) and one column per word,
        with columns sorted; each cell is the number of times the word appears
        in that text, 0 where it does not appear
    '''

    if texts is None:
        return pd.DataFrame()

    # Build every single-row frame first, then row-bind once.
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # inside a loop copies all previous rows on every iteration.
    rows = [convert_text_to_dtm(text) for text in texts]
    if not rows:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    DTM = pd.concat(rows, ignore_index=True, sort=True)

    # Make the column order explicit instead of relying on concat's sort rules
    DTM = DTM.sort_index(axis=1)

    # Words missing from a text come out of concat as NaN; they are zero counts
    return DTM.fillna(0)

In [6]:
# Generate a document term matrix for all news reports.
# Rows follow the order of the list passed in: alj, bbc, bre, cnn, fox.
D = gen_DTM([alj, bbc, bre, cnn, fox])
D

Unnamed: 0,$50bn,1,108,11,12,15,15member,18,2,28,...,widely,withheld,woods,world,worse,writer,yalova,yelova,£385bn,—
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0,0.0,3.0


In [7]:
def cosine(a,b):
    '''
    Compute the cosine similarity of two equal-length numeric vectors.

    Args:
        a,b (array-like): two arrays of numbers of the same length

    Returns:
        float : the cosine of the angle between a and b. For vectors with
        non-negative entries (such as word counts) this lies between 0 and 1.
        If either vector has zero length, 0.0 is returned instead of dividing
        by zero.
    '''

    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # Product of the two Euclidean norms
    norm_product = np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b))
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as sharing nothing
        return 0.0

    return float(np.dot(a,b) / norm_product)

In [8]:
# Pairwise cosine similarity between the rows (documents) of D.
# The labels follow the order in which the texts were passed to gen_DTM.
site_labels = ['alj', 'bbc', 'bre', 'cnn', 'fox']
doc_vectors = D.values

# Build the whole matrix at once, with labels and a numeric dtype, instead of
# filling an object-dtype frame cell by cell through label-as-position lookups
D_sim = pd.DataFrame(
    [[cosine(u, v) for v in doc_vectors] for u in doc_vectors],
    index=site_labels,
    columns=site_labels,
)
D_sim

Unnamed: 0,alj,bbc,bre,cnn,fox
alj,1.0,0.705115,0.602549,0.590484,0.7149
bbc,0.705115,1.0,0.622728,0.556871,0.687858
bre,0.602549,0.622728,1.0,0.416451,0.575259
cnn,0.590484,0.556871,0.416451,1.0,0.593481
fox,0.7149,0.687858,0.575259,0.593481,1.0


Each news site reports on the Khashoggi scandal in a slightly different way. Aljazeera, BBC, and Fox cover the story in relatively similar ways, with pairwise cosine similarities of around 0.7. Breitbart and CNN have the most different stories, with a cosine similarity of about 0.42.

In [9]:
# Keep only the first half of the stopwords.
# Slice from the full list held in `stop_words` rather than from `stopwords`
# itself, so re-running this cell does not halve the list again.
all_stopwords = stop_words.word.to_list()
stopwords = all_stopwords[:len(all_stopwords) // 2]

In [10]:
# Generate a new document term matrix for all news reports.
# tokenize() reads the module-level `stopwords` list when it is called, so this
# run uses the shortened list assigned in the previous cell.
D2 = gen_DTM([alj, bbc, bre, cnn, fox])
D2

Unnamed: 0,$50bn,1,108,11,12,15,15member,18,2,28,...,world,worse,would,writer,yalova,yelova,yet,your,£385bn,—
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,1.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,2.0,2.0,0.0,1.0,0.0,0.0,0.0,3.0


In [11]:
# Pairwise cosine similarity between the rows (documents) of D2.
# The labels follow the order in which the texts were passed to gen_DTM.
site_labels = ['alj', 'bbc', 'bre', 'cnn', 'fox']
doc_vectors = D2.values

# Build the whole matrix at once, with labels and a numeric dtype, instead of
# filling an object-dtype frame cell by cell through label-as-position lookups
D2_sim = pd.DataFrame(
    [[cosine(u, v) for v in doc_vectors] for u in doc_vectors],
    index=site_labels,
    columns=site_labels,
)
D2_sim

Unnamed: 0,alj,bbc,bre,cnn,fox
alj,1.0,0.869378,0.836775,0.765647,0.857789
bbc,0.869378,1.0,0.906174,0.737645,0.892726
bre,0.836775,0.906174,1.0,0.663834,0.863968
cnn,0.765647,0.737645,0.663834,1.0,0.742315
fox,0.857789,0.892726,0.863968,0.742315,1.0


If I remove only half of the stopwords, the similarity between all news sites increases, because the stopwords that remain are common words shared by every article. BBC and Breitbart now have the most similar stories, with a cosine similarity above 0.9.