# Coding Discussion #4
#### Morgan Zimmerman

In [184]:
import numpy as np
import pandas as pd
import requests

In [185]:
# Folder holding the five news stories and the stop-word list
data_dir = '/Users/morganzimmerman/Desktop/GitHub/coding_discussions_ppol564_fall2021/04_coding_discussion/Data'

# The encoding is given explicitly so the curly quotes and dashes in the
# stories decode the same way on every platform instead of depending on the
# OS default (assumes the files are saved as UTF-8 -- confirm).

# Read in aljazeera-khashoggi.txt as story1
with open(data_dir + '/aljazeera-khashoggi.txt', encoding='utf-8') as aljazeera:
    story1 = aljazeera.read()

# Read in bbc-khashoggi.txt as story2
with open(data_dir + '/bbc-khashoggi.txt', encoding='utf-8') as bbc:
    story2 = bbc.read()

# Read in breitbart-khashoggi.txt as story3
with open(data_dir + '/breitbart-khashoggi.txt', encoding='utf-8') as breitbart:
    story3 = breitbart.read()

# Read in cnn-khashoggi.txt as story4
with open(data_dir + '/cnn-khashoggi.txt', encoding='utf-8') as cnn:
    story4 = cnn.read()

# Read in fox-khashoggi.txt as story5
with open(data_dir + '/fox-khashoggi.txt', encoding='utf-8') as fox:
    story5 = fox.read()

# Import stop_words file; its 'word' column is flattened to a plain list
stop_words = pd.read_csv(data_dir + '/stop_words.csv')
stop_words_list = stop_words['word'].tolist()

In [186]:
# Collect the five stories in one list; the position in this list
# (Aljazeera, BBC, Breitbart, CNN, FOX) is the order they are processed in
text = [
    story1,
    story2,
    story3,
    story4,
    story5,
]

In [187]:
# Every character tokenize() blanks out: the listed ASCII and curly
# punctuation marks, '$' and '£', and the ten digits. Each one is mapped to a
# space (not deleted) so that the words on either side still split apart.
_STRIP_CHARS = '.!?,()[];:"\'”“/-—_$£0123456789'
_STRIP_TABLE = str.maketrans(dict.fromkeys(_STRIP_CHARS, ' '))


def tokenize(text=None):
    '''
    This function takes a string of text and outputs a list of all words from the string in lowercase,
    with the punctuation marks and digits listed in _STRIP_CHARS removed.

    Characters that are not in _STRIP_CHARS (for example the curly apostrophe in "we’re") are left
    untouched, so they stay inside the token they belong to.

    Arguments
    ---------
    text: string of text; None is treated as an empty document

    Return
    ------
    list: list of lowercase words, split on whitespace after the characters above are replaced by spaces
    '''
    if text is None:
        # The signature advertises None as the default, so don't crash on it
        return []

    # One translate() pass replaces the whole set of characters at once
    return text.lower().translate(_STRIP_TABLE).split()

In [188]:
def exclude(text_list, stop_words=None):
    '''
    This function takes a list of words and eliminates any words that are stop words

    Arguments
    ---------
    text_list: list of words
    stop_words: optional iterable of words to drop; when omitted, the module-level
                stop_words_list (loaded from the stop_words.csv file) is used

    Return
    ------
    list: the words of text_list, in their original order, that are not stop words
    '''
    if stop_words is None:
        stop_words = stop_words_list

    # Build the set once so each membership test is a hash lookup instead of
    # a scan through the whole stop-word list
    blocked = set(stop_words)
    return [word for word in text_list if word not in blocked]

In [189]:
def DTM(final_list):
    '''
    This function counts how often each word occurs in a list and returns the
    counts as a single-row document term matrix

    Arguments
    ---------
    final_list: list of words

    Return
    ------
    dataframe: pandas dataframe with one column per distinct word (in order of
               first appearance) and one row holding that word's count
    '''
    tally = {}
    for token in final_list:
        tally[token] = tally.get(token, 0) + 1

    # Wrap each count in a list so pandas builds exactly one row
    return pd.DataFrame({token: [count] for token, count in tally.items()})

In [190]:
def gen_DTM(texts=None):
    '''
    This function applies the functions tokenize(), exclude(), and DTM() to each news story
    and stacks the results into one document term matrix

    Arguments
    ---------
    texts: list of text for each news story. When omitted, the module-level
           `text` list is used (this function previously always read that
           list, so calls without an argument keep working).

    Returns
    -------
    dataframe: pandas dataframe in the form of a document term matrix, one row
               per story (in input order), columns sorted alphabetically, and
               0 where a word does not occur in a story
    '''
    if texts is None:
        texts = text

    # One single-row frame of word counts per story
    frames = [DTM(exclude(tokenize(story))) for story in texts]
    if not frames:
        # pd.concat raises on an empty list; return an empty matrix instead
        return pd.DataFrame()

    # Row bind all stories at once (DataFrame.append was removed in pandas 2.0),
    # then sort the columns explicitly so the layout does not depend on
    # whether the stories happen to share a vocabulary
    final_DTM = pd.concat(frames, ignore_index=True).sort_index(axis=1)

    # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return final_DTM.fillna(0)

In [191]:
def cosine(a,b):
    '''
    This function calculates the cosine of the angle between two vectors:
    their dot product divided by the product of their lengths

    Arguments
    ---------
    a, b: two equal-length numeric vectors (e.g. rows of word counts)

    Return
    ------
    float: cosine of the angle between a and b. If either vector is all
           zeros the denominator is zero and the result is not meaningful.
    '''
    length_a = np.sqrt(np.dot(a, a))
    length_b = np.sqrt(np.dot(b, b))
    return np.dot(a, b) / (length_a * length_b)

In [196]:
# Run gen_DTM function on all five stories to tokenize the text and get counts of each word.
# The result has one row per story and one column per word.
df = gen_DTM(text)
df  # last expression in the cell, so the notebook displays the matrix

Unnamed: 0,abdulaziz,absent,accident,accidentally,account,accounts,accusation,accusing,acknowledged,added,...,we’re,white,widely,withheld,woods,world,worse,writer,yalova,yelova
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0


In [197]:
# Calculate the cosine value between each pair of stories and collect the
# results in a labelled table.

# Outlet names in the same order as the rows of df (which follows the order
# of the `text` list the matrix was built from)
outlets = ['Aljazeera', 'BBC', 'Breitbart', 'CNN', 'FOX']

# Pull each story's word-count vector out of the dataframe once
vectors = [df.iloc[row].values for row in range(len(outlets))]

# One dict per row of the table. A story compared with itself is 1 by
# definition; every other cell is the cosine value rounded to 4 decimals.
cos_values = [
    {
        other: 1 if i == j else round(cosine(vectors[i], vectors[j]), 4)
        for j, other in enumerate(outlets)
    }
    for i in range(len(outlets))
]

# Index the dataframe
cos_df = pd.DataFrame(cos_values, index=outlets)

# Print the data
print(cos_df)

           Aljazeera     BBC  Breitbart     CNN     FOX
Aljazeera     1.0000  0.7043     0.5868  0.6102  0.7161
BBC           0.7043  1.0000     0.5877  0.5518  0.6674
Breitbart     0.5868  0.5877     1.0000  0.3856  0.5553
CNN           0.6102  0.5518     0.3856  1.0000  0.5791
FOX           0.7161  0.6674     0.5553  0.5791  1.0000


# Conclusion
#### Method
After standardizing each news story (converting it to lowercase, removing punctuation and digits, and dropping the words on the 'stop' words list), I calculated the cosine value for each possible pair of stories: the dot product of the two word-count vectors divided by the product of their lengths. The above dataframe (cos_df) shows the cosine value for each pair of stories. A story evaluated against itself produces a cosine value of 1 (since the two vectors are exactly the same). Therefore, the higher the value we get from the cosine calculation, the more similar those two stories are.

#### Background
For context, Aljazeera is a news source headquartered in the Middle East. BBC (the British Broadcasting Corporation) is a news source from the United Kingdom. Then there is Breitbart, which is an American far-right news source started by conservative commentator Andrew Breitbart. Two more competing news sources in America are CNN and FOX, where CNN is known for holding a bias in favor of the Democratic Party, while FOX is typically biased towards Republicans.

#### Observations
When comparing the resulting cosine values, we see that these five news sites report on the same story in dissimilar ways. The most similarly reported pair of news stories is Aljazeera and FOX (0.7161), with Aljazeera and BBC (0.7043) as a close second. We expect the Middle East news source and FOX to be comparable, as both are right-wing and present stories through a biased lens. Additionally, Aljazeera and BBC are rather similar, maybe because they are both international news sources (not headquartered in the United States). CNN and Breitbart are the least similar stories, with a cosine value of 0.3856. This is because Breitbart is very right-wing, while CNN is very left-wing. We'd expect the two to have opposing viewpoints and to present stories in a way that represents those views. In conclusion, this assignment verifies that stories may be presented differently based on the source they come from and the bias behind each broadcasting channel.