# Coding Discussion No. 4
## Name: Sahithi Adari
### Date: 11/01/20

In [23]:
#Imported packages
import numpy as np
import pandas as pd
from string import punctuation
import requests

In [24]:
#Converted the csv file into a dataframe 
df_stopwords = pd.read_csv('stop_words.csv')

#Coverted the dataframe into a list
stopwords = df_stopwords['word'].values.tolist()

One note: I noticed when I was cleaning up the article text of punctuation marks, even if an article had quotation marks some of them were mapped differently than regular quotation marks. That is to say '“' was understood to be different than '"' by the computer. We can show that difference by running the following functions: `ord('“')` and `ord('"')`.

In [25]:
ord('“')

8220

In [4]:
ord('"')

34

In [10]:
#Created a function that will take a txt file and change it into a dataframe
def txt_to_dtm(txt):
    '''
    Takes a txt file and returns it as a dataframe. The first part of the function also does some basic text clean up
    by removing any punctuation found (using the 'string.punctuation' function), and removing any commonly found words
    from the 'stopwords' csv. The second part of the function tracks how many times a specific word shows up in the
    original txt file.

    Args:
        txt (txt file): a txt file of the article

    Returns:
        DataFrame: frame containing a tally of how often a unique word showed up in the article
    '''
    with open(txt) as file:
        article = file.read().lower().replace('“', '').replace('”', '').replace('’','').split() #Removed any unique charecters
        temp_article = [s.strip(punctuation) for s in article] #Removed any punctuation marks at the beginning and end of each word
        clean_article = [word for word in temp_article if word not in stopwords] #Removed any commonly used words
    d = dict()
    for letter in clean_article: #Counted how often a specific word shows up in the txt file
        if letter in d:
            d[letter][0] += 1
        else:
            d[letter] = [1]
    return pd.DataFrame(d)

In [11]:
#Created a function that coverts multiple txt articles in dataframes
def gen_DTM(*texts):
    '''
    Takes multiple txt files and returns it as a dataframe by passing through the 'txt_to_dtm' function and then appending
    the individual dataframe to an overall dataframe called 'DTM'.

    Args:
        *texts (txt files): multiple txt files

    Returns:
        DTM: a dataframe appeneded with all the individual dataframes generated from 'txt_to_dtm'
    '''
    DTM = pd.DataFrame()
    for a in texts:
        entry = txt_to_dtm(a) #Passed each article into the 'txt_to_dtm' function
        DTM = DTM.append(entry, ignore_index = True) #Appeneded the dataframe from 'txt_to_dtm' to 'DTM'
    DTM.fillna(0, inplace=True) #Filled in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

In [12]:
#Created a function that easily calculates the angle between two vectors
def cosine(a,b):
    '''
    A function that easily calculates the angle between two vectors.
    Args:
        a,b : two vectors of the same length

    Returns:
        cos: the angle between the two vectors
    '''
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b)))
    return cos

In [13]:
#Created a function that calcuate the similarity between a combination of different
def article_similar(*docs):
    '''
    Takes multiple txt files and calculates the similarity between the texts by passing the articles through the
    'txt_to_dtm', 'gen_DTM' functions first and then calcuates the cosine. This funtion also goes through all
    possible combinations of articles.

    Args:
        *docs (txt files): multiple txt files

    Returns:
        A printed label of what articles are being compared and the respective similarity between the two articles
    '''
    D = gen_DTM(*docs) #Passed *docs through 'gen_DTM' and set that equal to D
    for i in range(5):
        for j in range(i+1,5):
            print("Cosine similarity between", docs[i], "and", docs[j], "is",
                  round((cosine(D.iloc[i].values, D.iloc[j].values)), 4))

In [14]:
article_similar('bbc-khashoggi.txt', 'aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt')

Cosine similarity between bbc-khashoggi.txt and aljazeera-khashoggi.txt is 0.6951
Cosine similarity between bbc-khashoggi.txt and breitbart-khashoggi.txt is 0.5813
Cosine similarity between bbc-khashoggi.txt and cnn-khashoggi.txt is 0.5205
Cosine similarity between bbc-khashoggi.txt and fox-khashoggi.txt is 0.6506
Cosine similarity between aljazeera-khashoggi.txt and breitbart-khashoggi.txt is 0.5825
Cosine similarity between aljazeera-khashoggi.txt and cnn-khashoggi.txt is 0.5329
Cosine similarity between aljazeera-khashoggi.txt and fox-khashoggi.txt is 0.6778
Cosine similarity between breitbart-khashoggi.txt and cnn-khashoggi.txt is 0.3664
Cosine similarity between breitbart-khashoggi.txt and fox-khashoggi.txt is 0.547
Cosine similarity between cnn-khashoggi.txt and fox-khashoggi.txt is 0.5153


##### Use what we know about (a) reading in text files, (b) data manipulation, and (c) linear algebra to analyze the difference between these documents. Does each news site report on these stories in a similar way? Which news sites talk about the Khashoggi scandal in similar/dissimilar ways? If you change what words you remove, does the picture of similarity change?

As the *article_similar* function shows, the 5 different news sources moderately report on the Khashoggi scandal in the same way. The greatest similarity was found between Al Jazeera & BBC coming in at $0.6951$; with Al Jazeera & Fox at $0.6778$ ; and BBC & Fox at $0.6506$ if we remove all common words and punctuations from the text. The news organizations that were the most dissimilar were Breitbart & CNN at $0.3664$.

But what happens if we leave the common words in the articles instead of removing them? We can do this simply by commenting out a line of code (which I’ve reproduced down below), and making slight tweaks to that same function.

In [None]:
#Same function as above but with commented out code to not remove common words
def txt_to_dtm(txt):
    with open(txt) as file:
        article = file.read().lower().replace('“', '').replace('”', '').replace('’','').split()
        temp_article = [s.strip(punctuation) for s in article]
        #clean_article = [word for word in temp_article if word not in stopwords] 
    d = dict()
    for letter in temp_article: #Changed 'clean_article' to 'temp_article' here
        if letter in d:
            d[letter][0] += 1
        else:
            d[letter] = [1]
    return pd.DataFrame(d)

In [None]:
article_similar('bbc-khashoggi.txt', 'aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt')

When we include common words back into the measurement of similarity we can see that similarity between all 5 news organizations shot up. Most notably, Breitbart & CNN shot up to $0.6793$. This makes sense as the cosine function is simply calculating the "difference of angles" between 2 vectors. The vectors here represent the frequency by which certain words show up; once we include common words back into the measurement, it's only natural that the measurement of similarity would increase.

What happens when we exclude the stripping of punctuation (save for the unique characters) from the texts?

In [None]:
#Same function as the original but with commented out code to not remove punctuation
def txt_to_dtm(txt):
    with open(txt) as file:
        article = file.read().lower().replace('“', '').replace('”', '').replace('’','').split()
        #temp_article = [s.strip(punctuation) for s in article]
        clean_article = [word for word in article if word not in stopwords] #Changed 'temp_article' to 'article' here
    d = dict()
    for letter in clean_article:
        if letter in d:
            d[letter][0] += 1
        else:
            d[letter] = [1]
    return pd.DataFrame(d)

In [None]:
article_similar('bbc-khashoggi.txt', 'aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt')

Once we leave punctuation in the articles sink back down to original levels. This goes to show that while removing punctuation helps generate a closer estimate in terms of similarity it doesn’t have to be necessary.

Lastly, what happens if we add "turkey", "khashoggi" and "erdogan" to our common words list? We can do that by adding those 3 values to *df_stopwords* and resaving it as a new dataframe.

In [20]:
#Converted the csv file into a dataframe
stopwords_turkey = df_stopwords.append({'word': 'turkey', 'word': 'khashoggi', 'word': 'erdogan'}, ignore_index = True)

#Coverted the dataframe into a list
stopwords = stopwords_turkey['word'].values.tolist()

In [21]:
#The original function
def txt_to_dtm(txt):
    with open(txt) as file:
        article = file.read().lower().replace('“', '').replace('”', '').replace('’','').split()
        temp_article = [s.strip(punctuation) for s in article]
        clean_article = [word for word in temp_article if word not in stopwords]
    d = dict()
    for letter in clean_article:
        if letter in d:
            d[letter][0] += 1
        else:
            d[letter] = [1]
    return pd.DataFrame(d)

In [22]:
article_similar('bbc-khashoggi.txt', 'aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt')

Cosine similarity between bbc-khashoggi.txt and aljazeera-khashoggi.txt is 0.6755
Cosine similarity between bbc-khashoggi.txt and breitbart-khashoggi.txt is 0.5639
Cosine similarity between bbc-khashoggi.txt and cnn-khashoggi.txt is 0.4843
Cosine similarity between bbc-khashoggi.txt and fox-khashoggi.txt is 0.6331
Cosine similarity between aljazeera-khashoggi.txt and breitbart-khashoggi.txt is 0.564
Cosine similarity between aljazeera-khashoggi.txt and cnn-khashoggi.txt is 0.4666
Cosine similarity between aljazeera-khashoggi.txt and fox-khashoggi.txt is 0.6133
Cosine similarity between breitbart-khashoggi.txt and cnn-khashoggi.txt is 0.3296
Cosine similarity between breitbart-khashoggi.txt and fox-khashoggi.txt is 0.532
Cosine similarity between cnn-khashoggi.txt and fox-khashoggi.txt is 0.4297


Even if we add those 3 values to the common words list there doesn't seem to be that big of a difference between the 5 news organization.

The "biggest" difference we do see is between Al Jazeera & CNN and CNN & Fox. Where as before (removing punctuation and commons words pre-addition of the three above) the similarity between Al Jazeera & Fox was at $0.5329$, that value drops to $0.4666$. For CNN & Fox this value goes from $0.5153$ to $0.4297$.