# Coding Discussion Four

### Imports

In [50]:
import pandas as pd
import numpy as np
from numpy import linalg as LA
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Load Data

In [51]:
# Open Text Files
# Each file is opened in a `with` block so its handle is closed as soon as the
# text has been read (the original left all five handles open). The names
# p1..p5 remain bound to the (now closed) handles after each block, and a
# closed handle still exposes `.name`.
# NOTE(review): the bbc and cnn files are opened without an explicit encoding,
# so the platform default is used - confirm whether utf8 was intended there too.

# Read File and store into variables
with open("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/aljazeera-khashoggi.txt", "r", encoding="utf8") as p1:
    txt1 = p1.read()
with open("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/bbc-khashoggi.txt", "r") as p2:
    txt2 = p2.read()
with open("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/breitbart-khashoggi.txt", "r", encoding="utf8") as p3:
    txt3 = p3.read()
with open("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/cnn-khashoggi.txt", "r") as p4:
    txt4 = p4.read()
with open("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/fox-khashoggi.txt", "r", encoding="utf8") as p5:
    txt5 = p5.read()

In [160]:
# Store each article's company into a dictionary keyed "p1_name" .. "p5_name".
# The company is the part of the file name before the first '-', e.g.
# ".../Data/bbc-khashoggi.txt" -> "bbc". The handles are referenced directly
# instead of being looked up by name through eval().
names = {
    "p" + str(position) + "_name": handle.name.split('/')[-1].split('-')[0]
    for position, handle in enumerate((p1, p2, p3, p4, p5), start=1)
}

{'p1_name': 'aljazeera',
 'p2_name': 'bbc',
 'p3_name': 'breitbart',
 'p4_name': 'cnn',
 'p5_name': 'fox'}

In [220]:
# Two stop-word lists: the base list read from the CSV, and an extended list
# that additionally drops neutral, topic-specific words (names and places)
stop_words = pd.read_csv("coding_discussions_ppol564_fall2021/04_coding_discussion/Data/stop_words.csv")
stop_words_extended = stop_words['word'].tolist()
stop_words_extended.extend(
    ["khashoggi", "saudi", "erdogan", "turkish", "arabia", "istanbul", "salman"]
)

In [221]:
# Build two count vectorizers that differ only in their stop-word list:
# CountVec uses the base list, CountVec2 the extended list
CountVec = CountVectorizer(stop_words=stop_words['word'].tolist())
CountVec2 = CountVectorizer(min_df=1, stop_words=stop_words_extended)

# Fit each vectorizer on the five articles (in the order txt1..txt5) and keep
# the resulting count matrices; CountVec is fitted first, then CountVec2
Count_data, Count_data_2 = (
    vectorizer.fit_transform([txt1, txt2, txt3, txt4, txt5])
    for vectorizer in (CountVec, CountVec2)
)

In [235]:
# First CV
# Sum the counts down each column to get one total per vocabulary term,
# pair every term with its total, and rank the pairs from most to least frequent
sum_words = Count_data.sum(axis=0)
words_freq = sorted(
    ((term, sum_words[0, column]) for term, column in CountVec.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)
print("The five most common words across all texts are: ")
print(words_freq[:5])

The five most common words across all texts are: 
[('saudi', 56), ('erdogan', 49), ('khashoggi', 47), ('consulate', 24), ('turkish', 23)]


In [237]:
# Second CV
# Same ranking as above, but using the vectorizer built with the extended
# stop-word list
sum_words_2 = Count_data_2.sum(axis=0)
words_freq_2 = [
    (term, sum_words_2[0, column])
    for term, column in CountVec2.vocabulary_.items()
]
words_freq_2.sort(key=lambda pair: pair[1], reverse=True)
print("If we remove the most common neutral words we find that the most common words are:")
print(words_freq_2[:5])

If we remove the most common neutral words we find that the most common words are:
[('consulate', 24), ('murder', 22), ('president', 21), ('killing', 21), ('speech', 13)]


In [228]:
# Calculate similarity scores
def _row_as_vector(counts, position):
    """Return row `position` (1-based) of `counts` as a flat float array."""
    row = counts[position - 1, :]
    # Sparse rows have to be densified before taking dot products and norms.
    if hasattr(row, "toarray"):
        row = row.toarray()
    return np.asarray(row, dtype=float).ravel()


def _cosine_similarity(counts, a, b):
    """Cosine similarity between rows `a` and `b` (both 1-based) of `counts`."""
    n_rows = counts.shape[0]
    for position in (a, b):
        # Without this check, position 0 would become index -1 and silently
        # select the last row instead of failing.
        if not 1 <= position <= n_rows:
            raise IndexError(
                "article index {} is out of range; expected 1..{}".format(position, n_rows)
            )

    vec_a = _row_as_vector(counts, a)
    vec_b = _row_as_vector(counts, b)
    denom = LA.norm(vec_a) * LA.norm(vec_b)
    if denom == 0:
        # At least one row is all zeros, so the angle between them is undefined.
        return float("nan")
    # Working on flat vectors makes the dot product a plain scalar, so no
    # int() conversion of a 1x1 matrix is needed.
    return np.dot(vec_a, vec_b) / denom


def sim_scores(a, b, counts=None):

    """
    Cosine similarity between two articles, using the original stop words CV.

    Args:
        a, b: 1-based indices of the two articles to compare.
        counts: optional 2-D count matrix (dense or sparse) with one row per
            article. Defaults to the notebook-level `Count_data`.
    Return: The cosine similarity score, or NaN if either article has no
        counted words.
    Raises: IndexError if `a` or `b` is not between 1 and the number of rows.
    """

    if counts is None:
        counts = Count_data
    return _cosine_similarity(counts, a, b)


def sim_scores_2(a, b, counts=None):

    """
    Cosine similarity between two articles, using the extended stop words CV.

    Args:
        a, b: 1-based indices of the two articles to compare.
        counts: optional 2-D count matrix (dense or sparse) with one row per
            article. Defaults to the notebook-level `Count_data_2`.
    Return: The cosine similarity score, or NaN if either article has no
        counted words.
    Raises: IndexError if `a` or `b` is not between 1 and the number of rows.
    """

    if counts is None:
        counts = Count_data_2
    return _cosine_similarity(counts, a, b)

In [230]:
# Create combination of articles to compare similarity
# (every unordered pair of the 1-based article indices 1..5)
combs = list(combinations(range(1,6),2))

# Resolve the source names once instead of rebuilding the list on every pass
sources = list(names.values())

# Collect one row per pair first, then build the frame in a single step
# rather than growing it row by row with df.loc[len(df)]
rows = []
for first, second in combs:
    similarity = sim_scores(first, second)
    similarity_removed = sim_scores_2(first, second)
    rows.append([
        sources[first - 1],
        sources[second - 1],
        similarity,
        similarity_removed,
        # how much the score drops once the neutral words are removed
        similarity - similarity_removed,
    ])

df = pd.DataFrame(rows, columns = ["source_a", "source_b", "similarity", "similarity_removed", "diff"])
df

Unnamed: 0,source_a,source_b,similarity,similarity_removed,diff
0,aljazeera,bbc,0.704288,0.484164,0.220124
1,aljazeera,breitbart,0.600776,0.416694,0.184082
2,aljazeera,cnn,0.608624,0.311458,0.297166
3,aljazeera,fox,0.719294,0.502585,0.216709
4,bbc,breitbart,0.621047,0.476368,0.144678
5,bbc,cnn,0.550651,0.265059,0.285592
6,bbc,fox,0.692978,0.479765,0.213213
7,breitbart,cnn,0.421972,0.168846,0.253126
8,breitbart,fox,0.581209,0.429816,0.151393
9,cnn,fox,0.605116,0.269107,0.336009


### Findings

From the dataframe above, we can see that with the original stop-word list the most similar pairs of articles are Aljazeera and Fox, and Aljazeera and BBC. However, I was a little skeptical of these findings because the comparison still includes neutral words such as names and the country. I therefore hypothesized that removing these words would give more meaningful results for describing the similarities between articles. Looking at those results, Aljazeera and Fox, and Aljazeera and BBC, are still at the top of the chart; however, other pairings also become more prominent: BBC and Breitbart, and BBC and Fox. To extend the analysis, we could also look at larger groupings of words.