# PPOL564 | Data Science 1: Foundations | Coding discussion 04
####  Alvaro Altamirano Montoya

#### 1 : Import Libraries, set WD, and load files.

In [25]:
# 1.1 Importing required libraries
import numpy as np, pandas as pd, os

# 1.2 Set paths and read txt files
path1 = r'C:\Users\unily\Documents\Georgetown\PPOL 564 - Intro to Data Science\Coding discussions\4\texts'
os.chdir(path1) # Set WD
# Read the full content of every .txt file under path1 into a dictionary
# keyed by file name.
texts = {}
for path, subdirs, files in os.walk(path1):
    for name in files:
        # Skip anything that is not a text file (e.g. OS metadata files),
        # which would otherwise be read and treated as a document.
        if not name.endswith('.txt'):
            continue
        # Build the path from the directory currently being walked so the read
        # does not depend on the working directory, and use a context manager
        # so the file handle is always closed.
        with open(os.path.join(path, name), 'r', encoding = 'utf-8', newline = '') as f:
            # read() keeps the whole document; readline() would stop at the
            # first line break and silently drop the rest of the article.
            texts[name] = f.read()

# 1.3 Loading/defining stopwords
# The stop-word table sits one directory above the texts folder, so switch the
# working directory there before reading it by its relative name.
path2 = r'C:\Users\unily\Documents\Georgetown\PPOL 564 - Intro to Data Science\Coding discussions\4'
os.chdir(path2)
# Keep only the 'word' column, as a plain Python list.
stopwords = pd.read_csv('stop_words.csv').loc[:, 'word'].tolist()
# Show a small slice as a sanity check that the list loaded.
print(stopwords[slice(23, 32)])

['always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody']


In [26]:
# Sanity check: the keys of `texts` are the names of the files that were loaded.
print(texts.keys())

dict_keys(['aljazeera-khashoggi.txt', 'bbc-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt'])


#### 2: Tokenization, DTM, and cosine functions

In [27]:
#### Tokenizer function
def tokenizeText(text = None, stop_words = None):
    '''
    tokenizeText lower-cases the text, splits it on whitespace into single
    words, and removes stopwords.
    ----------------------------
    Args: 
    text: a string to tokenize. None (the default) is treated as an empty
          document.
    stop_words: optional iterable of words to drop. When None (the default)
          the module-level 'stopwords' list is used. Pass an empty list to
          keep every token.
    ----------------------------
    Output:
    A list of lower-cased tokens, in their original order, with the stopwords
    removed. Punctuation attached to a word is kept as part of the token.
    '''
    # Guard the declared default: None has no .lower() and would raise.
    if text is None:
        return []
    if stop_words is None:
        stop_words = stopwords
    # A set makes each membership test O(1) instead of scanning the whole list.
    excluded = set(stop_words)
    return [tok for tok in text.lower().split() if tok not in excluded]

#### Text to DTM function
def convert_text_to_dtm(txt):
    '''
    convert_text_to_dtm turns one document into a single-row term-frequency
    table.
    ----------------------------
    Args: 
    txt: the document text; it is passed through tokenizeText first.
    ----------------------------
    Output:
    A pandas DataFrame with one column per distinct token (in order of first
    appearance) and one row holding how many times each token occurs. A
    document that yields no tokens gives an empty DataFrame.
    '''
    counts = {}
    for token in tokenizeText(txt):
        counts[token] = counts.get(token, 0) + 1
    # Wrap each count in a one-element list so the frame has exactly one row.
    return pd.DataFrame({token: [n] for token, n in counts.items()})

#### DTM function
def gen_DTM(texts=None):
    '''
    gen_DTM generates a document term matrix from a collection of documents.
    ----------------------------
    Args: 
    texts: an iterable of strings, one per document. None (the default) or an
           empty iterable gives an empty DataFrame.
    ----------------------------
    Output:
    Document term frequency matrix: one row per document (in input order),
    one column per token (sorted), with 0 where a token does not occur in a
    document. A document that produces no tokens contributes no row.
    '''
    if texts is None:
        return pd.DataFrame()

    # Build every single-row table first and bind them once. DataFrame.append
    # was removed in pandas 2.0, and appending inside a loop re-copies the
    # growing frame on every iteration.
    rows = [convert_text_to_dtm(text) for text in texts]
    if not rows:
        # pd.concat raises on an empty list, so return the empty matrix directly.
        return pd.DataFrame()
    DTM = pd.concat(rows, ignore_index=True, sort=True) # Row bind

    # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM.fillna(0)

#### Cosine distance function
def cosine(a,b):
    '''
    Calculates the cosine similarity of two vectors
    ----------------------------
    Args: 
    a, b: numeric vectors (array-likes accepted by np.dot) of the same length
    ----------------------------
    Output:
    The dot product of a and b divided by the product of their Euclidean
    norms
    '''
    numerator = np.dot(a,b)
    norm_a = np.sqrt(np.dot(a,a))
    norm_b = np.sqrt(np.dot(b,b))
    return numerator/(norm_a * norm_b)

#### Apply gen_DTM function on all texts

In [28]:
# One row per document, in the same order as the entries of `texts`.
df = gen_DTM(texts.values())
# Label the rows with the keys of `texts` rather than `files`: `files` is only
# the loop variable left over from os.walk in the loading cell, so relying on
# it is hidden state and it is not guaranteed to match what was actually read.
df.index = list(texts.keys())
# Data wrangling: transpose so each document becomes a column, and discard the
# word labels so only the numeric count columns remain.
df = df.T.reset_index(drop=True)
df.columns.name = None

### Question: Does each news site report on these stories in a similar way?

#### They report the topic using similar observations, but with different language and focus.

In [29]:
# Renaming the columns
# Swap each source file name for a short outlet label.
df = df.rename(
    {
        'aljazeera-khashoggi.txt': 'Aljazeera',
        'bbc-khashoggi.txt': 'BBC',
        'breitbart-khashoggi.txt': 'Breitbart',
        'cnn-khashoggi.txt': 'CNN',
        'fox-khashoggi.txt': 'FOX',
    },
    axis = 'columns',
)

# Apply the cosine function to every pair of outlet columns and show the
# resulting matrix rounded to two decimals.
corr_matrix = df.corr(method=cosine)
print(corr_matrix.round(2))

           Aljazeera   BBC  Breitbart   CNN   FOX
Aljazeera       1.00  0.61       0.50  0.50  0.59
BBC             0.61  1.00       0.51  0.46  0.54
Breitbart       0.50  0.51       1.00  0.33  0.47
CNN             0.50  0.46       0.33  1.00  0.48
FOX             0.59  0.54       0.47  0.48  1.00


### Question: Which news sites talk about the Khashoggi scandal in similar/dissimilar ways? 
#### According to the previous matrix, the BBC and Aljazeera texts are the most similar pair (cosine similarity of 0.61). On the other hand, CNN and Breitbart are the most dissimilar pair (0.33), according to their cosine similarities. 

In [30]:
#### Tokenizer function without removal of stopwords
def tokenizeText(text = None):
    '''
    Tokenizes the text: lower-cases it and breaks it up into single words on
    whitespace, keeping stopwords.

    This definition deliberately replaces the earlier stopword-removing
    tokenizeText, so that gen_DTM picks up this version from here on.
    ----------------------------
    Args: 
    A string 'text' with default value as None (treated as an empty document)
    ----------------------------
    Output:
    A list of lower-cased word tokens in their original order
    '''
    # Guard the declared default: None has no .lower() and would raise.
    if text is None:
        return []
    return text.lower().split()

#### Apply gen_DTM function on all texts using NEW tokenization function (without stopwords)
# One row per document, in the same order as the entries of `texts`.
df = gen_DTM(texts.values())
# Label the rows with the keys of `texts` rather than `files`: `files` is only
# the loop variable left over from os.walk in the loading cell, so relying on
# it is hidden state and it is not guaranteed to match what was actually read.
df.index = list(texts.keys())
# Data wrangling: transpose so each document becomes a column, and discard the
# word labels. Leaving them in as a string 'index' column makes df.corr fail
# on pandas >= 2.0, which no longer drops non-numeric columns silently.
df = df.T.reset_index(drop=True)
df.columns.name = None

# Renaming the columns
df = df.rename(columns = {'aljazeera-khashoggi.txt' : 'Aljazeera',
        'bbc-khashoggi.txt' : 'BBC', 'breitbart-khashoggi.txt' : 'Breitbart',
        'cnn-khashoggi.txt' : 'CNN', 'fox-khashoggi.txt' : 'FOX'})

# Pairwise cosine similarity between the outlet columns, stopwords included.
corr_matrix = df.corr(method=cosine)
print(corr_matrix.round(2))

           Aljazeera   BBC  Breitbart   CNN   FOX
Aljazeera       1.00  0.86       0.82  0.73  0.83
BBC             0.86  1.00       0.89  0.74  0.89
Breitbart       0.82  0.89       1.00  0.69  0.87
CNN             0.73  0.74       0.69  1.00  0.74
FOX             0.83  0.89       0.87  0.74  1.00


### Question: If you change what words you remove, does the picture of similarity change?
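#### Yes. When the stopwords are not removed, as shown in this last table, the cosine similarities are inflated across the board (0.69–0.89 versus 0.33–0.61 before), given the sheer volume of stopwords that every .txt file shares. The ranking shifts as well: BBC–Breitbart and BBC–FOX (0.89) become the most similar pairs, while CNN–Breitbart (0.69) remains the least similar.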