# Coding Discussion 4
## Data Science I - PPOL 564
Asif Bhatti - 
Sun, November 7

In [1]:
# Setup and import of modules 
import numpy as np
import pandas as pd

### A) Load in the all data to text read 

In [2]:
# Read Al-Jazeera 
with open('../Data/aljazeera-khashoggi.txt') as f:
    aljaz = f.read()

# Read BBC
with open('../Data/BBC-khashoggi.txt') as f:
    bbc = f.read()

# Read Breitbart
with open('../Data/breitbart-khashoggi.txt', encoding='utf8') as f:
    breit = f.read()

# Read Fox
with open('../Data/fox-khashoggi.txt', encoding='utf8') as f:
    fox = f.read()

# Read CNN
with open('../Data/cnn-khashoggi.txt') as f:
    cnn = f.read()

In [3]:
# convert the stop-words csv to a list (like for like file type)
stopwords = pd.read_csv("../Data/stop_words.csv")
stopwords2 = stopwords['word'].to_list()

## B.1) Develop key functions needed to transform data

In [4]:
def tokenize(text=None):
    """
    tokenize function converts text into a list of words while removing non-letter characters.
    Input: string
    Return: a list of strings
    """
    text = text.lower()
    text = text.replace('.',' ')
    text = text.replace(',', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace("’", ' ')
    text = text.replace("'", ' ')
    text = text.replace(':', ' ')
    text = text.replace('“', ' ')
    text = text.replace('”', ' ')
    text = text.replace('—', ' ')
    text = text.replace('"', ' ')
    text = text.replace('[', ' ')
    text = text.replace(']', ' ')
    text = text.replace('?', ' ')
    text = text.replace('!',' ')
    text = text.replace('/',' ')
    text = text.replace('\\', ' ')
    text = text.replace('-', ' ')
    text = text.replace('$', '')
    text = text.replace('£', '')
    text = text.replace('0', '')
    text = text.replace('1', '')
    text = text.replace('2', '')
    text = text.replace('3', '')
    text = text.replace('4', '')
    text = text.replace('5', '')
    text = text.replace('6', '')
    text = text.replace('7', '')
    text = text.replace('8', '')
    text = text.replace('9', '')
    text_list = text.split()
    return text_list

In [5]:
def convert_text_to_dtm(txt):
    '''
    Converts text into a document term matrix
    Input: text strings
    Returns: dataframe 
    '''
    d = dict()
    for word in tokenize(txt):
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

In [6]:
def gen_DTM(texts=None):
    '''
    Generate a document term matrix
    Input: iterable object
    Return: data frame array
    '''
    DTM = pd.DataFrame()
    for text in texts:
        entry = convert_text_to_dtm(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

In [7]:
def similarity(x,y):
    """
    cosine_fun takes two arrays and finds the cosine (i.e., similarity)
    Input: two arrays
    Return: cosine value of the items in array
    """
    cos= (np.dot(x,y)/( np.sqrt(np.dot(x,x)) * np.sqrt(np.dot(y,y))))
    return cos

## B.2) Manipulate actual dataset and run the transformations 
*WITHOUT REMOVING STOPWORDS*

In [8]:
df = gen_DTM([aljaz, bbc, breit, fox, cnn])
df # return / show document term martix for each source. 

Unnamed: 0,a,abdulaziz,able,about,absent,accident,accidentally,according,account,accounts,...,working,world,worse,would,writer,yalova,year,yelova,yet,your
0,11,0.0,0.0,2,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,0.0,0.0,2,1.0,0.0,0.0,1.0,1.0,2.0,...,0.0,1.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,3.0
2,11,2.0,0.0,2,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,16,0.0,0.0,4,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,2.0,2.0,0.0,0.0,1.0,0.0,0.0
4,14,0.0,1.0,1,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [9]:
# New dictionary for each article - key is article source, value is count of values from above table
dict = {
'aljaz':df.iloc[0].values,
'bbc':df.iloc[1].values,
'breit':df.iloc[2].values,
'cnn':df.iloc[3].values,
'fox':df.iloc[4].values
}

In [10]:
# convert dict into a pandas dataframe
con_df = pd.DataFrame(dict)

In [11]:
# Identify the correlation / similarity of the articles using the similarity function defined above
similarity = con_df.corr(method=similarity)

# displaying dataframe as an heatmap 
# with diverging colourmap as virdis
similarity.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '20px'})

Unnamed: 0,aljaz,bbc,breit,cnn,fox
aljaz,1.0,0.874271,0.841171,0.847628,0.765426
bbc,0.874271,1.0,0.900876,0.891472,0.759528
breit,0.841171,0.900876,1.0,0.86925,0.703809
cnn,0.847628,0.891472,0.86925,1.0,0.768582
fox,0.765426,0.759528,0.703809,0.768582,1.0


##### Reflection
In absence of removing the stopwords it is hard to discern differences between articles. Though, Fox does have the least similarity across all other four sources with cosine values in the mid to low 70s.  

## C.1) Develop key functions needed to transform data 
*REMOVING STOPWORDS*

In [12]:
# SAME function as above with the added removal of stopwords
def tokenize_STOP(text=None):
    """
    tokenize function converts text into a list of words while removing non-letter characters.
    Input: string
    Return: a list of strings
    """
    text = text.lower()
    text = text.replace('.',' ')
    text = text.replace(',', ' ')
    text = text.replace('(', ' ')
    text = text.replace(')', ' ')
    text = text.replace("’", ' ')
    text = text.replace("'", ' ')
    text = text.replace(':', ' ')
    text = text.replace('“', ' ')
    text = text.replace('”', ' ')
    text = text.replace('—', ' ')
    text = text.replace('"', ' ')
    text = text.replace('[', ' ')
    text = text.replace(']', ' ')
    text = text.replace('?', ' ')
    text = text.replace('!',' ')
    text = text.replace('/',' ')
    text = text.replace('\\', ' ')
    text = text.replace('-', ' ')
    text = text.replace('$', '')
    text = text.replace('£', '')
    text = text.replace('0', '')
    text = text.replace('1', '')
    text = text.replace('2', '')
    text = text.replace('3', '')
    text = text.replace('4', '')
    text = text.replace('5', '')
    text = text.replace('6', '')
    text = text.replace('7', '')
    text = text.replace('8', '')
    text = text.replace('9', '')
    text_list = text.split()
    text_list_STOP = [word for word in text_list if word not in stopwords2]
    return text_list_STOP

In [13]:
# SAME function as above with added removal of stop functions
def convert_text_to_dtm_STOP(txt):
    '''
    Converts text into a document term matrix
    Input: text strings
    Returns: dataframe
    '''
    d = {}
    for word in tokenize_STOP(txt):
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

In [14]:
# SAME function as above with added removal of stop functions
def gen_DTM_STOP(texts=None):
    '''
    Generate a document term matrix
    Input: iterable object
    Return: data frame array
    '''
    DTM = pd.DataFrame()
    for text in texts:
        entry = convert_text_to_dtm_STOP(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM

## C.2) Manipulate actual dataset and run the transformations 
*REMOVING STOPWORDS*

In [15]:
df_rmv = gen_DTM_STOP([aljaz, bbc, breit, fox, cnn])
df_rmv # return / show document term martix for each source. 

Unnamed: 0,abdulaziz,absent,accident,accidentally,account,accounts,accusation,accusing,acknowledged,added,...,weeks,white,widely,withheld,woods,world,worse,writer,yalova,yelova
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# SAME AS ABOVE - New dictionary for each article - key is article source, value is count of values from above table
dict_2 = {
'aljaz':df_rmv.iloc[0].values,
'bbc':df_rmv.iloc[1].values,
'breit':df_rmv.iloc[2].values,
'cnn':df_rmv.iloc[3].values,
'fox':df_rmv.iloc[4].values
}

In [17]:
# SAME AS ABOVE - convert dict into a pandas dataframe
df_rmv = pd.DataFrame(dict_2)

In [18]:
# Bringing back this function to define again
def similarity(x,y):
    """
    cosine_fun takes two arrays and finds the cosine (i.e., similarity)
    Input: two arrays
    Return: cosine value of the items in array
    """
    cos= (np.dot(x,y)/( np.sqrt(np.dot(x,x)) * np.sqrt(np.dot(y,y))))
    return cos

In [19]:
# SAME AS ABOVE - Identify the correlation / similarity of the articles using the similarity function defined above
similarity_rmv = df_rmv.corr(method = similarity)

# displaying dataframe as an heatmap 
# with diverging colourmap as virdis
similarity_rmv.style.background_gradient(cmap ='viridis')\
        .set_properties(**{'font-size': '20px'})

Unnamed: 0,aljaz,bbc,breit,cnn,fox
aljaz,1.0,0.704258,0.601929,0.721042,0.610199
bbc,0.704258,1.0,0.622518,0.69496,0.551774
breit,0.601929,0.622518,1.0,0.582348,0.42462
cnn,0.721042,0.69496,0.582348,1.0,0.608453
fox,0.610199,0.551774,0.42462,0.608453,1.0


In removing the stopwords we can see more clear seperation between the sources of the article. Where previously, only Fox was really distinguishable from the others now Breitbart is more different than the others as well.

What is interesting is that fox and breitbart are not as similar as Al-Jazeera and CNN are. 

Nonetheless, a more detailed anaylsis would be necessary (e.g., sentiment)