# PPOL564 | Data Science 1: Foundations 

## Coding Discussion 4



#### By      : Sonali Subbu Rathinam 
#### NetID : ss4608
#### Date   : 06/11/2021

In [1]:
#Importing the required modules
import pandas as pd
import numpy as np

#### Function definitions of all the functions used in the notebook

In [2]:
def tokenize(text=None, choice=0):
    """
    tokenize function splits the string passed as argument after performing some modifications - changing all to lower case,
    and removing certain punctuations. Then the string is being split as well. 
    ------------------------------------------------------------------------------------------------------------------------
    Arguments:
    A string that is given the local name text. It's default value is None. 
    Choice, which indicates if stop words will be used or not.Its default value is 0, which is no use of stop-words list
    ------------------------------------------------------------------------------------------------------------------------
    Return Value:
    Returning the modified string
    ------------------------------------------------------------------------------------------------------------------------   
    """
    
    #Changing the string to lower case completely
    text = text.lower()
    
    #Changing certaing punctuations
    text = text.replace('.','')
    text = text.replace('\"','')  
    text = text.replace('\'','')  
    text = text.replace('?','')  
    text = text.replace('(','')
    text = text.replace(')','')
    text = text.replace('[','')
    text = text.replace(']','')
    text = text.replace('“','')
    text = text.replace('”','')
    text = text.replace(',', '')
    
    #Splitting the string
    text_list = text.split()
    
    if(choice==0):
        text_list_final = text_list
    
    #Not including stop words in the modified string
    if(choice==1):
        text_list2 = [word for word in text_list if word not in stop_words_list]
        text_list_final = text_list2

    #Returning the string
    return text_list_final

In [3]:
def convert_text_to_dtm(txt, choice=0):
    """
    convert_text_to_dtm will convert the string passed as an argument into a document-term matrix. It counts the number of 
    times a word is present in the string passed as argument and returns the result as a dataframe.
    ------------------------------------------------------------------------------------------------------------------------
    Arguments:
    A string that is given the local name txt. 
    Choice, which indicates if stop words will be used or not. Its default value is 0, which is no use of stop-words list
    ------------------------------------------------------------------------------------------------------------------------
    Return value:
    Returning a dataframe, which is the document term matrix of the string
    ------------------------------------------------------------------------------------------------------------------------
    """
    
    #Initialising empty dictionary
    d = dict()
    
    #For loop to count the number of times a word is present in a string
    for word in tokenize(txt, choice):  #Here we are calling the tokenize function so that the string is modified 
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
            
    #Returning the result as a dataframe        
    return pd.DataFrame(d)

In [4]:
def gen_DTM(texts=None, choice=0):
    """
    gen_DTM will create a document term matrix for all the strings present in the argument.
    ------------------------------------------------------------------------------------------------------------------------
    Arguments: 
    A list of strings, which is given the local name texts. Its default value is None. 
    Choice, which indicates if stop words will be used or not. Its default value is 0, which is no use of stop-words list
    ------------------------------------------------------------------------------------------------------------------------
    Return value:
    Returning a Dataframe, which is the document term matrix of all the strings. 
    
    """
    
    #Initializing empty Dataframe
    DTM = pd.DataFrame()
    
    #For loop to create document term matrix
    for text in texts:
        # We are calling convert_text_to_dtm function so that we get the document term matrix of a particular string
        entry = convert_text_to_dtm(text, choice) 
        #Appending each result to the dataframe initialized earlier
        DTM = DTM.append(entry,ignore_index=True, sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    
    #Returning the dataframe
    return DTM

In [5]:
def cosine(a,b):
    """
    cosine function calculates the cos value of the angle between the vectors. It is an indication of how similar the two 
    strings are. 
    ------------------------------------------------------------------------------------------------------------------------
    Arguments:
    Two strings, that are locally named a and b
    ------------------------------------------------------------------------------------------------------------------------
    Return value:
    Returning the result, which the cos value between the two strings as vectors
    -----------------------------------------------------------------------------------------------------------------------
    """
    
    #Calculating the cos
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b)))
    
    #Returning the result
    return cos

In [6]:
#Reading all the news articles

#Reading Al-Jazeera
with open("C:/Users/sonal/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/aljazeera-khashoggi.txt") as f:
    al_jazeera = f.read()
    
#Reading BBC
with open("C:/Users/sonal/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/bbc-khashoggi.txt") as f:
    bbc = f.read()
    
#Reading Breitbart
with open("C:/Users/sonal/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/breitbart-khashoggi.txt",encoding='utf8', errors='ignore') as f:
    breitbart = f.read()
    
#Reading CNN
with open("C:/Users/sonal/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/cnn-khashoggi.txt",encoding='utf8', errors='ignore') as f:
    cnn = f.read()
    
#Reading Fox
with open("C:/Users/sonal/coding_discussions_ppol564_fall2021/04_coding_discussion/Data/fox-khashoggi.txt",encoding='utf8', errors='ignore') as f:
    fox = f.read()

In [7]:
#Calling gen_DTM function and saving the result in dtm_news
dtm_news=gen_DTM([al_jazeera,bbc,breitbart, cnn, fox], 0)

In [8]:
#Checking the result
print(dtm_news)

   $50bn    -    1   11   12   15  15-member   18  1:08    2  ...  world  \
0    0.0  1.0  0.0  0.0  0.0  0.0        1.0  2.0   0.0  2.0  ...    0.0   
1    1.0  2.0  0.0  0.0  1.0  0.0        0.0  2.0   0.0  1.0  ...    1.0   
2    0.0  0.0  0.0  0.0  0.0  1.0        0.0  0.0   0.0  0.0  ...    1.0   
3    0.0  0.0  0.0  0.0  0.0  0.0        0.0  1.0   0.0  0.0  ...    0.0   
4    0.0  0.0  1.0  1.0  0.0  1.0        0.0  1.0   1.0  1.0  ...    0.0   

   worse  would  writer  yalova  yelova  yet  your  â£385bn    —  
0    0.0    1.0     0.0     0.0     0.0  0.0   0.0      0.0  0.0  
1    0.0    4.0     0.0     0.0     0.0  0.0   3.0      1.0  0.0  
2    0.0    1.0     0.0     0.0     0.0  1.0   0.0      0.0  0.0  
3    0.0    0.0     0.0     1.0     0.0  1.0   0.0      0.0  0.0  
4    1.0    2.0     2.0     0.0     1.0  0.0   0.0      0.0  3.0  

[5 rows x 895 columns]


In [9]:
#Obtaining the values of each news article to calculate the angle between vectors

al_jazeera1 = dtm_news.iloc[0].values
bbc1 = dtm_news.iloc[1].values
breitbart1 = dtm_news.iloc[2].values
cnn1 = dtm_news.iloc[3].values
fox1 = dtm_news.iloc[4].values

In [10]:
#Saving the values in a list
listnews = [al_jazeera1, bbc1, breitbart1, cnn1, fox1]

In [11]:
#Initializing an empty list
cos_list = []

#For loop to calculate the cos value of angle between strings and appending it to the list initialised in previous step
for i in range(0,5):
    
    #Initialising an empty temporary list
    temp_list=[]
    
    for j in range(0,5):
        
        #Calculating the cos between two strings and saving the result in temp
        temp = cosine(listnews[i], listnews[j])
        #Appending each result in temp_list
        temp_list.append(temp)
    
    #Appending the temp_list values in cos_list to obtain each row of correlation matrix
    cos_list.append(temp_list)

In [12]:
#Converting cos_list into a dataframe
cos_table = pd.DataFrame(cos_list)

In [13]:
#Printing the correlation matrix
display(cos_table)

Unnamed: 0,0,1,2,3,4
0,1.0,0.870127,0.836452,0.734566,0.841548
1,0.870127,1.0,0.896589,0.743944,0.886466
2,0.836452,0.896589,1.0,0.678944,0.867174
3,0.734566,0.743944,0.678944,1.0,0.736611
4,0.841548,0.886466,0.867174,0.736611,1.0


In [14]:
#Changing the index of the correlation matrix
cos_table = cos_table.rename(index={0: 'Al Jazeera', 1: 'BBC', 2: 'Breitbart', 3: 'CNN', 4:'Fox'})

#Changing the column names
cos_table = cos_table.rename(columns = {0: "Al Jazeera", 1: "BBC", 2: 'Breitbart', 3: 'CNN', 4:'Fox'  })

In [15]:
#Chencking the result
display(cos_table)

Unnamed: 0,Al Jazeera,BBC,Breitbart,CNN,Fox
Al Jazeera,1.0,0.870127,0.836452,0.734566,0.841548
BBC,0.870127,1.0,0.896589,0.743944,0.886466
Breitbart,0.836452,0.896589,1.0,0.678944,0.867174
CNN,0.734566,0.743944,0.678944,1.0,0.736611
Fox,0.841548,0.886466,0.867174,0.736611,1.0


From the above result, it can be observed that the highest similarity is between BBC and Breitbart, which is followed by BBC and Fox. The least similarity is between CNN and Breitbart

#### Performing the same tasks after removing the stop-words

In [16]:
#Reading the stop-words file 
stop_words=pd.read_csv("stop_words.csv")

print(stop_words)

         word
0           a
1         a's
2        able
3       about
4       above
..        ...
723      year
724     years
725     young
726   younger
727  youngest

[728 rows x 1 columns]


In [17]:
#Convert stop_words dataframe into a list

stop_words_list = stop_words['word'].values.tolist()

In [18]:
#Calling the previous functions, but this time the choice argument value is 1. 1 Means stop words list will be used
dtm_news_stopwords=gen_DTM([al_jazeera,bbc,breitbart, cnn, fox], 1)

In [19]:
#Checking the result
print(dtm_news_stopwords)

   $50bn    -    1   11   12   15  15-member   18  1:08    2  ...  widely  \
0    0.0  1.0  0.0  0.0  0.0  0.0        1.0  2.0   0.0  2.0  ...     0.0   
1    1.0  2.0  0.0  0.0  1.0  0.0        0.0  2.0   0.0  1.0  ...     1.0   
2    0.0  0.0  0.0  0.0  0.0  1.0        0.0  0.0   0.0  0.0  ...     0.0   
3    0.0  0.0  0.0  0.0  0.0  0.0        0.0  1.0   0.0  0.0  ...     0.0   
4    0.0  0.0  1.0  1.0  0.0  1.0        0.0  1.0   1.0  1.0  ...     0.0   

   withheld  woods  world  worse  writer  yalova  yelova  â£385bn    —  
0       0.0    0.0    0.0    0.0     0.0     0.0     0.0      0.0  0.0  
1       0.0    0.0    1.0    0.0     0.0     0.0     0.0      1.0  0.0  
2       1.0    0.0    1.0    0.0     0.0     0.0     0.0      0.0  0.0  
3       0.0    0.0    0.0    0.0     0.0     1.0     0.0      0.0  0.0  
4       0.0    1.0    0.0    1.0     2.0     0.0     1.0      0.0  3.0  

[5 rows x 670 columns]


In [20]:
#Obtaining the values of each news article to calculate the angle between vectors

al_jazeera2 = dtm_news_stopwords.iloc[0].values
bbc2 = dtm_news_stopwords.iloc[1].values
breitbart2 = dtm_news_stopwords.iloc[2].values
cnn2 = dtm_news_stopwords.iloc[3].values
fox2 = dtm_news_stopwords.iloc[4].values

In [21]:
#Saving the values in a list
listnews2 = [al_jazeera2, bbc2, breitbart2, cnn2, fox2]

In [22]:
#Initializing an empty list
cos_list2 = []

#For loop to calculate the cos value of angle between strings and appending it to the list initialised in previous step
for i in range(0,5):
    
    #Initialising an empty temporary list
    temp_list2=[]
    
    for j in range(0,5):
        
        #Calculating the cos between two strings and saving the result in temp
        temp2 = cosine(listnews2[i], listnews2[j])
        #Appending each result in temp_list
        temp_list2.append(temp2)
    
    #Appending the temp_list values in cos_list to obtain each row of correlation matrix
    cos_list2.append(temp_list2)

In [23]:
#Converting cos_list2 into a dataframe
cos_table2 = pd.DataFrame(cos_list2)

In [24]:
#Printing the correlation matrix
display(cos_table2)

Unnamed: 0,0,1,2,3,4
0,1.0,0.677223,0.583719,0.532865,0.679414
1,0.677223,1.0,0.578985,0.503408,0.626283
2,0.583719,0.578985,1.0,0.367165,0.547029
3,0.532865,0.503408,0.367165,1.0,0.517322
4,0.679414,0.626283,0.547029,0.517322,1.0


In [25]:
#Changing the index of the correlation matrix
cos_table2 = cos_table2.rename(index={0: 'Al Jazeera', 1: 'BBC', 2: 'Breitbart', 3: 'CNN', 4:'Fox'})

#Changing the column names
cos_table2 = cos_table2.rename(columns = {0: "Al Jazeera", 1: "BBC", 2: 'Breitbart', 3: 'CNN', 4:'Fox'  })

In [26]:
#Checking the result
display(cos_table2)

Unnamed: 0,Al Jazeera,BBC,Breitbart,CNN,Fox
Al Jazeera,1.0,0.677223,0.583719,0.532865,0.679414
BBC,0.677223,1.0,0.578985,0.503408,0.626283
Breitbart,0.583719,0.578985,1.0,0.367165,0.547029
CNN,0.532865,0.503408,0.367165,1.0,0.517322
Fox,0.679414,0.626283,0.547029,0.517322,1.0


From the above result, we can see that after using stop words, all the cosine values have dropped. Now, the highest similarity is between Al Jazeera and Fox, which is closely followed by Al Jazeera and BBC. The least similarity is between Breitbart and CNN.