In [1]:
import string
import numpy as np
import pandas as pd
import requests

### Calculate the similarities without stopwords

In [2]:
#load the stop words and make them a list.
stop_word = pd.read_csv("stop_words.csv")
stop_word_list = list(stop_word["word"])

In [3]:
def get_text_txt(address):
    '''
    This function takes a txt file's directory as input and outputs a list that contains each word of the text.
    
    Arguements
    ----------
    address: a txt file's directory.
    
    Return
    ------
    a list that contains each word of the txt file.
    '''
    text_as_list = []#create a new list.
    with open(address) as f:#append each lineofnthe file to the list.
        for line in f:
            text_as_list.append(line)
    text_as_list = text_as_list[0].lower().translate(str.maketrans('', '', string.punctuation)).split()#make the content to lower case, remove all punctuation and split to single word.
    text_as_list2 = [word for word in text_as_list if word not in stop_word_list]#remove all the stop words.
    return text_as_list2

get_text_txt('aljazeera-khashoggi.txt')#test run

['turkey',
 'istanbul',
 'turkish',
 'president',
 'recep',
 'tayyip',
 'erdogan',
 'murder',
 'journalist',
 'jamal',
 'khashoggi',
 'kingdoms',
 'consulate',
 'istanbul',
 'planned',
 'saudi',
 'officials',
 'days',
 'advance',
 'addressing',
 'legislators',
 'justice',
 'development',
 'party',
 'ak',
 'party',
 'tuesday',
 'erdogan',
 'detailed',
 'khashoggis',
 'disappearance',
 'murder',
 'stopped',
 'short',
 'accusing',
 'saudi',
 'royals',
 'savage',
 'killing',
 'caused',
 'global',
 'outrage',
 'september',
 '28',
 'khashoggi',
 'arrived',
 'saudi',
 'arabian',
 'consulate',
 'sort',
 'wedding',
 'paperwork',
 'erdogan',
 'speech',
 'turkish',
 'parliament',
 'capital',
 'ankara',
 'time',
 'saudi',
 'arabian',
 'officials',
 'started',
 'plan',
 'roadmap',
 'murder',
 'added',
 'saudi',
 'officials',
 'left',
 'turkey',
 'travelled',
 'saudi',
 'arabia',
 'indicating',
 'planned',
 'murder',
 'khashoggi',
 '59',
 'washington',
 'post',
 'columnist',
 'critic',
 'powerful',


In [4]:
def convert_text_to_dtm(txt):
    '''
    This function takes a list that contains each word of the text as input and outputs a dataframe that counts the appearance of each word.
    
    Arguements
    ----------
    txt: a list that contains each word of the text.
    
    Return
    ------
    a dataframe that counts the appearance of each word.
    '''
    d = dict()#create a dictionary to count the appearance.
    for word in get_text_txt(txt):#if the word is in the dict, add one to the value. if not, create a new key.
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

convert_text_to_dtm('aljazeera-khashoggi.txt')#test run

Unnamed: 0,turkey,istanbul,turkish,president,recep,tayyip,erdogan,murder,journalist,jamal,...,aljubeir,terrible,tragedy,taha,ozhan,research,director,institute,saudis,cooperation
0,4,5,7,4,1,1,12,6,1,1,...,1,1,1,1,1,1,1,1,2,1


In [5]:
#create a list that contains all the directorys(in this case it's the name because my ipynb file and my data are in the same floder).
texts_list= ['aljazeera-khashoggi.txt', 'bbc-khashoggi.txt', 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt', 'fox-khashoggi.txt']

In [6]:
# Now build a function that does this for a list of texts
def gen_DTM(texts=None):
    '''
    This function Generate a document term matrix.
    
    Arguements
    ----------
    texts: a list that contains all the dirctorys that you want to include.
    
    Return
    ------
    a dataframe that counts the appearance of each word of each txt file.
    '''
    DTM = pd.DataFrame()#create a new dataframe
    for text in texts:#convert the appearance of words of each file and combine them.
        entry = convert_text_to_dtm(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM
      
gen_DTM(texts_list) 

Unnamed: 0,1,108,11,12,15,15member,18,2,28,2r,...,“may,“other,“partner,“putting,“saudi,“the,“to,“we,“were,“why
0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [7]:
def cosine(a,b):
    '''
    This function calculate the the cosine of two vectors
    
    Arguements
    ----------
    a,b: vectors
    
    Return
    ------
    the value of cosine of these two vectors
    '''
    cos = np.dot(a,b)/(np.sqrt(np.dot(a,a)) * np.sqrt(np.dot(b,b)))#calculate the cosine
    return cos

In [8]:
similarities_without_stop = {}#create a new dict.
for i in range(gen_DTM(texts_list) .shape[0]):#calculate the cosine of each two combinaton and append it to the dictionary with there names as key.
    for j in range(i + 1, gen_DTM(texts_list).shape[0]):
        similarities_without_stop[texts_list[i], texts_list[j]] = cosine(gen_DTM(texts_list).iloc[i].values, gen_DTM(texts_list).iloc[j].values)

In [9]:
similarities_without_stop

{('aljazeera-khashoggi.txt', 'bbc-khashoggi.txt'): 0.6789384344078828,
 ('aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt'): 0.5549596552430964,
 ('aljazeera-khashoggi.txt', 'cnn-khashoggi.txt'): 0.5331228099011469,
 ('aljazeera-khashoggi.txt', 'fox-khashoggi.txt'): 0.6575655805929,
 ('bbc-khashoggi.txt', 'breitbart-khashoggi.txt'): 0.5647225050115716,
 ('bbc-khashoggi.txt', 'cnn-khashoggi.txt'): 0.5039192189493414,
 ('bbc-khashoggi.txt', 'fox-khashoggi.txt'): 0.6062429083809844,
 ('breitbart-khashoggi.txt', 'cnn-khashoggi.txt'): 0.3527237380899392,
 ('breitbart-khashoggi.txt', 'fox-khashoggi.txt'): 0.5182983952188844,
 ('cnn-khashoggi.txt', 'fox-khashoggi.txt'): 0.5137505781798409}

### Calculate the similarities with stopwords(in this part, the fuctions are basically the same except a minor change in the first fuction)

In [10]:
def get_text_txt_with(address):
    '''
    This function takes a txt file's directory as input and outputs a list that contains each word of the text.
    
    Arguements
    ----------
    address: a txt file's directory.
    
    Return
    ------
    a list that contains each word of the txt file.
    '''
    text_as_list = []#create a new list.
    with open(address) as f:#append each lineofnthe file to the list.
        for line in f:
            text_as_list.append(line)
    text_as_list = text_as_list[0].lower().translate(str.maketrans('', '', string.punctuation)).split()#make the content to lower case, remove all punctuation and split to single word.
    
    return text_as_list

get_text_txt_with('aljazeera-khashoggi.txt')#test run

['turkey',
 'istanbul',
 'turkish',
 'president',
 'recep',
 'tayyip',
 'erdogan',
 'has',
 'said',
 'the',
 'murder',
 'of',
 'journalist',
 'jamal',
 'khashoggi',
 'at',
 'the',
 'kingdoms',
 'consulate',
 'in',
 'istanbul',
 'was',
 'planned',
 'by',
 'saudi',
 'officials',
 'days',
 'in',
 'advance',
 'addressing',
 'legislators',
 'from',
 'his',
 'justice',
 'and',
 'development',
 'party',
 'ak',
 'party',
 'on',
 'tuesday',
 'erdogan',
 'detailed',
 'khashoggis',
 'disappearance',
 'and',
 'murder',
 'but',
 'stopped',
 'short',
 'of',
 'accusing',
 'saudi',
 'royals',
 'of',
 'the',
 'savage',
 'killing',
 'that',
 'has',
 'caused',
 'global',
 'outrage',
 'on',
 'september',
 '28',
 'khashoggi',
 'arrived',
 'at',
 'the',
 'saudi',
 'arabian',
 'consulate',
 'for',
 'him',
 'to',
 'sort',
 'out',
 'his',
 'wedding',
 'paperwork',
 'erdogan',
 'said',
 'during',
 'the',
 'speech',
 'in',
 'the',
 'turkish',
 'parliament',
 'in',
 'the',
 'capital',
 'ankara',
 'it',
 'seems',


In [11]:
def convert_text_to_dtm_with(txt):
    '''
    This function takes a list that contains each word of the text as input and outputs a dataframe that counts the appearance of each word.
    
    Arguements
    ----------
    txt: a list that contains each word of the text.
    
    Return
    ------
    a dataframe that counts the appearance of each word.
    '''
    d = dict()#create a dictionary to count the appearance.
    for word in get_text_txt_with(txt):#if the word is in the dict, add one to the value. if not, create a new key.
        if word in d:
            d[word][0] += 1
        else:
            d[word] = [1]
    return pd.DataFrame(d)

convert_text_to_dtm_with('aljazeera-khashoggi.txt')#test run

Unnamed: 0,turkey,istanbul,turkish,president,recep,tayyip,erdogan,has,said,the,...,saudis,know,very,well,knows,doing,namely,asking,full,cooperation
0,4,5,7,4,1,1,12,6,6,38,...,2,1,1,1,1,1,1,1,1,1


In [12]:
# Now build a function that does this for a list of texts
def gen_DTM_with(texts=None):
    '''
    This function Generate a document term matrix.
    
    Arguements
    ----------
    texts: a list that contains all the dirctorys that you want to include.
    
    Return
    ------
    a dataframe that counts the appearance of each word of each txt file.
    '''
    DTM = pd.DataFrame()#create a new dataframe
    for text in texts:#convert the appearance of words of each file and combine them.
        entry = convert_text_to_dtm_with(text)
        DTM = DTM.append(pd.DataFrame(entry),ignore_index=True,sort=True) # Row bind
    
    DTM.fillna(0, inplace=True) # Fill in any missing values with 0s (i.e. when a word is in one text but not another)
    return DTM
      
gen_DTM_with(texts_list) 

Unnamed: 0,1,108,11,12,15,15member,18,2,28,2r,...,“may,“other,“partner,“putting,“saudi,“the,“to,“we,“were,“why
0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [13]:
similarities_with_stop = {}#create a new dict.
for i in range(gen_DTM_with(texts_list) .shape[0]):#calculate the cosine of each two combinaton and append it to the dictionary with there names as key.
    for j in range(i + 1, gen_DTM_with(texts_list).shape[0]):
        similarities_with_stop[texts_list[i], texts_list[j]] = cosine(gen_DTM_with(texts_list).iloc[i].values, gen_DTM_with(texts_list).iloc[j].values)

In [14]:
similarities_with_stop

{('aljazeera-khashoggi.txt', 'bbc-khashoggi.txt'): 0.8704785650109934,
 ('aljazeera-khashoggi.txt', 'breitbart-khashoggi.txt'): 0.8307040967375,
 ('aljazeera-khashoggi.txt', 'cnn-khashoggi.txt'): 0.7346592558334714,
 ('aljazeera-khashoggi.txt', 'fox-khashoggi.txt'): 0.837865118014574,
 ('bbc-khashoggi.txt', 'breitbart-khashoggi.txt'): 0.8938858587288907,
 ('bbc-khashoggi.txt', 'cnn-khashoggi.txt'): 0.7440412240226454,
 ('bbc-khashoggi.txt', 'fox-khashoggi.txt'): 0.885635500576476,
 ('breitbart-khashoggi.txt', 'cnn-khashoggi.txt'): 0.681866663912452,
 ('breitbart-khashoggi.txt', 'fox-khashoggi.txt'): 0.865128633261706,
 ('cnn-khashoggi.txt', 'fox-khashoggi.txt'): 0.736447336924943}

We can see that the combination of 'aljazeera-khashoggi.txt', 'bbc-khashoggi.txt' and 'aljazeera-khashoggi.txt', 'fox-khashoggi.txt' have more similarities(without stopwords). 'breitbart-khashoggi.txt', 'cnn-khashoggi.txt' tell the story more differently than others(without stopwords).

It is clear that with stop words, on average the similarities are higher.