# Project Milestone 2

In [None]:
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import bz2
import json
from urllib.parse import urlparse
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import os
import glob
import pickle

## Part 1: Filtering

In [3]:
#Searching for Missing Values
def MissingValuesFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')

        #Gives True as a result, if the lines where NaN are presents are empty 
        #(meaning there are no NaNs):
        missing_nan = pd.DataFrame(np.where(chunk.isnull().any(axis=1))).empty
        #Gives True as a result if the lignes and corresponding columns where 
        #zeros, '' and None are found are empty:
        missing_zeros = pd.DataFrame(np.where(chunk==0)).empty 
        missing_space = pd.DataFrame(np.where(chunk=='')).empty
        missing_none = pd.DataFrame(np.where(chunk==None)).empty

        print('This chunk does not contain missing values')
        print(missing_nan, missing_zeros, missing_space, missing_none) 

PATH = './quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    MissingValuesFilter_chunk(chunk)

Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 rows
This chunk does not contain missing values
True True True True
Processing chunk with 250000 r

In [5]:
#Checking if Number of Occurrences is coherent
def NumOccurrencesFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
        size = len(chunk)
        #To have a number of occurrences equal zero has no meaning, so here it checks if this data set has a logical 
        #number of occurrences
        if chunk['numOccurrences'].all() == np.array(size*[0]).all():
          print('Number of Occurrences impossible')
        else :
          print('Number of Occurences possible')

PATH = './quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    NumOccurrencesFilter_chunk(chunk)

Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences posible
Processing chunk with 250000 rows
Number of Occurences 

In [7]:
#Checking the probabilities of the autors
def ProbasFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
        size = len(chunk)
        #Here it is checked that the first probability for the autor is higher than 50%
        #If this probability is lower, we will chose to discard the corresponding quotation because we consider that 
        #we are not sure enough of who is the autor of the quotation
        if chunk['probas'].str[0].str[1].astype(float).all() < np.array(size*[0.5]).all():
          print('Probability of Autors of quotation too low')
        else :
          print('Probability of Autors higher than 50%')

PATH = './quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    ProbasFilter_chunk(chunk)

Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher than 50%
Processing chunk with 250000 rows
Probability of Autors higher t

In [8]:
#Checking if the autor is the one with the highest probability of having generated the quotation
def ProbasFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
  
        if chunk['speaker'].all() != chunk['probas'].str[0].str[0].all():
          print('The Autor does not correspond to the high probability Autor')
        else :
          print('The Autor has a high probability of having generated the quotation')

PATH = './quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    ProbasFilter_chunk(chunk)

Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the quotation
Processing chunk with 250000 rows
The Autor has a high probability of having generated the 

In [None]:
#Filtering the Data with Streaming 

#THIS PART IS ONLY NEEDED IF THE TESTS FROM BEFORE SHOWED PROBLEMS
#This is not the case for the year 2020 that was checked here, but could be for the other years

#If the above checks show that there is a missing values/incoherence or problem in the data for any year, 
#this code could remove the above mentionned problems and creates a new file with the filtered data, that could be used 
#for the rest of the project

path_to_file = './quotes-2020.json.bz2' 
path_to_out = './quotes-2020-filtered.json.bz2'


with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            #Removes '', None, 0 and False from the quotation in instance:
            if any([bool(instance['quotation']) == True for quotation in instance]): 
              #If the probability the quotation comes from a speaker is lower 
              #than 50% than we discard this probability (as not high enough):
              if float(instance['probas'][0][1]) > 0.5: 
                #Removes the lines that are not coherents with respect to the speakers probabilities
                if instance['speaker'] == instance['probas'][0][0]: 
                  if instance['numOccurrences'] != 0: #Removes incoherent occurences
                    d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file


## Part 2: Feasability of the Project

### Creation of the useful DataFrames

### 1) Streaming to create a new file

In [12]:
#Creation of a new file with a new column for the domain (cf colab)
from tld import get_tld

def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.tld

In [13]:
#Creation of a file with additional columns: the domain and the name of the intenet website
path_to_file = 'quotes-2017.json.bz2' 
path_to_out = '2017a.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            website = []
            for url in urls:
                tld = get_domain(url)
                domains.append(tld)
                net = urlparse(url)
                neto=net.netloc
                website.append(neto)
            instance['domains'] = domains # updating the sample with domain name
            instance['website'] = website
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

### 2) Cleanning and Storage of the Data for One Year

In [51]:
#Cleaning of the 2020 Data
df = pd.DataFrame() #Defining the DataFrame that will contain the wanted chucks

def process_chunk(chunk, df):
        df_1 = pd.DataFrame() #Initialising intermediate DataFrame
        df_2 = pd.DataFrame() #Initialising intermediate DataFrame

        # Calculates duplicated rows in the chunk
        num_duplicates = len(chunk[chunk.duplicated(subset=["quoteID"])])
        print("There were {} duplicated rows".format(num_duplicates))

        # Removes duplicated rows
        df = chunk.drop_duplicates(subset=["quoteID"], keep="first")

        df = chunk.drop(['quoteID', 'qids', 'numOccurrences', 'probas', 'urls', 'phase', 'domains'], axis=1) #axis=1 for columns
        
        df['website'] = df['website'].astype('str') #Converting to string to be able to use str.contains 
        df_1 = df[df['website'].str.contains('foxnews')] #Creating a DataFrame containing only foxnews
        df_2 = df[df['website'].str.contains('nytimes')] #Creating a DataFrame containing only nytimes
        df = pd.concat([df_1, df_2])
        #Some speakers had their names not starting with a capslocks. This line of code corrects it:
        df["speaker"] = df["speaker"].apply(str.title)

        print(df.shape)
        print(f'Processing chunk with {len(chunk)} rows')
        return df
        

for chunk in pd.read_json('./2020a.json.bz2', lines=True, compression='bz2', chunksize=250000):
    df = process_chunk(chunk, df)
    with open('ReadyToUseData.pickle', 'ab') as file: #Opening pickle file in append mode, to keep the previous data of the file
        pickle.dump(df, file) #Adding the new chunks to the pickle file

There were 0 duplicated rows
(4538, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4590, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4473, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4520, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4754, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4642, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4508, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4665, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4530, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4586, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4617, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4428, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4680, 4)
Processing chunk with 250000 rows
There were 0 duplicated rows
(4736, 4)
Processing c

### 3) Handling the entire Data Set

Here a DataFrame is created over the 5 years period, with the quotations containing a specific word

In [13]:
#Here "immigrat" is chosen as the key word
path = os.getcwd()
json_files = glob.glob(os.path.join(path, "*.json.bz2"))
lista='quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences','probas', 'urls', 'phase', 'domains', 'website'
df2=pd.DataFrame(columns=lista) 

def process_chunk(chunk,df2):
        immi=pd.DataFrame()
        immi=chunk[chunk['quotation'].str.contains("immigrat")]
        df2 = df2.append(immi)
        return df2
    
for f in json_files:      
    for chunk in pd.read_json(f, lines=True, compression='bz2', chunksize=100000, encoding='utf-8'):
        df2=process_chunk(chunk,df2)
    print('File finished')
print('Folder finished')

File finished
Folder finished


In [16]:
#Dropping dupliactes, if any
num_duplicates2 = len(df2[df2.duplicated(subset=["quoteID"])])
print("There were {} duplicated rows".format(num_duplicates2))
duplica2 = df2.drop_duplicates(subset=["quoteID"], keep="first")

There were 0 duplicated rows


In [17]:
#Creation of the dataframe from the 5 years with the citations of a particular website
path = os.getcwd()
json_files = glob.glob(os.path.join(path, "*.json.bz2"))
lista='quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences','probas', 'urls', 'phase', 'domains', 'website'
df3=pd.DataFrame(columns=lista) 

def process_chunk(chunk,df3):
        chunk['website']=chunk['website'].astype('str')
        journal=pd.DataFrame()
        journal=chunk[chunk['website'].str.contains("www.foxnews.com")]
        df3 = df3.append(journal)
        return df3
    
for f in json_files:      
    for chunk in pd.read_json(f, lines=True, compression='bz2', chunksize=25000, encoding='utf-8'):
        df3=process_chunk(chunk,df3)
    print('File finished')
print('Folder finished')

File finished
Folder finished


In [20]:
df3.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains,website
314,2020-01-28-013666,Coach is an awesome dude to play for. He is 10...,Eric Fisher,"[Q30075239, Q5386483]",2020-01-28 12:30:44,26,"[[Eric Fisher, 0.9426], [None, 0.0574]]",[https://www.seattletimes.com/sports/nba/chief...,E,"[com, com, com, com, com, com, com, com, com, ...","['www.seattletimes.com', 'www-1.thenewstribune..."
409,2020-02-20-015931,"Finally, we reached the point a few weeks ago ...",Elizabeth Warren,[Q434706],2020-02-20 00:00:00,2,"[[Elizabeth Warren, 0.7201], [None, 0.2508], [...",[http://feeds.foxnews.com/~r/foxnews/politics/...,E,"[com, com]","['feeds.foxnews.com', 'www.foxnews.com']"
421,2020-02-07-020769,"For example, multiple employees recalled a per...",,[],2020-02-07 00:00:00,1,"[[None, 0.6712], [Andrew Yang, 0.3288]]",[http://www.foxnews.com/media/new-york-times-a...,E,[com],['www.foxnews.com']
458,2020-01-31-022641,Given the partisan nature of this impeachment ...,Lisa Murkowski,[Q22360],2020-01-31 00:00:00,24,"[[Lisa Murkowski, 0.6433], [None, 0.224], [Joh...",[http://feeds.foxnews.com/~r/foxnews/politics/...,E,"[com, com, com, com, com, com, com, com, com, ...","['feeds.foxnews.com', 'www.foxnews.com', 'www...."
516,2020-01-08-022441,He was someone you could lean on.,Mark Dantonio,[Q6767270],2020-01-08 00:00:00,8,"[[Mark Dantonio, 0.7737], [None, 0.208], [Geor...",[https://www.foxnews.com/sports/george-perles-...,E,"[com, com, com, com, com, com, com, com]","['www.foxnews.com', 'crainsdetroit.com', 'detr..."


In [21]:
#Dropping dupliactes, if any
num_duplicates3 = len(df3[df3.duplicated(subset=["quoteID"])])
print("There were {} duplicated rows".format(num_duplicates3))
duplica3 = df3.drop_duplicates(subset=["quoteID"], keep="first")

There were 0 duplicated rows


In [22]:
#Checking that the column was well added
lis=df2['website'].value_counts().to_frame()
lis.head(5)

Unnamed: 0,website
[www.breitbart.com],67
[www.foxnews.com],28
[www.msn.com],25
[www.politico.com],22
[www.independent.co.uk],21


In [23]:
#Visualisation of higher quoted speakers
lis2=df3['speaker'].value_counts().to_frame()
lis2.head(5)

Unnamed: 0,speaker
,10923
President Trump,779
Joe Biden,489
Bernie Sanders,441
President Donald Trump,414


### Sentimental Analysis

In [24]:
#Changing the website column type to string
df2['website']=df2['website'].astype('str')

In [39]:
#Creation of DataFrames for some journals
fox=df2[df2['website'].str.contains("www.foxnews.com")]
ny=df2[df2['website'].str.contains("www.nytimes.com")]
brei=df2[df2['website'].str.contains("www.breitbart.com")]
cnn=df2[df2['website'].str.contains("cnn.com")]
guard=df2[df2['website'].str.contains("www.theguardian.com")]
slate=df2[df2['website'].str.contains("slate.com")]
buzz=df2[df2['website'].str.contains("buzzfeed.com")]

In [40]:
fox_quotes=fox['quotation']
ny_quotes=ny['quotation']
brei_quotes=brei['quotation']
cnn_quotes=cnn['quotation']
guard_quotes=guard['quotation']
slate_quotes=slate['quotation']
buzz_quotes=buzz['quotation']

In [41]:
sia = SentimentIntensityAnalyzer()

In [42]:
fox_length=len(fox)
ny_length=len(ny)
brei_length=len(brei)
cnn_length=len(cnn)
guard_length=len(guard)
slate=len(slate)
buzz_length=len(buzz)

In [43]:
#Sentimental Analysis for positif and negatif feelings for the previously chosen articles with df2
pos=0
neg=0
average_pos=0
average_neg=0
for quotation in fox_quotes:
    result=sia.polarity_scores(quotation)
    pos+=result["pos"]
    neg+=result["neg"]
average_pos=pos/fox_length
average_neg=neg/fox_length
print('Mean of positif and negatif feelings found respectively are:', average_pos,average_neg)

Mean of positif and negatif feelings found respectively are: 0.11661666666666663 0.08851666666666672


In [47]:
pos=0
neg=0
average_pos=0
average_neg=0
for quotation in ny_quotes:
    result=sia.polarity_scores(quotation)
    pos+=result["pos"]
    neg+=result["neg"]
average_pos=pos/ny_length
average_neg=neg/ny_length
print('Mean of positif and negatif feelings found respectively are:', average_pos,average_neg)

Mean of positif and negatif feelings found respectively are: 0.12173076923076925 0.0723846153846154


By comparing the results of the sentimental analysis on both medias (Foxnews and New York Times), we can see that the New York Times is more positive about immigration while Foxnews is more negative. Nevertheless, one should check if these differences are significative or not.  

In [44]:
df3_quotes=df3['quotation']
df3size=len(df3_quotes)

In [45]:
#Sentimental Analysis for positif and negatif feelings for the previously chosen articles with df3
pos=0
neg=0
average_pos=0
average_neg=0
for quotation in df3_quotes:
    result=sia.polarity_scores(quotation)
    pos=pos+result["pos"]
    neg=neg+result["neg"]
average_pos=pos/df3size
average_neg=neg/df3size
print('Mean of positif and negatif feelings found respectively',average_pos,average_neg)

Mean of positif and negatif feelings found respectively 0.11050858949963867 0.0794943585345589


### Time analysis

In [48]:
#Creation of a mask to analyse all the quotations from a certain time period. Here, we use the dataframe with foxnews's quotations
mask = (df3['date'] > '2020-03-19') & (df3['date'] <= '2020-04-30')
df_time=df3.loc[mask]
df_time.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,domains,website
981,2020-04-09-028183,in the pocket of China,Tom Cotton,[Q3090307],2020-04-09 00:00:00,2,"[[Tom Cotton, 0.7595], [None, 0.2075], [Tedros...",[http://www.foxnews.com/media/sen-tom-cotton-w...,E,"[com, com]","['www.foxnews.com', 'www.newsmax.com']"
1010,2020-04-10-024002,"It doesn't appear salvageable,",,[],2020-04-10 00:00:00,29,"[[None, 0.6247], [Donovan Mitchell, 0.1627], [...",[http://www.foxnews.com/sports/utah-jazz-rudy-...,E,"[com, com, com, com, com, com, com, com.au, co...","['www.foxnews.com', 'www.sportsoutwest.com', '..."
1771,2020-04-08-052001,"The Pacific commander, Admiral Davidson, order...",Kirk Lippold,[Q6415477],2020-04-08 00:00:00,1,"[[Kirk Lippold, 0.8545], [None, 0.1455]]",[http://www.foxnews.com/media/ex-uss-cole-comm...,E,[com],['www.foxnews.com']
2829,2020-03-30-008233,"Certainly, he has a lot of great qualities tha...",Matthew Slater,"[Q52906896, Q6791235]",2020-03-30 21:15:42,5,"[[Matthew Slater, 0.5999], [None, 0.3473], [Ja...",[https://www.bostonherald.com/2020/03/30/patri...,E,"[com, com, com, com, com]","['www.bostonherald.com', 'www.foxnews.com', 'w..."
4966,2020-03-25-093901,While withholding the notes and many other exa...,,[],2020-03-25 00:00:00,43,"[[None, 0.9002], [Lori Loughlin, 0.0828], [Wil...",[http://kaaltv.com/national-news/loughlin-gian...,E,"[com, com, com, com, com, com, com, com, com, ...","['kaaltv.com', 'kstp.com', 'www.nbcchicago.com..."


### Most commom words 

In [50]:
#Most common words used in each newspaper for the year 2020
from collections import Counter
Counter(" ".join(df3["quotation"]).split()).most_common(100)
output = [' '.join([word
   for word in sentence.split() if len(word) > 4
]) for sentence in df3["quotation"]]
Counter(" ".join(output).split()).most_common(100)

[('going', 3202),
 ('people', 3025),
 ('about', 2788),
 ('think', 2640),
 ('their', 2399),
 ("don't", 2050),
 ('would', 2019),
 ('because', 1766),
 ('there', 1558),
 ('really', 1457),
 ('other', 1214),
 ('Trump', 1198),
 ('these', 1195),
 ('right', 1185),
 ('those', 1151),
 ('should', 1139),
 ("that's", 1122),
 ("we're", 1094),
 ('could', 1080),
 ('which', 1023),
 ('where', 998),
 ('being', 925),
 ('president', 922),
 ('every', 856),
 ('President', 813),
 ('health', 810),
 ('things', 792),
 ('never', 779),
 ('American', 749),
 ('doing', 749),
 ('first', 746),
 ("didn't", 732),
 ('through', 707),
 ("you're", 689),
 ("can't", 675),
 ('something', 666),
 ("That's", 655),
 ('still', 651),
 ("they're", 648),
 ('There', 633),
 ('after', 629),
 ('great', 614),
 ('public', 611),
 ('Biden', 610),
 ('against', 604),
 ('country', 594),
 ('thing', 591),
 ("We're", 590),
 ('that,', 572),
 ('Democratic', 564),
 ('years', 555),
 ('Bernie', 551),
 ('always', 544),
 ('trying', 534),
 ('believe', 518),


Here one can see that the most used words are connectors or verbs. The next step would be to print only the most occuring nouns or proper nouns. This could be done using NLTK library.