In [1]:
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import bz2
import json
from urllib.parse import urlparse
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import os
import glob

## First Step: Filtering

In [None]:
#Searching for Missing Values
def MissingValuesFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')

        #Gives True as a result, if the lines where NaN are presents are empty 
        #(meaning there are no NaNs):
        missing_nan = pd.DataFrame(np.where(chunk.isnull().any(axis=1))).empty
        #Gives True as a result if the lignes and corresponding columns where 
        #zeros, '' and None are found are empty:
        missing_zeros = pd.DataFrame(np.where(chunk==0)).empty 
        missing_space = pd.DataFrame(np.where(chunk=='')).empty
        missing_none = pd.DataFrame(np.where(chunk==None)).empty

        print('This chunk does not contain missing values')
        print(missing_nan, missing_zeros, missing_space, missing_none) 

PATH = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    MissingValuesFilter_chunk(chunk)

In [None]:
#Checking if Number of Occurrences is coherent
def NumOccurrencesFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
        size = len(chunk)
        #To have a number of occurrences equal zero has no meaning, so here it checks if this data set 
        if chunk['numOccurrences'].all() == np.array(size*[0]).all():
          print('Number of Occurrences impossible')
        else :
          print('Number of Occurences posible')

PATH = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    NumOccurrencesFilter_chunk(chunk)

In [None]:
#Checking if the Highest Probability Corresponds to the Autor of the quotation
def ProbasFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
  
        if chunk['probas'].str[0].str[1].astype(float).all() < np.array(size*[0.5]).all():
          print('Probability of Autors of quotation too low')
        else :
          print('Probability of Autors higher than 50%')

PATH = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    ProbasFilter_chunk(chunk)

In [None]:
#Checking if the Autor is the one with the highest probability of having generated the quotation
def ProbasFilter_chunk(chunk):
        print(f'Processing chunk with {len(chunk)} rows')
  
        if chunk['speaker'].all() != chunk['probas'].str[0].str[0].all():
          print('The Autor does not correspond to the high probability Autor')
        else :
          print('The Autor has a high probability of having generated the quotation')

PATH = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2'

for chunk in pd.read_json(PATH, lines=True, compression='bz2', chunksize=250000):
    ProbasFilter_chunk(chunk)

In [None]:
#Filtering the Data with Streaming 

#THIS PART IS ONLY NEEDED IF THE TESTS FROM BEFORE SHOWED PROBLEMS

#If the above checks show that there is a missing values/incoherence or problem
#in the data for any year, this code could remove the above mentionned problems 
#and creates a new file with the filtered data
path_to_file = '/content/drive/MyDrive/Quotebank/quotes-2020.json.bz2' 
path_to_out = '/content/drive/MyDrive/ADA_2021/quotes-2020-filtered.json.bz2'


with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            #Removes '', None, 0 and False from the quotation in instance:
            if any([bool(instance['quotation']) == True for quotation in instance]): 
              #If the probability the quotation comes from a speaker is lower 
              #than 50% than we discard this probability (as not high enough):
              if float(instance['probas'][0][1]) > 0.5: 
                #Removes the lines that are not coherents with respect to the speakers probabilities
                if instance['speaker'] == instance['probas'][0][0]: 
                  if instance['numOccurrences'] != 0: #Removes incoherent occurences
                    d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file


## Part 2: Creation of the DataFrames

### 1) Cleanning and Storage of the Data for One Year

In [None]:
#Cleaning of the 2020 Data
df = pd.DataFrame() #Defining the DataFrame that will contain the wanted chucks

def process_chunk(chunk, df):
        df_1 = pd.DataFrame() #Initialising intermediate DataFrame
        df_2 = pd.DataFrame() #Initialising intermediate DataFrame

        # Calculates duplicated rows in the chunk
        num_duplicates = len(chunk[chunk.duplicated(subset=["quoteID"])])
        print("There were {} duplicated rows".format(num_duplicates))

        # Removes duplicated rows
        df = chunk.drop_duplicates(subset=["quoteID"], keep="first")

        df = chunk.drop(['quoteID', 'qids', 'numOccurrences', 'probas', 'urls', 'phase', 'domains'], axis=1) #axis=1 for columns
        
        df['website'] = df['website'].astype('str') #Converting to string to be able to use str.contains 
        df_1 = df[df['website'].str.contains('foxnews')] #Creating a DataFrame containing only foxnews
        df_2 = df[df['website'].str.contains('nytimes')] #Creating a DataFrame containing only nytimes
        df = pd.concat([df_1, df_2])

        print(df.shape)
        print(f'Processing chunk with {len(chunk)} rows')
        return df
        

for chunk in pd.read_json('/content/drive/MyDrive/ADA_2021/2020a.json.bz2', lines=True, compression='bz2', chunksize=250000):
    df = process_chunk(chunk, df)
    #df.to_pickle('/content/drive/MyDrive/ADA_2021/SelectedData.pkl') #Transforming the df DataFrame into a pickle file
    with open('ReadyToUseData.pickle', 'ab') as file: #Opening pickle file in append mode, to keep the previous data of the file
        pickle.dump(df, file) #Adding the new chunks to the pickle file

### 2) Dealing with the entire Data Set

In [12]:
#Creation of a new file with a new column for the domain (cf colab)
from tld import get_tld

def get_domain(url):
    res = get_tld(url, as_object=True)
    return res.tld

In [13]:
#Creation of a file with additional columns: the domain and the name of the intenet website
path_to_file = 'quotes-2017.json.bz2' 
path_to_out = '2017a.json.bz2'

with bz2.open(path_to_file, 'rb') as s_file:
    with bz2.open(path_to_out, 'wb') as d_file:
        for instance in s_file:
            instance = json.loads(instance) # loading a sample
            urls = instance['urls'] # extracting list of links
            domains = []
            website = []
            for url in urls:
                tld = get_domain(url)
                domains.append(tld)
                net = urlparse(url)
                neto=net.netloc
                website.append(neto)
            instance['domains'] = domains # updating the sample with domain name
            instance['website'] = website
            d_file.write((json.dumps(instance)+'\n').encode('utf-8')) # writing in the new file

In [None]:
#Creation of the DataFrame over the 5 years period, with the quotations containing a specific word. Here "immigrat"is chosen. 
path = os.getcwd()
json_files = glob.glob(os.path.join(path, "*.json.bz2"))
lista='quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences','probas', 'urls', 'phase', 'domains', 'website'
df2=pd.DataFrame(columns=lista) 

def process_chunk(chunk,df2):
        immi=pd.DataFrame()
        immi=chunk[chunk['quotation'].str.contains("immigrat")]
        df2 = df2.append(immi)
        return df2
    
for f in json_files:      
    for chunk in pd.read_json(f, lines=True, compression='bz2', chunksize=100000, encoding='utf-8'):
        df2=process_chunk(chunk,df2)
    print('File finished')
print('Folder finished')

In [None]:
#Dropping dupliactes, if any
duplica2 = df2.drop_duplicates(subset=["quoteID"], keep="first")

In [None]:
#Creation du dataframe à partir des 5 années avec les citations d'un site en particulier
path = os.getcwd()
json_files = glob.glob(os.path.join(path, "*.json.bz2"))
lista='quoteID', 'quotation', 'speaker', 'qids', 'date', 'numOccurrences','probas', 'urls', 'phase', 'domains', 'website'
df3=pd.DataFrame(columns=lista) 

def process_chunk(chunk,df3):
        chunk['website']=chunk['website'].astype('str')
        journal=pd.DataFrame()
        journal=chunk[chunk['website'].str.contains("www.foxnews.com")]
        df3 = df3.append(journal)
        return df3
    
for f in json_files:      
    for chunk in pd.read_json(f, lines=True, compression='bz2', chunksize=25000, encoding='utf-8'):
        df3=process_chunk(chunk,df3)
    print('File finished')
print('Folder finished')

In [None]:
df3.head(10)

In [10]:
#Dropping dupliactes, if any
duplica3 = df3.drop_duplicates(subset=["quoteID"], keep="first")

(3456, 11)

In [40]:
#Checking that the column was well added
lis=df2['website'].value_counts().to_frame()
lis.head(5)

Unnamed: 0,website
['www.msn.com'],1202
['express.co.uk'],1184
['www.cheatsheet.com'],925
['www.bostonglobe.com'],769
['www.foxnews.com'],763
['timesofindia.indiatimes.com'],745
['www.dailystar.co.uk'],742
['www.telegraph.co.uk'],741
['indianexpress.com'],741
['www.thesun.co.uk'],730


In [15]:
#Visualisation of higher quoted speakers
lis2=df3['speaker'].value_counts().to_frame()
lis2.head(5)

Unnamed: 0,speaker
,84128
President Donald Trump,1258
Bernie Sanders,595
Joe Biden,581
President Trump,546
...,...
Kirstjen Nielsen,1
Jennifer Anderson,1
alan tongue,1
Wes Welker,1


### Part 3: Sentimental Analysis

In [3]:
#Changing the website column type to string
df2['website']=df2['website'].astype('str')

In [52]:
#Creation of DataFrames for some journals
fox=df2[df2['website'].str.contains("www.foxnews.com")]
ny=df2[df2['website'].str.contains("www.nytimes.com")]
brei=df2[df2['website'].str.contains("www.breitbart.com")]
cnn=df2[df2['website'].str.contains("cnn.com")]
guard=df2[df2['website'].str.contains("www.theguardian.com")]
slate=df2[df2['website'].str.contains("slate.com")]
buzz=df2[df2['website'].str.contains("buzzfeed.com")]

In [53]:
fox_quotes=fox['quotation']
ny_quotes=ny['quotation']
brei_quotes=brei['quotation']
cnn_quotes=cnn['quotation']
guard_quotes=guard['quotation']
slate_quotes=slate['quotation']
buzz_quotes=buzz['quotation']

In [8]:
sia = SentimentIntensityAnalyzer()

In [8]:
fox.shape
ny.shape
brei.shape
cnn.shape
guard.shape
slate.shape
buzz.shape

(1787, 11)

In [16]:
#Sentimental Analysis for positif and negatif feelings for the previously chosen articles with df2
pos=0
neg=0
average_pos=0
average_neg=0
for quotation in fox_quotes:
    result=sia.polarity_scores(quotation)
    pos=pos+result["pos"]
    neg=neg+result["neg"]
average_pos=pos/1787
average_neg=neg/1787
print('Mean of positif and negatif feelings found respectively', average_pos,average_neg)

0.11182372691661988 0.07998601007274761


In [12]:
df3_quotes=df3['quotation']
df3size=df3_quotes.shape(1)

In [13]:
#Sentimental Analysis for positif and negatif feelings for the previously chosen articles with df3
pos=0
neg=0
average_pos=0
average_neg=0
for quotation in df3_quotes:
    result=sia.polarity_scores(quotation)
    pos=pos+result["pos"]
    neg=neg+result["neg"]
average_pos=pos/df3size
average_neg=neg/df3size
print('Mean of positif and negatif feelings found respectively',average_pos,average_neg)

0.09833275462962988 0.08480266203703696
