# Reading the data and saving it as pickles for all the years

In [None]:
# Mount Google Drive at /content/drive; the data paths used in the cells below
# all live under this mount point.
from google.colab import drive
# NOTE(review): `_mount` is an underscore-prefixed (non-public) helper of
# google.colab.drive - confirm that the public `drive.mount` cannot be used here.
drive._mount('/content/drive')

In [None]:
# Pin pandas to one exact release so every run of the notebook uses the same version.
!pip install pandas==1.0.5

In [None]:
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import bz2
import json
from urllib.parse import urlparse
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import os
import glob
import pickle

## Create a DataFrame for each newspaper


In [None]:
# Function that filters one chunk of the data down to the Fox News / New York
# Times quotations and saves the result as a pickle

def process_chunk(chunk, year, nb, out_dir=None):
    """Keep the Fox News / New York Times rows of one chunk and pickle them.

    Parameters
    ----------
    chunk : pandas.DataFrame
        One chunk of quotation records. It must contain the columns
        'qids', 'phase' (both dropped) and 'urls' (used for filtering).
    year : int or str
        Year the chunk belongs to; used in the folder and file name.
    nb : int or str
        Running number of the chunk within the year; used in the file name.
    out_dir : str, optional
        Folder the pickle is written to. Defaults to
        '/content/drive/MyDrive/ADA_2021/Fox_NY_<year>'.

    Returns
    -------
    None
        The filtered rows are written to '<out_dir>/<nb>_<year>_FoxNYtimes.pkl'.
    """
    if out_dir is None:
        out_dir = '/content/drive/MyDrive/ADA_2021/Fox_NY_' + str(year)
    # to_pickle does not create missing folders, so make sure the target exists
    os.makedirs(out_dir, exist_ok=True)

    df_chunk = chunk.drop(columns=['qids', 'phase'])

    # Converting to string to be able to use str.contains
    df_chunk['urls'] = df_chunk['urls'].astype('str')

    # A single combined mask keeps every matching row exactly once, even when
    # its urls mention both outlets (concatenating two separately filtered
    # frames would write such rows twice).
    from_fox_or_nyt = df_chunk['urls'].str.contains('foxnews|nytimes')
    df = df_chunk[from_fox_or_nyt]

    file_name = str(nb) + '_' + str(year) + '_' + 'FoxNYtimes.pkl'
    df.to_pickle(os.path.join(out_dir, file_name))

In [None]:
# Stream each yearly Quotebank file in chunks of 500000 lines and pass every
# chunk, together with its 1-based position in the year, to process_chunk

years = [2015, 2016, 2017, 2018, 2019, 2020]
for y in years:
    quotes_path = '/content/drive/MyDrive/Quotebank/quotes-' + str(y) + '.json.bz2'
    reader = pd.read_json(quotes_path, lines=True, compression='bz2',
                          chunksize=500000, encoding='utf-8')
    for nb, chunk in enumerate(reader, start=1):
        process_chunk(chunk, y, nb)

In [None]:
# Function that reads the pickles previously created for one year and returns
# them as a list of DataFrames
def read_yearly(y, data_dir=None):
    """Load every chunk pickle of year `y`.

    Parameters
    ----------
    y : int or str
        Year whose pickles are read.
    data_dir : str, optional
        Folder holding the pickles. Defaults to
        '/content/drive/MyDrive/ADA_2021/Fox_NY_<y>/'.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per '<number>_<y>_FoxNYtimes.pkl' file found, ordered by
        the numeric chunk number. The list is empty when no such file exists.
    """
    if data_dir is None:
        data_dir = '/content/drive/MyDrive/ADA_2021/Fox_NY_' + str(y) + '/'
    suffix = '_' + str(y) + '_' + 'FoxNYtimes.pkl'

    # Load the files that are actually present instead of counting every
    # '*.pkl' and guessing the names 1..N: an unrelated pickle in the folder or
    # a gap in the numbering would otherwise end in a FileNotFoundError.
    numbered_paths = []
    for path in glob.glob(os.path.join(data_dir, '*' + suffix)):
        chunk_number = os.path.basename(path)[:-len(suffix)]
        if chunk_number.isdigit():
            numbered_paths.append((int(chunk_number), path))

    # Sort numerically so that chunk 10 comes after chunk 9, not after chunk 1.
    # NOTE(review): unpickling can execute arbitrary code - only read pickles
    # that this notebook produced itself.
    return [pd.read_pickle(path) for _, path in sorted(numbered_paths)]

In [None]:
# Calls the function that reads the pickles for all the years of the data
years = [2015, 2016, 2017, 2018, 2019, 2020]

# Gather the chunks of every year first and concatenate them once:
# DataFrame.append is no longer available in current pandas releases, and
# growing a DataFrame inside a loop copies all previous rows on every pass.
yearly_frames = []
for y in years:
    yearly_frames.extend(read_yearly(y))

# pd.concat raises on an empty list, so fall back to an empty DataFrame
df_foxNY = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# Show three randomly drawn rows as a quick look at the combined DataFrame
df_foxNY.sample(3)

Unnamed: 0,quoteID,quotation,speaker,date,numOccurrences,probas,urls
9604225,2017-08-22-154821,who love our kids.,Peter Sellars,2017-08-22 14:43:19,3,"[[Peter Sellars, 0.8143], [None, 0.1392], [Joh...",['https://www.nytimes.com/2017/08/22/arts/musi...
7272252,2015-04-29-012495,"Money (Burns A Hole In My Pocket),",Dean Martin,2015-04-29 11:18:01,5,"[[Dean Martin, 0.6388], [None, 0.3241], [Jared...",['http://communityvoices.post-gazette.com/arts...
18635814,2017-05-12-029784,He likes to keep the brother role more than te...,Novak Djokovic,2017-05-12 00:45:55,1,"[[Novak Djokovic, 0.7629], [None, 0.2298], [An...",['https://www.nytimes.com/2017/05/11/sports/te...


## Checking if Filtering is Needed

In [None]:
# Count the rows that repeat an earlier row on the identifying columns.
# There are duplicate rows in the DataFrame because sometimes both newspapers
# provide the same quotation.
dup_keys = ['quoteID', 'quotation', 'date', 'urls']
is_repeat = df_foxNY.duplicated(subset=dup_keys)
num_duplicates = int(is_repeat.sum())
print("There are {} duplicated rows".format(num_duplicates))

# Keep only the first occurrence of every repeated row
df_foxNY = df_foxNY[~is_repeat]

There are 19385 duplicated rows


In [None]:
#Searching for Missing Values
def MissingValuesFilter(chunk):
    """Print a report on the missing-value markers found in `chunk`.

    Four kinds of marker are looked for:
      * NaN / null cells (anything `isna()` reports, in any column),
      * zeros in the numeric columns,
      * empty strings '' in the non-numeric columns,
      * the object `None` in the non-numeric columns.

    Two lines are printed: a summary sentence, then four booleans in the order
    (NaN, zeros, empty strings, None). Each boolean is True when that kind of
    marker was NOT found.

    Parameters
    ----------
    chunk : pandas.DataFrame
        The DataFrame to inspect.

    Returns
    -------
    None
    """
    def any_cell(frame, predicate):
        # True when `predicate` holds for at least one cell of `frame`
        return any(bool(column.map(predicate).any()) for _, column in frame.items())

    # Strings are only searched for outside the numeric columns: comparing a
    # numeric column with '' is meaningless and makes NumPy emit a warning.
    non_numeric = chunk.select_dtypes(exclude='number')
    numeric = chunk.select_dtypes(include='number')

    missing_nan = not bool(chunk.isna().any().any())
    missing_zeros = not bool(numeric.eq(0).any().any())
    missing_space = not any_cell(non_numeric, lambda v: isinstance(v, str) and v == '')
    # `frame == None` is False for every cell, so None needs an identity test
    missing_none = not any_cell(non_numeric, lambda v: v is None)

    # Only claim that nothing is missing when all four checks agree
    if missing_nan and missing_zeros and missing_space and missing_none:
        print('This DataFrame does not contain missing values')
    else:
        print('This DataFrame contains missing values')
    print(missing_nan, missing_zeros, missing_space, missing_none)

# Run the missing-value report on the de-duplicated Fox News / NYT DataFrame
MissingValuesFilter(df_foxNY)

  res_values = method(rvalues)


This DataFrame does not contain missing values
True True True True


In [None]:
#Checking if Number of Occurrences is coherent
def NumOccurrencesFilter(chunk):
    """Print whether the 'numOccurrences' column holds only plausible counts.

    A quotation that is present in the data was seen at least once, so a count
    of zero (or less) has no meaning. 'Number of Occurrences impossible' is
    printed when at least one such count exists; otherwise - including for an
    empty DataFrame - 'Number of Occurences posible' is printed.

    Parameters
    ----------
    chunk : pandas.DataFrame
        DataFrame with a numeric 'numOccurrences' column.

    Returns
    -------
    None
    """
    # Test the values themselves rather than comparing two .all() reductions
    has_impossible_count = bool(chunk['numOccurrences'].le(0).any())
    if has_impossible_count:
        print('Number of Occurrences impossible')
    else:
        print('Number of Occurences posible')

# The combined DataFrame is named df_foxNY (capital "Y"); the lower-case
# spelling is not defined in this notebook and raises a NameError on a fresh kernel.
NumOccurrencesFilter(df_foxNY)

Number of Occurences posible


In [None]:
#Checking if the Highest Probability Corresponds to the Autor of the quotation
def ProbasFilter(chunk):
    """Print whether every quotation's first listed probability is at least 0.8.

    Each cell of the 'probas' column is read as a list of
    [speaker, probability] pairs, and only the first pair of each row is used.
    NOTE(review): this assumes the pairs are ordered with the most likely
    speaker first - confirm against the data documentation.

    'Probability of Autors of quotation too low' is printed when at least one
    row has a first probability below 0.8; otherwise
    'Probability of Autors higher than 80%' is printed.

    Parameters
    ----------
    chunk : pandas.DataFrame
        DataFrame with a 'probas' column.

    Returns
    -------
    None
    """
    top_probability = chunk['probas'].str[0].str[1].astype(float)
    # Compare row by row: reducing both sides with .all() first only compares
    # two booleans and never looks at the 0.8 threshold.
    if bool(top_probability.lt(0.8).any()):
        print('Probability of Autors of quotation too low')
    else:
        print('Probability of Autors higher than 80%')

# Apply the probability check to the combined DataFrame
ProbasFilter(df_foxNY)

Probability of Autors higher than 80%


In [None]:
#Checking if the Autor is the one with the highest probability of having generated the quotation
def ProbasFilter(chunk):
    """Print whether each row's speaker equals the first candidate in 'probas'.

    Each cell of the 'probas' column is read as a list of
    [speaker, probability] pairs; the name in the first pair of a row is
    compared with that row's 'speaker' value. Rows in which both values are
    missing count as matching.
    NOTE(review): this assumes the pairs are ordered with the most likely
    speaker first - confirm against the data documentation.

    NOTE: this definition has the same name as the probability-threshold check
    defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    chunk : pandas.DataFrame
        DataFrame with the columns 'speaker' and 'probas'.

    Returns
    -------
    None
    """
    speakers = chunk['speaker']
    top_candidates = chunk['probas'].str[0].str[0]

    # Compare the raw values position by position so that repeated index
    # labels cannot get in the way of the comparison.
    both_missing = speakers.isna().to_numpy() & top_candidates.isna().to_numpy()
    differs = (speakers.to_numpy() != top_candidates.to_numpy()) & ~both_missing

    if bool(differs.any()):
        print('The Autor does not correspond to the high probability Autor')
    else:
        print('The Autor has a high probability of having generated the quotation')

# Apply the speaker check; this call uses the ProbasFilter defined directly
# above, which replaces the earlier function of the same name
ProbasFilter(df_foxNY)

The Autor has a high probability of having generated the quotation


# Creation of Pickle files with only quotations for each year and newspaper

In [None]:
# Here pickle files are created for each newspaper and each year with only
# the quotations. Using such data later is better since not much space in
# the RAM is needed.
# Here it is done for Washington Post as it was the last newspaper used, but
# the same procedure was done for Fox News and New York Times.
# NOTE(review): read_yearly loads the pickles stored in the Fox_NY_<year>
# folders - confirm that those files are the intended source of the
# Washington Post rows.

years = [2015, 2016, 2017, 2018, 2019, 2020]

for y in years:
  yearly_frames = read_yearly(y)
  if not yearly_frames:
    # Nothing to concatenate or filter for this year
    print('No pickles found for {} - skipping'.format(y))
    continue

  # One pd.concat call replaces DataFrame.append, which is no longer available
  # in current pandas releases
  df_all = pd.concat(yearly_frames)
  df_all = df_all.drop_duplicates(subset=['quoteID', 'quotation', 'date', 'urls'], keep="first")

  # Keep only the quotation text of the rows whose urls mention "washington"
  df_washing = df_all.loc[df_all['urls'].str.contains("washington"), 'quotation']

  df_washing.to_pickle('/content/drive/Shareddrives/ADA/WashingtonPost/' + str(y) + '_' + 'Washington_quotations.pkl')