# Final data extraction
The data wrangling is performed while the yearly samples are created.

In [1]:
#importing the required modules
import seaborn as sns
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# List of years available and used
years = [2015, 2016, 2017, 2018, 2019, 2020]

# Size of each year's file in GB (same order as `years`)
year_size = np.array([3.3, 2.3, 5.2, 4.8, 3.6, .9])

# Number of rows in file of 2020 (computed once "by hand", which is feasible because this file is not too heavy)
rows2020 = 5244449

# Estimation of number of rows in each file: the row count is assumed to scale linearly
# with the file size, the 2020 file (last entry of `year_size`) being the reference
year_rows = year_size*rows2020/year_size[-1]

# Chunk sizes, in rows (one entry per year). They are row counts handed to the
# `chunksize` argument of the reader, hence stored as integers.
chunk_sizes = [1_000_000, 100_000, 100_000, 100_000, 100_000, 100_000]

# Estimated number of chunks needed to read each file with the chunk sizes above
chunks_number = np.rint(year_rows/np.asarray(chunk_sizes)).astype(int)

# Which means we have to take n random rows in each chunk to get a sample of 1,600,000 rows per year
rows_in_chunk = np.rint(1.6*1e6/chunks_number).astype(int)

Sanity check: the product `rows_in_chunk * chunks_number` should be close to 1,600,000 for every year.

In [3]:
# Displayed per year: rows sampled in each chunk, estimated number of chunks, and their product
rows_in_chunk, chunks_number, rows_in_chunk*chunks_number

(array([84211, 11940,  5281,  5714,  7619, 30769]),
 array([ 19, 134, 303, 280, 210,  52]),
 array([1600009, 1599960, 1600143, 1599920, 1599990, 1599988]))

In [4]:
# Column layout of the wrangled samples. The first nine names are expected to be the
# columns of the raw data files; 'p1', 'p2', 'delta_p' and 'year' are the derived
# columns added while the chunks are wrangled.
Index = [
    'quoteID',
    'quotation',
    'speaker',
    'qids',
    'date',
    'numOccurrences',
    'probas',
    'urls',
    'phase',
    'p1',
    'p2',
    'delta_p',
    'year',
]

In [5]:
# Debugging helper, handy when testing the chunked reading loops
def process_chunk(chunk, year):
    """Print how many rows `chunk` holds and the year of the file it was read from.

    Parameters
    ----------
    chunk : any sized object (anything supporting `len`)
        The chunk being processed.
    year : any printable value
        Year of the file the chunk comes from; only used in the message.
    """
    print(f'Processing chunk with {len(chunk)} rows, in file from year {year}')

In [6]:
# Thresholds a quotation must pass to be considered in the analysis:
#   - threshold_min  : minimal probability of the most likely speaker
#   - threshold_diff : minimal gap between the two most likely speakers
threshold_min = 0.5
threshold_diff = 0.3

# Seed of the random sampling, so that re-running the extraction yields the same samples
sampling_seed = 42


def _wrangle_chunk(chunk, p1_min, delta_min):
    """Add the attribution-probability columns to `chunk` and drop weak attributions.

    Parameters
    ----------
    chunk : pd.DataFrame
        Rows read from a yearly file; the columns 'probas', 'date' and 'speaker' are used.
        The probability of the k-th candidate is read as ``probas[k][1]``
        (presumably sorted by decreasing probability -- TODO confirm against the data set).
    p1_min : float
        Rows whose highest probability 'p1' is below this value are dropped.
    delta_min : float
        Rows whose gap 'delta_p' (= p1 - p2) is below this value are dropped.

    Returns
    -------
    pd.DataFrame
        A new frame with the extra columns 'p1', 'p2', 'delta_p' and 'year',
        restricted to the rows passing every criterion.
    """
    # Take only rows with at least two possible speakers. Rows without a usable
    # 'probas' entry are discarded here as well instead of failing further down.
    chunk = chunk[chunk['probas'].str.len() >= 2].copy()
    probas = chunk['probas']

    # Highest probability
    chunk['p1'] = pd.Series([p[0][1] for p in probas], index=chunk.index, dtype=object).astype(float)

    # Second highest probability
    chunk['p2'] = pd.Series([p[1][1] for p in probas], index=chunk.index, dtype=object).astype(float)

    # Difference between the two above
    chunk['delta_p'] = chunk['p1'] - chunk['p2']

    # Extracting the year of the date
    chunk['year'] = pd.DatetimeIndex(chunk['date']).year

    # Keeping only the rows that pass all the criteria
    keep = (
        (chunk['p1'] >= p1_min)
        & (chunk['delta_p'] >= delta_min)
        & (chunk['speaker'] != 'None')
    )
    return chunk[keep]


print("C'est parti, mon kiki !\n")

# A single generator shared by all chunks: the run is reproducible, yet each chunk
# still draws different rows (a fixed integer seed per call would not achieve that).
rng = np.random.RandomState(sampling_seed)

for iii, year in enumerate(years):
    # Samples are collected in a list and concatenated once per year; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows every time.
    samples = []
    chunk_size = int(chunk_sizes[iii])  # `chunksize` is a row count and must be an integer

    with pd.read_json(f'./Quotebank/quotes-{year}.json.bz2', lines=True, compression='bz2', chunksize=chunk_size) as df_reader:
        print(f"Start year {year} with chunks of size {int(chunk_sizes[iii])}")
        for chunk in df_reader:
            chunk = _wrangle_chunk(chunk, threshold_min, threshold_diff)
            # A chunk may hold fewer valid rows than requested: take what is available
            n_rows = int(min(rows_in_chunk[iii], len(chunk)))
            samples.append(chunk.sample(n=n_rows, random_state=rng))

    if samples:
        df = pd.concat(samples)
        # Column order: the names listed in Index first, any unexpected extra column afterwards
        extra_columns = [col for col in df.columns if col not in Index]
        df = df.reindex(columns=Index + extra_columns)
    else:
        # Nothing was read: still produce an (empty) file with the expected columns
        df = pd.DataFrame(columns=Index)

    df = df.drop_duplicates(subset='quoteID', keep='first')
    df = df.reset_index(drop=True)
    df.to_json(f"./Quotebank/Sample_{year}_wrangled.json.bz2", compression="bz2", lines=True, orient="records")
    print(f"Sanitiy check : len of df = {len(df)}")
    print(f"Done with year {year}")


print("\nTout est bien qui finit bien")

C'est parti, mon kiki !

Start year 2015 with chunks of size 1000000
Sanitiy check : len of df = 1768431
Done with year 2015
Start year 2016 with chunks of size 100000
Sanitiy check : len of df = 1659660
Done with year 2016
Start year 2017 with chunks of size 100000
Sanitiy check : len of df = 1410027
Done with year 2017
Start year 2018 with chunks of size 100000
Sanitiy check : len of df = 1559922
Done with year 2018
Start year 2019 with chunks of size 100000
Sanitiy check : len of df = 1660942
Done with year 2019
Start year 2020 with chunks of size 100000
Sanitiy check : len of df = 1623688
Done with year 2020

Tout est bien qui finit bien


In [2]:
# Small adjustments to default style of plots, making sure it's readable and colorblind-friendly everywhere
# The seaborn style sheets shipped with matplotlib were renamed with a 'seaborn-v0_8-' prefix
# in matplotlib 3.6 and the old names were removed afterwards: use whichever name this
# installation provides (falling back to the historical name if neither is listed).
colorblind_style = next(
    (name for name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind') if name in plt.style.available),
    'seaborn-colorblind',
)
plt.style.use(colorblind_style)
plt.rcParams.update({'font.size' : 12.5,
                     'figure.figsize':(10,7)})

In [None]:
%%time
# Year whose wrangled sample is loaded; `year` is reused by the plotting cell for the title and the file name
year = 2015
# Read back the sample written by the extraction loop (line-delimited JSON, bz2-compressed)
df = pd.read_json(f'./Quotebank/Sample_{year}_wrangled.json.bz2', lines=True, compression='bz2')


In [None]:
# Boxplot to analyze the presence of outliers in the attribution probabilities
# The probability columns are selected explicitly: handing the whole frame (quotation
# text, URL lists, dates, ...) to seaborn relies on it silently discarding every
# column it cannot plot, which depends on the seaborn version.
probability_columns = ['p1', 'p2', 'delta_p']
data_plot = df[probability_columns]

fig, ax = plt.subplots()
sns.boxplot(data=data_plot, ax=ax)
ax.set(ylabel='probabilities [-]')
ax.set_title(f'Distribution of the probability of the speaker attribution {year}')
# Rotate the existing tick labels in place instead of re-setting them
plt.setp(ax.get_xticklabels(), rotation=40, ha="right")
fig.savefig(f"{year}.svg")