In [67]:
import pandas as pd
import numpy as np
import bz2
import json
import re
import Constants

from urllib.parse import urlparse
from tld import get_tld
from Helper import extract_domain

pd.set_option('display.max_colwidth', None) #Print full text
pd.set_option('display.max_rows', 200) #Print full text

CHUNK_SIZE = 10_000

### Compute basic files  statistics

In [None]:
json_reader = pd.read_json(FILE_QUOTES,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

quotes_count = 0
unique_quotes_count = 0
not_NONE_quotes_count = 0
for (counter, df_chunk) in enumerate(json_reader):
    
    unique_quotes_count+= len(df_chunk)
    df_chunk['quote_counts']= df_chunk['urls'].apply(lambda urls: len(urls))
    quotes_count += df_chunk['quote_counts'].sum()
    not_NONE_quotes_count += df_chunk[df_chunk['speaker']!='None']['quote_counts'].sum() 

In [97]:
print(f'Number of unique quotes {unique_quotes_count}')

#We are not using num_occurences but rather the number of urls to quantify the number of quotes because
# around 3% of quotes appear multiple times in the same article which increases numOccurences but is not 
#relevant in this study
print(f'Quotes appearance count {quotes_count}')
print(f'Quotes appearance count where speaker is not None {not_NONE_quotes_count}')

Number of unique quotes 5244449
Quotes appearance count 17057653
Quotes appearance count where speaker is not None 11200295


### Create speaker-newspaper dataframe
df['newspaper','speaker','proba'] with possible duplicates if speaker was cited multiple times by the newspaper.
Quotes with the most probable speaker cited as None are kept and removed furhter down the pipeline if necessary.

In [159]:
json_reader = pd.read_json(FILE_QUOTES,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

for (counter, df_chunk) in enumerate(json_reader):

    print(f"Process chunk {counter+1}")
    
    df_chunk = df_chunk[["probas","urls","speaker"]]
    
    df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))
    
    df_chunk = df_chunk.explode("urls")
    df_chunk["newspaper"] = df_chunk["urls"].apply(extract_domain)
    
    #We keep None values and remove it further down the pipeline if needed
    df_speakers = df_chunk[["newspaper", "speaker","proba"]]
    
    add_header = (counter==0)
    write_mode = 'w' if counter == 0 else 'a'
    df_speakers.to_csv(NEWSPAPER_SPEAKERS_FILE,header = add_header,index=False, mode=write_mode,compression='bz2')

Process chunk 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))


Process chunk 2
Process chunk 3
Process chunk 4
Process chunk 5
Process chunk 6
Process chunk 7
Process chunk 8
Process chunk 9
Process chunk 10
Process chunk 11
Process chunk 12
Process chunk 13
Process chunk 14
Process chunk 15
Process chunk 16
Process chunk 17
Process chunk 18
Process chunk 19
Process chunk 20
Process chunk 21
Process chunk 22
Process chunk 23
Process chunk 24
Process chunk 25
Process chunk 26
Process chunk 27
Process chunk 28
Process chunk 29
Process chunk 30
Process chunk 31
Process chunk 32
Process chunk 33
Process chunk 34
Process chunk 35
Process chunk 36
Process chunk 37
Process chunk 38
Process chunk 39
Process chunk 40
Process chunk 41
Process chunk 42
Process chunk 43
Process chunk 44
Process chunk 45
Process chunk 46
Process chunk 47
Process chunk 48
Process chunk 49
Process chunk 50
Process chunk 51
Process chunk 52
Process chunk 53
Process chunk 54
Process chunk 55
Process chunk 56
Process chunk 57
Process chunk 58
Process chunk 59
Process chunk 60
Proce

Process chunk 464
Process chunk 465
Process chunk 466
Process chunk 467
Process chunk 468
Process chunk 469
Process chunk 470
Process chunk 471
Process chunk 472
Process chunk 473
Process chunk 474
Process chunk 475
Process chunk 476
Process chunk 477
Process chunk 478
Process chunk 479
Process chunk 480
Process chunk 481
Process chunk 482
Process chunk 483
Process chunk 484
Process chunk 485
Process chunk 486
Process chunk 487
Process chunk 488
Process chunk 489
Process chunk 490
Process chunk 491
Process chunk 492
Process chunk 493
Process chunk 494
Process chunk 495
Process chunk 496
Process chunk 497
Process chunk 498
Process chunk 499
Process chunk 500
Process chunk 501
Process chunk 502
Process chunk 503
Process chunk 504
Process chunk 505
Process chunk 506
Process chunk 507
Process chunk 508
Process chunk 509
Process chunk 510
Process chunk 511
Process chunk 512
Process chunk 513
Process chunk 514
Process chunk 515
Process chunk 516
Process chunk 517
Process chunk 518
Process ch

### For each speaker preprocess its name and compute the number of times it appeared in the articles
For now, the certainty probability of a quote's speaker being correct is not taken into account and every (newspaper,speaker) pair is kept.

In [46]:
def process_speaker(speaker):
    #Lower case the names and remove all non alpha numeric characters
    new_name = speaker.lower()
    new_name = re.sub(r'[^A-Za-z0-9 ]+', '', new_name)
    return new_name

In [48]:
#The file is small enough to be loaded in memory
df = pd.read_csv(FILE_NEWSPAPER_SPEAKER,compression='bz2')
previous_speaker_count = len(df['speaker'].unique())

In [49]:
df['speaker'] = df['speaker'].apply(process_speaker)

In [56]:
processed_speaker_count = len(df['speaker'].unique())

print(f'The previous unique speaker count was {previous_speaker_count} and the new unique speaker count is {processed_speaker_count}')
print(f'This correspond to a {(previous_speaker_count-processed_speaker_count)/previous_speaker_count:.2%} reduction')

The previous unique speaker count was 218415 and the new unique speaker count is 212147
This correspond to a 2.87% reduction


In [57]:
newspaper_speaker_count = df.groupby(['newspaper','speaker'],as_index=False).count()\
                            .rename(columns = {'proba':'count'})

In [58]:
newspaper_speaker_count

Unnamed: 0,newspaper,speaker,count
0,1011now,adam morfeld,4
1,1011now,adam schiff,2
2,1011now,adrian smith,2
3,1011now,alexandra brown,1
4,1011now,alfonso morales,1
...,...,...,...
2434405,zigwheels,srinivas reddy,1
2434406,zigwheels,toby price,1
2434407,zigwheels,yash aradhya,1
2434408,zip06,mike caruso,16


In [61]:
newspaper_speaker_count.to_csv(FILE_NEWSPAPER_SPEAKER_COUNT,header = True,index=False, mode='w',compression='bz2')