In [58]:
import pandas as pd
import bz2
import json
from urllib.parse import urlparse
import re
import numpy as np
from tld import get_tld

pd.set_option('display.max_colwidth', None) #Print full text
pd.set_option('display.max_rows', 100) #Print full text

In [68]:
DATA_FOLDER = 'data/'
PROCESSED_DATA_FOLDER = 'processed_data/'

QUOTES_FILE = DATA_FOLDER + 'quotes-2020.json.bz2'
CHUNK_SIZE = 10_000

### Utility Functions

In [5]:
def extract_newspaper(url):
    
    url_pruned = urlparse(url).netloc
    tld = get_tld(url, as_object=True).tld
    url_no_tld = url_pruned.replace('.'+tld,"")
    domain = url_no_tld.split('.')[-1]
    
    return domain

### Quotes count statistics 

In [96]:
json_reader = pd.read_json(QUOTES_FILE,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

quotes_count = 0
unique_quotes_count = 0
not_NONE_quotes_count = 0
for (counter, df_chunk) in enumerate(json_reader):
    
    unique_quotes_count+= len(df_chunk)
    df_chunk['quote_counts']= df_chunk['urls'].apply(lambda urls: len(urls))
    quotes_count += df_chunk['quote_counts'].sum()
    not_NONE_quotes_count += df_chunk[df_chunk['speaker']!='None']['quote_counts'].sum() 
    
    print(f'Iteration {counter+1}')
    print(f'Unique quotes count {unique_quotes_count}')
    print(f'Quotes count {quotes_count}')
    print(f'Not NONE quotes count {not_NONE_quotes_count}')    

Iteration 1
Unique quotes count 10000
Quotes count 32970
Not NONE quotes count 21869
Iteration 2
Unique quotes count 20000
Quotes count 67029
Not NONE quotes count 44352
Iteration 3
Unique quotes count 30000
Quotes count 99716
Not NONE quotes count 65737
Iteration 4
Unique quotes count 40000
Quotes count 133042
Not NONE quotes count 87360
Iteration 5
Unique quotes count 50000
Quotes count 166398
Not NONE quotes count 109730
Iteration 6
Unique quotes count 60000
Quotes count 201008
Not NONE quotes count 131893
Iteration 7
Unique quotes count 70000
Quotes count 232867
Not NONE quotes count 152779
Iteration 8
Unique quotes count 80000
Quotes count 262311
Not NONE quotes count 171819
Iteration 9
Unique quotes count 90000
Quotes count 294956
Not NONE quotes count 194672
Iteration 10
Unique quotes count 100000
Quotes count 326159
Not NONE quotes count 215138
Iteration 11
Unique quotes count 110000
Quotes count 358695
Not NONE quotes count 236464
Iteration 12
Unique quotes count 120000
Quotes

Iteration 93
Unique quotes count 930000
Quotes count 3020328
Not NONE quotes count 1984890
Iteration 94
Unique quotes count 940000
Quotes count 3052080
Not NONE quotes count 2006481
Iteration 95
Unique quotes count 950000
Quotes count 3084852
Not NONE quotes count 2027938
Iteration 96
Unique quotes count 960000
Quotes count 3117657
Not NONE quotes count 2049308
Iteration 97
Unique quotes count 970000
Quotes count 3150044
Not NONE quotes count 2070314
Iteration 98
Unique quotes count 980000
Quotes count 3182374
Not NONE quotes count 2091004
Iteration 99
Unique quotes count 990000
Quotes count 3217409
Not NONE quotes count 2114362
Iteration 100
Unique quotes count 1000000
Quotes count 3251315
Not NONE quotes count 2138377
Iteration 101
Unique quotes count 1010000
Quotes count 3284549
Not NONE quotes count 2160409
Iteration 102
Unique quotes count 1020000
Quotes count 3316111
Not NONE quotes count 2181250
Iteration 103
Unique quotes count 1030000
Quotes count 3350817
Not NONE quotes count

Iteration 182
Unique quotes count 1820000
Quotes count 5899713
Not NONE quotes count 3875343
Iteration 183
Unique quotes count 1830000
Quotes count 5932024
Not NONE quotes count 3896389
Iteration 184
Unique quotes count 1840000
Quotes count 5965044
Not NONE quotes count 3918286
Iteration 185
Unique quotes count 1850000
Quotes count 5998091
Not NONE quotes count 3940070
Iteration 186
Unique quotes count 1860000
Quotes count 6030646
Not NONE quotes count 3961756
Iteration 187
Unique quotes count 1870000
Quotes count 6065971
Not NONE quotes count 3984115
Iteration 188
Unique quotes count 1880000
Quotes count 6096211
Not NONE quotes count 4003900
Iteration 189
Unique quotes count 1890000
Quotes count 6129226
Not NONE quotes count 4026834
Iteration 190
Unique quotes count 1900000
Quotes count 6162261
Not NONE quotes count 4048608
Iteration 191
Unique quotes count 1910000
Quotes count 6192923
Not NONE quotes count 4068390
Iteration 192
Unique quotes count 1920000
Quotes count 6224625
Not NON

Iteration 271
Unique quotes count 2710000
Quotes count 8809677
Not NONE quotes count 5794702
Iteration 272
Unique quotes count 2720000
Quotes count 8840672
Not NONE quotes count 5814923
Iteration 273
Unique quotes count 2730000
Quotes count 8875575
Not NONE quotes count 5837149
Iteration 274
Unique quotes count 2740000
Quotes count 8907205
Not NONE quotes count 5858746
Iteration 275
Unique quotes count 2750000
Quotes count 8939698
Not NONE quotes count 5879563
Iteration 276
Unique quotes count 2760000
Quotes count 8972805
Not NONE quotes count 5900741
Iteration 277
Unique quotes count 2770000
Quotes count 9004628
Not NONE quotes count 5920791
Iteration 278
Unique quotes count 2780000
Quotes count 9037857
Not NONE quotes count 5942685
Iteration 279
Unique quotes count 2790000
Quotes count 9069325
Not NONE quotes count 5963368
Iteration 280
Unique quotes count 2800000
Quotes count 9101131
Not NONE quotes count 5984456
Iteration 281
Unique quotes count 2810000
Quotes count 9136580
Not NON

Iteration 359
Unique quotes count 3590000
Quotes count 11688635
Not NONE quotes count 7677635
Iteration 360
Unique quotes count 3600000
Quotes count 11722192
Not NONE quotes count 7699098
Iteration 361
Unique quotes count 3610000
Quotes count 11752550
Not NONE quotes count 7719436
Iteration 362
Unique quotes count 3620000
Quotes count 11786245
Not NONE quotes count 7742190
Iteration 363
Unique quotes count 3630000
Quotes count 11817419
Not NONE quotes count 7762812
Iteration 364
Unique quotes count 3640000
Quotes count 11849511
Not NONE quotes count 7784458
Iteration 365
Unique quotes count 3650000
Quotes count 11884781
Not NONE quotes count 7806389
Iteration 366
Unique quotes count 3660000
Quotes count 11917476
Not NONE quotes count 7827026
Iteration 367
Unique quotes count 3670000
Quotes count 11949752
Not NONE quotes count 7849315
Iteration 368
Unique quotes count 3680000
Quotes count 11983179
Not NONE quotes count 7871515
Iteration 369
Unique quotes count 3690000
Quotes count 12015

Iteration 447
Unique quotes count 4470000
Quotes count 14552805
Not NONE quotes count 9553670
Iteration 448
Unique quotes count 4480000
Quotes count 14583499
Not NONE quotes count 9573815
Iteration 449
Unique quotes count 4490000
Quotes count 14615372
Not NONE quotes count 9594664
Iteration 450
Unique quotes count 4500000
Quotes count 14645709
Not NONE quotes count 9614575
Iteration 451
Unique quotes count 4510000
Quotes count 14681186
Not NONE quotes count 9637252
Iteration 452
Unique quotes count 4520000
Quotes count 14712593
Not NONE quotes count 9658003
Iteration 453
Unique quotes count 4530000
Quotes count 14743858
Not NONE quotes count 9677994
Iteration 454
Unique quotes count 4540000
Quotes count 14777572
Not NONE quotes count 9699707
Iteration 455
Unique quotes count 4550000
Quotes count 14809979
Not NONE quotes count 9721793
Iteration 456
Unique quotes count 4560000
Quotes count 14841536
Not NONE quotes count 9742579
Iteration 457
Unique quotes count 4570000
Quotes count 14873

In [97]:
print(f'Number of unique quotes {unique_quotes_count}')

#We are not using num_occurences but rather the number of urls to quantify the number of quotes because
# around 3% of quotes appear multiple times in the same article which increases numOccurences but is not 
#relevant in this study
print(f'Quotes appearance count {quotes_count}')
print(f'Quotes appearance count where speaker is not None {not_NONE_quotes_count}')

Number of unique quotes 5244449
Quotes appearance count 17057653
Quotes appearance count where speaker is not None 11200295


### Create speaker-newspaper dataframe
df['newspaper','speaker','proba'] with possible duplicates if speaker was cited multiple times by the newspaper.
Quotes with the most probable speaker cited as None are kept and removed furhter down the pipeline if necessary.

In [159]:
FILE_SPEAKERS = PROCESSED_DATA_FOLDER +"processed-speaker-2020.csv.bz2"

json_reader = pd.read_json(QUOTES_FILE,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

for (counter, df_chunk) in enumerate(json_reader):

    print(f"Process chunk {counter+1}")
    
    df_chunk = df_chunk[["probas","urls","speaker"]]
    
    df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))
    
    df_chunk = df_chunk.explode("urls")
    df_chunk["newspaper"] = df_chunk["urls"].apply(extract_newspaper)
    
    #df_speakers = df_chunk[df_chunk["speaker"] != "None"][["newspaper", "speaker","proba"]]
    #We keep None values and remove it further down the pipeline
    df_speakers = df_chunk[["newspaper", "speaker","proba"]]
    
    add_header = (counter==0)
    write_mode = 'w' if counter == 0 else 'a'
    df_speakers.to_csv(FILE_SPEAKERS,header = add_header,index=False, mode=write_mode,compression='bz2')

Process chunk 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))


Process chunk 2
Process chunk 3
Process chunk 4
Process chunk 5
Process chunk 6
Process chunk 7
Process chunk 8
Process chunk 9
Process chunk 10
Process chunk 11
Process chunk 12
Process chunk 13
Process chunk 14
Process chunk 15
Process chunk 16
Process chunk 17
Process chunk 18
Process chunk 19
Process chunk 20
Process chunk 21
Process chunk 22
Process chunk 23
Process chunk 24
Process chunk 25
Process chunk 26
Process chunk 27
Process chunk 28
Process chunk 29
Process chunk 30
Process chunk 31
Process chunk 32
Process chunk 33
Process chunk 34
Process chunk 35
Process chunk 36
Process chunk 37
Process chunk 38
Process chunk 39
Process chunk 40
Process chunk 41
Process chunk 42
Process chunk 43
Process chunk 44
Process chunk 45
Process chunk 46
Process chunk 47
Process chunk 48
Process chunk 49
Process chunk 50
Process chunk 51
Process chunk 52
Process chunk 53
Process chunk 54
Process chunk 55
Process chunk 56
Process chunk 57
Process chunk 58
Process chunk 59
Process chunk 60
Proce

Process chunk 464
Process chunk 465
Process chunk 466
Process chunk 467
Process chunk 468
Process chunk 469
Process chunk 470
Process chunk 471
Process chunk 472
Process chunk 473
Process chunk 474
Process chunk 475
Process chunk 476
Process chunk 477
Process chunk 478
Process chunk 479
Process chunk 480
Process chunk 481
Process chunk 482
Process chunk 483
Process chunk 484
Process chunk 485
Process chunk 486
Process chunk 487
Process chunk 488
Process chunk 489
Process chunk 490
Process chunk 491
Process chunk 492
Process chunk 493
Process chunk 494
Process chunk 495
Process chunk 496
Process chunk 497
Process chunk 498
Process chunk 499
Process chunk 500
Process chunk 501
Process chunk 502
Process chunk 503
Process chunk 504
Process chunk 505
Process chunk 506
Process chunk 507
Process chunk 508
Process chunk 509
Process chunk 510
Process chunk 511
Process chunk 512
Process chunk 513
Process chunk 514
Process chunk 515
Process chunk 516
Process chunk 517
Process chunk 518
Process ch

### For each speaker, compute the number of times it appeared in the articles
For now, the certainty probability of a quote's speaker being correct is not taken into account and every (newspaper,speaker) pair is kept.

In [160]:
NEWSPAPER_SPEAKER_COUNT_FILE = PROCESSED_DATA_FOLDER+"speaker-newspaper-2020.csv.bz2"

In [161]:
df = pd.read_csv(FILE_SPEAKERS,compression='bz2')
newspaper_speaker_count = df.groupby(['newspaper','speaker'],as_index=False).count()\
                            .rename(columns = {'proba':'count'})

In [162]:
newspaper_speaker_count

Unnamed: 0,newspaper,speaker,count
0,1011now,Adam Morfeld,4
1,1011now,Adam Schiff,2
2,1011now,Adrian Smith,2
3,1011now,Alexandra Brown,1
4,1011now,Alfonso Morales,1
...,...,...,...
2446066,zigwheels,Srinivas Reddy,1
2446067,zigwheels,Toby Price,1
2446068,zigwheels,Yash Aradhya,1
2446069,zip06,Mike Caruso,16


In [163]:
newspaper_speaker_count.to_csv(NEWSPAPER_SPEAKER_COUNT_FILE,header = True,index=False, mode='w',compression='bz2')

### Compute overall statistic on newspapers and speakers
Important no threshold has been applied on speaker

In [164]:
df = pd.read_csv(NEWSPAPER_SPEAKER_COUNT_FILE,compression='bz2')

In [165]:
df.columns

Index(['newspaper', 'speaker', 'count'], dtype='object')

In [166]:
print(f'They are {len(df)} pairs of (newspapers,speakers)')
newspapers = set(newspaper_speaker_count['newspaper'])
print(f'They are {len(newspapers)} unique newspapers')
speakers = set(newspaper_speaker_count['speaker'])
print(f'They are {len(speakers)} unique speakers')

They are 2446071 pairs of (newspapers,speakers)
They are 7362 unique newspapers
They are 218414 unique speakers


In [167]:
speaker_unique_occurence = df[['newspaper','speaker']].groupby('speaker',as_index=False).count()\
                    .rename(columns = {'newspaper':'unique_newspaper_quotes'})


In [168]:
speaker_unique_occurence.nlargest(100,columns=['unique_newspaper_quotes'])

Unnamed: 0,speaker,unique_newspaper_quotes
153120,,6999
164974,President Donald Trump,2822
164984,President Trump,2030
198999,Tedros Adhanom Ghebreyesus,1954
25136,Boris Johnson,1890
21523,Bernie Sanders,1813
14907,Anthony Fauci,1755
10955,Andrew Cuomo,1707
96844,Joe Biden,1681
148530,Nancy Pelosi,1665


In [169]:
speaker_unique_occurence.describe()

Unnamed: 0,unique_newspaper_quotes
count,218414.0
mean,11.199241
std,43.472728
min,1.0
25%,1.0
50%,2.0
75%,6.0
max,6999.0


In [170]:
speaker_total_occurence = df.groupby('speaker',as_index=False).aggregate({'count':'sum'})\
                                .rename(columns = {'count':'total_newspaper_quotes'})

In [171]:
speaker_total_occurence.nlargest(100,columns=['total_newspaper_quotes'])

Unnamed: 0,speaker,total_newspaper_quotes
153120,,5857313
164974,President Donald Trump,259733
25136,Boris Johnson,86130
96844,Joe Biden,84389
21523,Bernie Sanders,69519
198999,Tedros Adhanom Ghebreyesus,67662
184844,Scott Morrison,59476
10955,Andrew Cuomo,53986
164984,President Trump,53382
14907,Anthony Fauci,49207


In [172]:
speaker_total_occurence.describe()

Unnamed: 0,total_newspaper_quotes
count,218414.0
mean,78.09723
std,12558.78
min,1.0
25%,2.0
50%,6.0
75%,19.0
max,5857313.0


In [173]:
newspaper_speaker_count = df.groupby('newspaper',as_index=False).aggregate({'speaker':'count'})\
                            .rename(columns = {'speaker':'speaker_count'})

In [174]:
newspaper_speaker_count.nlargest(100,columns=['speaker_count'])

Unnamed: 0,newspaper,speaker_count
4055,msn,16656
784,breitbart,9485
6872,washingtontimes,9271
5439,sfgate,8896
2954,indiatimes,7340
4518,nytimes,7299
5405,seattletimes,6982
2938,independent,6537
5686,stamfordadvocate,6454
890,businessinsider,6440


In [175]:
newspaper_speaker_count.describe()

Unnamed: 0,speaker_count
count,7362.0
mean,332.256316
std,803.901198
min,1.0
25%,11.0
50%,53.0
75%,218.75
max,16656.0


In [176]:
newspaper_quote_count = df.groupby('newspaper',as_index=False).aggregate({'count':'sum'})\
                        .rename(columns={'count':'quote_count'})

In [177]:
newspaper_quote_count.nlargest(100,columns=['quote_count'])

Unnamed: 0,newspaper,quote_count
1890,einnews,267567
4055,msn,240577
4289,news965,127998
7152,wokv,119616
890,businessinsider,102126
784,breitbart,98428
6872,washingtontimes,81833
7222,wsbradio,77257
2954,indiatimes,75690
4212,nbcsports,72913


In [178]:
newspaper_quote_count.describe()

Unnamed: 0,quote_count
count,7362.0
mean,2316.969302
std,7791.179536
min,1.0
25%,34.0
50%,221.5
75%,1117.5
max,267567.0


#### Other stats:
Repartition of speaker in newspaper (eg: 50% trump, ...)  
Show and then tf-idf

### Thresholding decisions

If we keep everything, the newspaper-speaker_count matrix will be of size:  
newspapers_count x speaker_count = 7148 x 218 413 = 1 561 216 124 which is huge even though the obtained matrix will be sparse.  
We must decide which newspaper to keep  
We must decide which speakers to keep  
We decide probability threshold for a speaker to be correctly attributed to a quote.  