In [1]:
from Constants import *

import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None) #Print full text
pd.set_option('display.max_rows', 200) #Print full text

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
en_stopwords = set(stopwords.words("english"))
en_stopwords.update([s.capitalize() for s in stopwords.words("english")])

import bz2
import json
import re
import string
import ast

from urllib.parse import urlparse
from tld import get_tld

from collections import Counter
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from scipy import sparse
from scipy.sparse import csr_matrix, dok_matrix

import pickle

In [2]:
def extract_domain(url):
    """
    Extract domain of an url
    """
    url_pruned = urlparse(url).netloc
    tld = get_tld(url, as_object=True).tld
    url_no_tld = url_pruned.replace('.'+tld,"")
    domain = url_no_tld.split('.')[-1]

    return domain

## Compute basic statistics about the quotes file

In [3]:
CHUNK_SIZE = 10_000

json_reader = pd.read_json(FILE_QUOTES,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

quotes_count = 0
unique_quotes_count = 0
not_NONE_quotes_count = 0
for (counter, df_chunk) in enumerate(json_reader):
    
    unique_quotes_count+= len(df_chunk)
    df_chunk['quote_counts']= df_chunk['urls'].apply(lambda urls: len(urls))
    quotes_count += df_chunk['quote_counts'].sum()
    not_NONE_quotes_count += df_chunk[df_chunk['speaker']!='None']['quote_counts'].sum() 
    
print(f'Number of unique quotes {unique_quotes_count}')

#We are not using num_occurences but rather the number of urls to quantify the number of quotes because
# around 3% of quotes appear multiple times in the same article which increases numOccurences but is not 
#relevant in this study
print(f'Quotes appearance count {quotes_count}')
print(f'Quotes appearance count where speaker is not None {not_NONE_quotes_count}')

Number of unique quotes 5244449
Quotes appearance count 17057653
Quotes appearance count where speaker is not None 11200295


## Speaker pipeline
From the quote file we create a TFIDF matrix of newspaper x speaker.
1) Create speaker-newspaper dataframe  
2) For each speaker preprocess its name and compute the number of times it appeared in the articles  
3) Create newspaper set and indexes  
4) Create speaker set and indexes  
5) Create frequency matrix  
6) Create TF-IDF matrix and write it to a file  

### 1) Create speaker-newspaper dataframe
df['newspaper','speaker','proba'] with possible duplicates if speaker was cited multiple times by the newspaper.
Quotes with the most probable speaker cited as None are kept and removed further down the pipeline if necessary.

In [4]:
json_reader = pd.read_json(FILE_QUOTES,lines=True,chunksize=CHUNK_SIZE,compression='bz2') 

for (counter, df_chunk) in enumerate(json_reader):

    print(f"Process chunk {counter+1}")
    
    df_chunk = df_chunk[["probas","urls","speaker"]]
    
    df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))
    
    df_chunk = df_chunk.explode("urls")
    df_chunk["newspaper"] = df_chunk["urls"].apply(extract_domain)
    
    #We keep None values and remove it further down the pipeline if needed
    df_speakers = df_chunk[["newspaper", "speaker","proba"]]
    
    add_header = (counter==0)
    write_mode = 'w' if counter == 0 else 'a'
    df_speakers.to_csv(FILE_NEWSPAPER_SPEAKER,header = add_header,index=False, mode=write_mode,compression='bz2')

Process chunk 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chunk["proba"] = df_chunk["probas"].apply(lambda probas: float(probas[0][1]))


Process chunk 2
Process chunk 3
Process chunk 4
Process chunk 5
Process chunk 6
Process chunk 7
Process chunk 8
Process chunk 9
Process chunk 10
Process chunk 11
Process chunk 12
Process chunk 13
Process chunk 14
Process chunk 15
Process chunk 16
Process chunk 17
Process chunk 18
Process chunk 19
Process chunk 20
Process chunk 21
Process chunk 22
Process chunk 23
Process chunk 24
Process chunk 25
Process chunk 26
Process chunk 27
Process chunk 28
Process chunk 29
Process chunk 30
Process chunk 31
Process chunk 32
Process chunk 33
Process chunk 34
Process chunk 35
Process chunk 36
Process chunk 37
Process chunk 38
Process chunk 39
Process chunk 40
Process chunk 41
Process chunk 42
Process chunk 43
Process chunk 44
Process chunk 45
Process chunk 46
Process chunk 47
Process chunk 48
Process chunk 49
Process chunk 50
Process chunk 51
Process chunk 52
Process chunk 53
Process chunk 54
Process chunk 55
Process chunk 56
Process chunk 57
Process chunk 58
Process chunk 59
Process chunk 60
Proce

Process chunk 464
Process chunk 465
Process chunk 466
Process chunk 467
Process chunk 468
Process chunk 469
Process chunk 470
Process chunk 471
Process chunk 472
Process chunk 473
Process chunk 474
Process chunk 475
Process chunk 476
Process chunk 477
Process chunk 478
Process chunk 479
Process chunk 480
Process chunk 481
Process chunk 482
Process chunk 483
Process chunk 484
Process chunk 485
Process chunk 486
Process chunk 487
Process chunk 488
Process chunk 489
Process chunk 490
Process chunk 491
Process chunk 492
Process chunk 493
Process chunk 494
Process chunk 495
Process chunk 496
Process chunk 497
Process chunk 498
Process chunk 499
Process chunk 500
Process chunk 501
Process chunk 502
Process chunk 503
Process chunk 504
Process chunk 505
Process chunk 506
Process chunk 507
Process chunk 508
Process chunk 509
Process chunk 510
Process chunk 511
Process chunk 512
Process chunk 513
Process chunk 514
Process chunk 515
Process chunk 516
Process chunk 517
Process chunk 518
Process ch

### 2) For each speaker preprocess its name and compute the number of times it appeared in the articles
For now, the certainty probability of a quote's speaker being correct is not taken into account and every (newspaper,speaker) pair is kept.

In [5]:
def process_speaker(speaker):
    #Lower case the names and remove all non alpha numeric characters
    new_name = speaker.lower()
    new_name = re.sub(r'[^A-Za-z0-9 ]+', '', new_name)
    return new_name

In [6]:
#The file is small enough to be loaded in memory
df = pd.read_csv(FILE_NEWSPAPER_SPEAKER,compression='bz2')

previous_speaker_count = len(df['speaker'].unique())
df['speaker'] = df['speaker'].apply(process_speaker)
processed_speaker_count = len(df['speaker'].unique())

print(f'The previous unique speaker count was {previous_speaker_count} and the new unique speaker count is {processed_speaker_count}')
print(f'This correspond to a {(previous_speaker_count-processed_speaker_count)/previous_speaker_count:.2%} reduction')

#Save to memory
df.to_csv(FILE_NEWSPAPER_SPEAKER,header = True,index=False, mode='w',compression='bz2')

The previous unique speaker count was 218415 and the new unique speaker count is 212147
This correspond to a 2.87% reduction


In [7]:
newspaper_speaker_count = df.groupby(['newspaper','speaker'],as_index=False).count()\
                            .rename(columns = {'proba':'count'})
newspaper_speaker_count.to_csv(FILE_NEWSPAPER_SPEAKER_COUNT,header = True,index=False, mode='w',compression='bz2')
newspaper_speaker_count

Unnamed: 0,newspaper,speaker,count
0,1011now,adam morfeld,4
1,1011now,adam schiff,2
2,1011now,adrian smith,2
3,1011now,alexandra brown,1
4,1011now,alfonso morales,1
...,...,...,...
2434405,zigwheels,srinivas reddy,1
2434406,zigwheels,toby price,1
2434407,zigwheels,yash aradhya,1
2434408,zip06,mike caruso,16


### 3) Create newspaper set and indexes
We create the sorted set of newspapers and the indexes to link the matrix entries to newspapers.

In [8]:
newspapers = set(newspaper_speaker_count.newspaper)
newspapers = sorted(list(newspapers))

# Newspaper name => row index
newspaper_to_index = {s:i for i,s in enumerate(newspapers)}
with open(PICKLE_NEWSPAPER_TO_INDEX, 'wb') as handle:
    pickle.dump(newspaper_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# Row index => newspaper name
index_to_newspaper = {i:s for i,s in enumerate(newspapers)}
with open(PICKLE_INDEX_TO_NEWSPAPER, 'wb') as handle:
    pickle.dump(index_to_newspaper, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 4) Create speaker set and indexes
We create the sorted set of speakers and the indexes to link the matrix entries to speakers.

In [9]:
speakers = set(newspaper_speaker_count.speaker)
speakers.remove("none")
speakers = sorted(list(speakers))

speaker_to_index = {s:i for i,s in enumerate(speakers)}
index_to_speaker = {i:s for i,s in enumerate(speakers)}

    
with open(PICKLE_SPEAKER_TO_INDEX, 'wb') as handle:
    pickle.dump(speaker_to_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(PICKLE_INDEX_TO_SPEAKER, 'wb') as handle:
    pickle.dump(index_to_speaker, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 5) Create frequency matrix
We generate a frequency < newspaper x speaker > matrix: entry (i,j) is the number of time speaker j appears in newspaper i.

In [10]:
newspaper_speaker_frequency_matrix = dok_matrix((len(newspapers), len(speakers)), dtype=np.uint32)

df_newspaper_speaker_count_not_none = newspaper_speaker_count[newspaper_speaker_count['speaker']!='none']
for index, row in df_newspaper_speaker_count_not_none.iterrows():
    if index % 10000==0:
        print(index)
    index_speaker = speaker_to_index[row['speaker']]
    index_newspaper = newspaper_to_index[row['newspaper']]
    newspaper_speaker_frequency_matrix[index_newspaper, index_speaker] = row["count"]
    
newspaper_speaker_frequency_csr = newspaper_speaker_frequency_matrix.tocsr()

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000
1080000
1090000
1100000
1110000
1120000
1130000
1140000
1150000
1160000
1170000
1180000
1190000
1200000
1210000
1220000
1230000
1240000
1250000
1260000
1270000
1280000
1290000
1300000
1310000
1320000
1330000
1340000
1350000
1360000
1370000
1380000
13

### 6) Create TF-IDF matrix and write it to a file
Transform the frequency matrix into a TF-IDF matrix. Each row is normalised and each column is scaled by proportionnaly to the number of newspaper in which the speaker is quoted.

In [11]:
transformer = TfidfTransformer()
newspaper_speaker_tfidf = transformer.fit_transform(newspaper_speaker_frequency_csr)
sparse.save_npz(FILE_NEWSPAPER_SPEAKER_TFIDF, newspaper_speaker_tfidf)

## Quotes pipeline
We create a < newspaper, token > TFIDF matrix from the quotes.  
1) Extract newspaper and token from the quotes    
2) Create token vocabulary and indexes  
3) Create frequency matrix  
4) Create TF-IDF matrix and write it to a file  

### 1) Extract newspapers and tokens from the quotes 

We read all the quotes, preprocess them, create token, write a file <newspaper: tokens> (one line per quote).

In [12]:
def preprocess(quote):
    """
    Lower first letter of the sentence
    Remove number
    Tokenize
    Remove stop words and words of len = 1
    Lemmatize
    Remove stop words and words of len = 1 (again)
    """
    
    # Lower first word of the sentence
    lower_first_word = lambda tab: " ".join([tab[0].lower()] + tab[1:])
    quote = " ".join([lower_first_word(sentence.split(" ")) for sentence in quote.split(".")])
    
    #Remove Numbers
    quote = re.sub(r'\d+', '', quote) 

    # Tokenize
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(quote)
            
    remove_stop_words = lambda wt: [w for w in wt if not w in en_stopwords and len(w) > 1]
    # Remove stop words and single letters
    word_tokens = remove_stop_words(word_tokens)

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]
    
    # Remove stop words and single letters
    word_tokens = remove_stop_words(word_tokens)
        
    return word_tokens

In [13]:
json_reader = pd.read_json(FILE_QUOTES,lines=True,chunksize=10_000,compression='bz2') 

for (counter, df_chunk) in enumerate(json_reader):
    print(f"Chunk: {counter}")
    
    # Preprocess all quotes
    df_chunk["tokens"] = df_chunk["quotation"].apply(preprocess)
    
    # Transform url to newpaper acronyme
    df_chunk["newspapers"] = df_chunk["urls"].apply(lambda ln: [extract_domain(n) for n in ln])
    
    # Row = [newspaper list] [preprocessed quotation]
    df_quotes = df_chunk[["newspapers", "tokens"]]

    # Write result chunk by chunk in a csv compressed file
    add_header = (counter==0)
    write_mode = 'w' if counter == 0 else 'a'
    df_quotes.to_csv(FILE_NEWSPAPER_TOKEN,header = add_header,index=False, mode=write_mode,compression='bz2')

Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk: 98
Chunk: 99
Chunk: 100

### 2) Create token vocabulary and indexes
We create the sorted set of tokens and the indexes to link the matrix entries to tokens.
We iterate over all the quotes to create the vocabulary

In [14]:
csv_reader = pd.read_csv(FILE_NEWSPAPER_TOKEN, chunksize=10_000,compression='bz2', converters={"newspapers": ast.literal_eval,"tokens":ast.literal_eval}) 

#Create a Counter of all tokens
vocabulary = Counter()

for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")
    # Count all tokens in the chunk (multiply a token by the number of newspaper quoting the quote)
    vocabulary = vocabulary +  Counter(chain.from_iterable(df_chunk.explode("newspapers")["tokens"]))

# Removing token that appears in only one newspaper
processed_voc = list(np.array(list(vocabulary.keys()))[np.array(list(vocabulary.values()))!=1])

Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk: 98
Chunk: 99
Chunk: 100

In [15]:
sorted_voc = sorted(processed_voc)

token_to_index = {n:i for i, n in enumerate(sorted_voc)}
index_to_token = {i:n for i, n in enumerate(sorted_voc)}

with open(PICKLE_TOKEN_TO_INDEX, 'wb') as handle:
    pickle.dump(token_to_index,handle, protocol=pickle.HIGHEST_PROTOCOL)
    
with open(PICKLE_INDEX_TO_TOKEN, 'wb') as handle:
    pickle.dump(index_to_token, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 3) Create frequency matrix
We generate a frequency matrix #newspaper x #tokens: entry (i,j) is the number of time token j appears in newspaper i.

In [16]:
csv_reader = pd.read_csv(FILE_NEWSPAPER_TOKEN, chunksize=10_000,compression='bz2', converters={"newspapers": ast.literal_eval,"tokens":ast.literal_eval}) 

def dummy(doc):
    return doc
vectorizer = CountVectorizer(tokenizer=dummy,preprocessor=dummy, vocabulary=sorted_voc) 

# Create DataFrame
df = pd.DataFrame({'newspapers':newspapers})

for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")
    
    df_exploded = df_chunk.explode("newspapers")
    
    # Create dataframe with all the tokens per newspaper
    df_grouped = df_exploded.groupby("newspapers")["tokens"].apply(sum).reset_index() 
    
    # Join previous dataframe with a dumb dataframe containing all the newspaper as index
    # => Add empty newspaper, allow to create frequency matrix with the correct index for newspaper
    df_join = df.set_index('newspapers').join(df_grouped.set_index('newspapers'))
    df_join["tokens"] = np.where(df_join["tokens"].isna(), [""], df_join["tokens"])
    
    # Create token frequency vector by newspaper
    X = vectorizer.fit_transform(df_join["tokens"])
    
    # Sum all the token x newspaper frequency matrix
    if(counter == 0):
        newspaper_token_frequency = X
    else:
        newspaper_token_frequency += X

Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk: 98
Chunk: 99
Chunk: 100

### 4) Create TF-IDF matrix and write it to a file
Transform the frequency matrix into a TF-IDF matrix. Each row is normalised and each column is scaled by proportionnaly to the number of newspaper in which the speaker is quoted.

In [17]:
transformer = TfidfTransformer()
newspaper_token_tfidf = transformer.fit_transform(newspaper_token_frequency)
sparse.save_npz(FILE_NEWSPAPER_TOKEN_TFIDF, newspaper_token_tfidf)

## Stats about the processed data

In [20]:
df = newspaper_speaker_count

print(f'They are {len(df)} pairs of (newspapers,speakers)')
newspapers = set(df['newspaper'])
print(f'They are {len(newspapers)} unique newspapers')
speakers = set(df['speaker'])
print(f'They are {len(speakers)} unique speakers')

They are 2434410 pairs of (newspapers,speakers)
They are 7362 unique newspapers
They are 212146 unique speakers


#### Number of time a speaker has been quoted

In [21]:
speaker_total_occurence = df.groupby('speaker',as_index=False).aggregate({'count':'sum'})\
                                .rename(columns = {'count':'total_quotes'})
speaker_total_occurence.nlargest(15, columns=['total_quotes'])

Unnamed: 0,speaker,total_quotes
149814,none,5857313
161395,president donald trump,261722
24574,boris johnson,86315
94541,joe biden,84953
20952,bernie sanders,69722
194583,tedros adhanom ghebreyesus,67672
180703,scott morrison,60324
10674,andrew cuomo,54054
161405,president trump,53481
14547,anthony fauci,49239


#### Number of unique speaker by newspaper

In [22]:
newspaper_speaker_count = df.groupby('newspaper',as_index=False).aggregate({'speaker':'count'})\
                            .rename(columns = {'speaker':'speaker_count'})
newspaper_speaker_count.nlargest(15, columns=['speaker_count'])

Unnamed: 0,newspaper,speaker_count
4055,msn,16544
784,breitbart,9434
6872,washingtontimes,9239
5439,sfgate,8864
2954,indiatimes,7304
4518,nytimes,7277
5405,seattletimes,6951
2938,independent,6501
5686,stamfordadvocate,6426
890,businessinsider,6422
