## Create TF-IDF matrix

In [16]:
import pandas as pd
import numpy as np
import ast

from collections import Counter
from itertools import chain

from scipy import sparse
from scipy.sparse import csc_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# bz2-compressed CSV of quotes; it is streamed in chunks below and its
# "newspapers" and "tokens" columns are parsed with ast.literal_eval.
FILE_QUOTES = "nps-tokens-2020.csv.bz2"
# bz2-compressed CSV whose "newspaper" column provides the set of newspapers.
FILE_SPEAKER_NEWSPAPER = "speaker-newspaper-2020.csv.bz2"

In [4]:
#===================================== Create newspaper set =====================================#
print("Create newspaper set")

# Read the speaker/newspaper table and keep the distinct values of its
# "newspaper" column; later cells use this set to define the matrix rows.
df_speaker_newspaper = pd.read_csv(
    FILE_SPEAKER_NEWSPAPER,
    compression="bz2",
)
newspapers = set(df_speaker_newspaper["newspaper"])

Create newspaper set


In [5]:
#===================================== Create token vocabulary =====================================#
print("Create token vocabulary")

# Token -> number of occurrences, accumulated over the whole quotes file.
vocabulary = Counter()

# Stream the quotes file in chunks of 10,000 rows; the "newspapers" and
# "tokens" columns are stored as Python literals and parsed back here.
csv_reader = pd.read_csv(
    FILE_QUOTES,
    chunksize=10_000,
    compression='bz2',
    converters={"newspapers": ast.literal_eval, "tokens": ast.literal_eval},
)

# Count every token. explode("newspapers") repeats a quote's row once per
# newspaper in its list, so the tokens are counted once per newspaper.
for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")
    # Update in place: `vocabulary + Counter(...)` would rebuild the full
    # counter on every chunk.
    vocabulary.update(chain.from_iterable(df_chunk.explode("newspapers")["tokens"]))

# Keep only tokens that occur more than once. Counter preserves insertion
# order, so the order of processed_voc (and therefore the column order of the
# matrices built from it) is the order of first appearance.
processed_voc = [token for token, count in vocabulary.items() if count > 1]

Create token vocabulary
Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk:

In [6]:
#===================================== Create frequency matrix =====================================#
print("Create frequency matrix")

# Second streaming pass over the quotes file (same parsing as for the vocabulary).
csv_reader = pd.read_csv(
    FILE_QUOTES,
    chunksize=10_000,
    compression='bz2',
    converters={"newspapers": ast.literal_eval, "tokens": ast.literal_eval},
)

# Row axis of the matrix: newspapers in sorted order.
newspapers = sorted(newspapers)
newspapers_to_index = {n: i for i, n in enumerate(newspapers)}
index_to_newspapers = {i: n for i, n in enumerate(newspapers)}

# Column axis of the matrix: tokens in the order of processed_voc, which is the
# order CountVectorizer assigns column indices when given a list vocabulary.
# (These maps previously enumerated `newspapers` by mistake.)
token_to_index = {t: i for i, t in enumerate(processed_voc)}
index_to_token = {i: t for i, t in enumerate(processed_voc)}


def dummy(doc):
    """Return ``doc`` unchanged.

    Used as both tokenizer and preprocessor so that CountVectorizer accepts
    documents that are already lists of tokens.
    """
    return doc


# token_pattern=None: the pattern is unused when a tokenizer is supplied, and
# passing None avoids the corresponding scikit-learn warning.
vectorizer = CountVectorizer(
    tokenizer=dummy,
    preprocessor=dummy,
    vocabulary=processed_voc,
    token_pattern=None,
)

# Create DataFrame holding every newspaper; joining against it gives each chunk
# the same row set and row order, including newspapers absent from the chunk.
df = pd.DataFrame({'newspapers': newspapers})
df_all_newspapers = df.set_index('newspapers')  # loop-invariant, built once

newspaper_token_frequency = None

for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")

    df_exploded = df_chunk.explode("newspapers")

    # Concatenate all token lists of each newspaper. chain.from_iterable is
    # used instead of apply(sum): summing lists copies repeatedly, and the
    # builtin sum only worked on lists here through pandas' own aliasing.
    df_grouped = (
        df_exploded.groupby("newspapers")["tokens"]
        .apply(lambda token_lists: list(chain.from_iterable(token_lists)))
        .reset_index()
    )

    # Left-join on the full newspaper index so that rows line up across chunks.
    df_join = df_all_newspapers.join(df_grouped.set_index('newspapers'))

    # Newspapers without quotes in this chunk come out of the join as NaN;
    # give them an empty token list so they yield an all-zero row.
    documents = [
        tokens if isinstance(tokens, list) else []
        for tokens in df_join["tokens"]
    ]

    # Token counts per newspaper for this chunk. The vocabulary is fixed, so
    # transform() suffices and nothing is refitted per chunk.
    X = vectorizer.transform(documents)

    # Sum all the per-chunk newspaper x token frequency matrices.
    if newspaper_token_frequency is None:
        newspaper_token_frequency = X
    else:
        newspaper_token_frequency = newspaper_token_frequency + X

Create frequency matrix
Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk:

In [7]:
#===================================== Create TF-IDF matrix =====================================#
print("Create TF-IDF matrix")

# Fit a default-configured TF-IDF transformer on the accumulated count matrix,
# then apply it to that same matrix.
transformer = TfidfTransformer()
transformer.fit(newspaper_token_frequency)
newspaper_token_tfidf = transformer.transform(newspaper_token_frequency)

Create TF-IDF matrix


In [17]:
# Persist the sparse TF-IDF matrix to disk in SciPy's .npz format.
sparse.save_npz(file="newspaper_token_tfidf.npz", matrix=newspaper_token_tfidf)


In [None]:
# Read the sparse TF-IDF matrix back from the .npz file written above.
newspaper_token_tfidf = sparse.load_npz(file="newspaper_token_tfidf.npz")