## Create TF-IDF matrix: imports

In [3]:
import pandas as pd
import numpy as np
import ast

from Constants import *

from collections import Counter
from itertools import chain

from scipy import sparse
from scipy.sparse import csc_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

## Create newspaper set

In [8]:
# Load the speaker/newspaper table (bz2-compressed CSV) and keep the distinct
# values of its `newspaper` column.
df_speaker_newspaper = pd.read_csv(
    FILE_SPEAKER_NEWSPAPER,
    compression='bz2',
)
newspapers = set(df_speaker_newspaper["newspaper"])

## Create token vocabulary

In [None]:
# Stream the file in chunks; both list-valued columns are stored as Python
# literals and are parsed back into lists by ast.literal_eval.
csv_reader = pd.read_csv(
    FILE_NEWSPAPER_TOKEN,
    chunksize=10_000,
    compression='bz2',
    converters={"newspapers": ast.literal_eval, "tokens": ast.literal_eval},
)

# Running count of every token over the whole file
vocabulary = Counter()

for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")
    # explode() repeats a row's token list once per entry of its `newspapers`
    # list, so each token is weighted by the number of newspapers on that row.
    # update() adds into the existing Counter in place; the previous
    # `vocabulary + Counter(...)` rebuilt the whole vocabulary on every chunk.
    vocabulary.update(chain.from_iterable(df_chunk.explode("newspapers")["tokens"]))

# Drop tokens whose total (newspaper-weighted) count is exactly 1.
# Iterating items() keeps the Counter's insertion order and keeps the tokens as
# plain Python objects instead of round-tripping them through NumPy arrays.
processed_voc = [token for token, count in vocabulary.items() if count != 1]

## Create frequency matrix

In [None]:
# Fresh chunked reader over the newspaper/token file (the reader is consumed
# by iteration, so it is created again here).
csv_reader = pd.read_csv(
    FILE_NEWSPAPER_TOKEN,
    chunksize=10_000,
    compression='bz2',
    converters={"newspapers": ast.literal_eval, "tokens": ast.literal_eval},
)

# Sort the newspapers so that row positions are deterministic
newspapers = sorted(newspapers)
newspapers_to_index = {n: i for i, n in enumerate(newspapers)}
index_to_newspapers = {i: n for i, n in enumerate(newspapers)}

# Token <-> column position, following the order of `processed_voc` (the list
# passed as `vocabulary` to the CountVectorizer in this cell).
# These two maps were previously built from `newspapers` by mistake.
token_to_index = {t: i for i, t in enumerate(processed_voc)}
index_to_token = {i: t for i, t in enumerate(processed_voc)}

def dummy(doc):
    """Identity hook: return `doc` exactly as received.

    Handed to CountVectorizer as both `tokenizer` and `preprocessor`, so the
    documents it receives are passed through without any modification.
    """
    return doc
# Documents are passed through `dummy` unchanged for both the preprocessing and
# the tokenizing step; the vocabulary is fixed to `processed_voc`.
vectorizer = CountVectorizer(tokenizer=dummy, preprocessor=dummy, vocabulary=processed_voc)

# Create DataFrame
df = pd.DataFrame({'newspapers': newspapers})
# Loop-invariant: one row per newspaper, in the order of `newspapers`
df_all_newspapers = df.set_index('newspapers')

# Start from None so that a stale matrix from an earlier run can never be reused
newspaper_token_frequency = None

for (counter, df_chunk) in enumerate(csv_reader):
    print(f"Chunk: {counter}")

    df_exploded = df_chunk.explode("newspapers")

    # Create dataframe with all the tokens per newspaper.
    # The token lists of a group are flattened explicitly: `apply(sum)` only
    # concatenated lists because pandas substituted its own sum for the builtin,
    # and repeated list addition is quadratic in the number of tokens.
    df_grouped = (
        df_exploded.groupby("newspapers")["tokens"]
        .apply(lambda token_lists: list(chain.from_iterable(token_lists)))
        .reset_index()
    )

    # Left-join onto the frame holding every newspaper, so each chunk yields one
    # row per newspaper in the same order; newspapers absent from this chunk get
    # NaN, which is replaced by an empty document.
    df_join = df_all_newspapers.join(df_grouped.set_index('newspapers'))
    df_join["tokens"] = np.where(df_join["tokens"].isna(), [""], df_join["tokens"])

    # Create token frequency vector by newspaper
    X = vectorizer.fit_transform(df_join["tokens"])

    # Sum all the token x newspaper frequency matrix
    if newspaper_token_frequency is None:
        newspaper_token_frequency = X
    else:
        newspaper_token_frequency += X

if newspaper_token_frequency is None:
    raise ValueError("No chunk was read from FILE_NEWSPAPER_TOKEN; the frequency matrix is empty")

## Create TF-IDF matrix

In [None]:
# Learn the IDF weights from the accumulated count matrix, then apply them to
# that same matrix.
transformer = TfidfTransformer()
transformer.fit(newspaper_token_frequency)
newspaper_token_tfidf = transformer.transform(newspaper_token_frequency)

## Save TF-IDF matrix

In [None]:
# Write the sparse TF-IDF matrix to disk in SciPy's .npz format
sparse.save_npz(
    file=FILE_NEWSPAPER_TOKEN_TFIDF,
    matrix=newspaper_token_tfidf,
)

## Example: loading the TF-IDF matrix

In [4]:
# Read the sparse TF-IDF matrix back from the .npz file
newspaper_token_tfidf = sparse.load_npz(file=FILE_NEWSPAPER_TOKEN_TFIDF)