In [1]:
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld

import string
import re

from collections import Counter


import nltk
from nltk.stem import WordNetLemmatizer

from nltk.corpus import stopwords

# Every English NLTK stop word in both its listed form and its capitalized
# form (presumably so capitalized occurrences are filtered as well).
en_stopwords = {
    form
    for word in stopwords.words("english")
    for form in (word, word.capitalize())
}

## Preprocess data

In [27]:
def extract_newspaper(url):
    """Return the site name of ``url``: the host label just before its TLD.

    The network location comes from ``urlparse`` and the top-level domain
    from ``tld.get_tld``. The TLD is cut off the network location and the
    last remaining dot-separated label is returned, e.g.
    ``https://www.nytimes.com/a.html`` -> ``"nytimes"`` when the TLD is
    reported as ``"com"``.

    Parameters
    ----------
    url : str
        URL to analyse; passed unchanged to ``urlparse`` and ``get_tld``.

    Returns
    -------
    str
        Last dot-separated label of the netloc once the TLD is removed.

    Notes
    -----
    Exceptions raised by ``get_tld`` are not caught here and propagate to
    the caller.
    """
    netloc = urlparse(url).netloc
    suffix = "." + get_tld(url, as_object=True).tld

    # Strip only the LAST occurrence of the suffix. ``str.replace`` removes
    # every occurrence, which mangles hosts whose name merely contains the
    # suffix text (``www.commercialappeal.com`` became ``wwwmercialappeal``).
    head, found, tail = netloc.rpartition(suffix)
    host_without_tld = head + tail if found else netloc

    return host_without_tld.split(".")[-1]

In [28]:
def preprocess(quote, words_to_exclude=frozenset()):
    """Turn a raw quotation into a list of cleaned, lemmatized word tokens.

    Steps, in order:

    1. split on "." and lowercase the first word of every sentence
       (other words keep their original case);
    2. delete every run of digits;
    3. tokenize into runs of word characters;
    4. drop stop words (``en_stopwords``) and one-character tokens;
    5. lemmatize each token with ``WordNetLemmatizer`` (default POS);
    6. drop stop words and one-character tokens again, on the lemmas;
    7. drop every token contained in ``words_to_exclude``.

    Parameters
    ----------
    quote : str
        Raw quotation text.
    words_to_exclude : collection of str, optional
        Extra tokens to remove from the result. Empty by default, in which
        case nothing more is removed. The default is an immutable
        ``frozenset`` so that no mutable object is shared between calls.

    Returns
    -------
    list of str
        The remaining tokens, in order of appearance.
    """
    # Lowercase the first word of each sentence. Sentences after the first
    # start with the space that followed the previous ".", so the first
    # *word* (first run of word characters) is targeted rather than the
    # first space-separated chunk, which would be an empty string.
    lower_match = lambda match: match.group(0).lower()
    quote = " ".join(
        re.sub(r"\w+", lower_match, sentence, count=1)
        for sentence in quote.split(".")
    )

    # Remove numbers
    quote = re.sub(r'\d+', '', quote)

    # Tokenize
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    word_tokens = tokenizer.tokenize(quote)

    # TODO Pos tag before lemmatizing ? https://stackoverflow.com/questions/32957895/wordnetlemmatizer-not-returning-the-right-lemma-unless-pos-is-explicit-python

    def remove_stop_words(tokens):
        # Keep tokens that are neither stop words nor single characters.
        return [w for w in tokens if w not in en_stopwords and len(w) > 1]

    # Remove stop words and single letters
    word_tokens = remove_stop_words(word_tokens)

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]

    # Remove stop words and single letters again, this time on the lemmas
    word_tokens = remove_stop_words(word_tokens)

    # Honour the caller-supplied exclusion list (previously ignored).
    if words_to_exclude:
        word_tokens = [w for w in word_tokens if w not in words_to_exclude]

    # TODO: optionally append bigrams (pairs of adjacent tokens) to the result.

    return word_tokens

In [29]:
# Quick check of preprocess on a sample sentence that mixes a capitalized
# first word, a number, punctuation/symbols and a non-ASCII letter.
preprocess("Antoine is going to eat 10 Apples, #Ç[]after is morning run.")

['antoine', 'going', 'eat', 'Apples', 'morning', 'run']

In [30]:
# bz2-compressed quote dump, read chunk by chunk as JSON lines further down.
FILE_IN = 'quotes-2020.json.bz2' # 525 chunks of 10_000
# Output path for the speaker export; that export is currently commented out.
FILE_SPEAKERS = "processed-speaker-2020_2.csv.bz2"

In [79]:
# Output path for the tokenized quotes (only used by the disabled export below).
FILE_QUOTES = "nps-tokens-2020.csv.bz2"

# Lazily iterate over the bz2-compressed JSON-lines file, 10_000 records at a time.
json_reader = pd.read_json(
    FILE_IN,
    lines=True,
    chunksize=10_000,
    compression="bz2",
)

# Threshold referenced only by the disabled speaker filter below.
PROBA_SPEAKER = 0.6


for counter, df_chunk in enumerate(json_reader):

    # Token list for every quotation.
    df_chunk["tokens"] = df_chunk["quotation"].apply(preprocess)
    #df_chunk = df_chunk.explode("urls")

    # One extracted site name per URL listed for the quotation.
    df_chunk["newspapers"] = df_chunk["urls"].apply(
        lambda urls: [extract_newspaper(url) for url in urls]
    )

    #df_chunk["probas"] = df_chunk['probas'].apply(lambda probas: float(probas[0][1]))

    print(f"Chunk: {counter}")

    #df_speakers = df_chunk[df_chunk['probas'] > PROBA_SPEAKER & df_chunk['speaker'] != "None"][["newspaper", "speaker"]]
    df_quotes = df_chunk[["newspapers", "tokens"]]

    # Only the first chunk writes the header and (re)creates the file;
    # every later chunk appends.
    add_header = counter == 0
    write_mode = "w" if add_header else "a"
    #df_speakers.to_csv(FILE_SPEAKERS,header = add_header,index=False, mode=write_mode,compression='bz2')
    #df_quotes.to_csv(FILE_QUOTES,header = add_header,index=False, mode=write_mode,compression='bz2')

Chunk: 0
Chunk: 1
Chunk: 2
Chunk: 3
Chunk: 4
Chunk: 5
Chunk: 6
Chunk: 7
Chunk: 8
Chunk: 9
Chunk: 10
Chunk: 11
Chunk: 12
Chunk: 13
Chunk: 14
Chunk: 15
Chunk: 16
Chunk: 17
Chunk: 18
Chunk: 19
Chunk: 20
Chunk: 21
Chunk: 22
Chunk: 23
Chunk: 24
Chunk: 25
Chunk: 26
Chunk: 27
Chunk: 28
Chunk: 29
Chunk: 30
Chunk: 31
Chunk: 32
Chunk: 33
Chunk: 34
Chunk: 35
Chunk: 36
Chunk: 37
Chunk: 38
Chunk: 39
Chunk: 40
Chunk: 41
Chunk: 42
Chunk: 43
Chunk: 44
Chunk: 45
Chunk: 46
Chunk: 47
Chunk: 48
Chunk: 49
Chunk: 50
Chunk: 51
Chunk: 52
Chunk: 53
Chunk: 54
Chunk: 55
Chunk: 56
Chunk: 57
Chunk: 58
Chunk: 59
Chunk: 60
Chunk: 61
Chunk: 62
Chunk: 63
Chunk: 64
Chunk: 65
Chunk: 66
Chunk: 67
Chunk: 68
Chunk: 69
Chunk: 70
Chunk: 71
Chunk: 72
Chunk: 73
Chunk: 74
Chunk: 75
Chunk: 76
Chunk: 77
Chunk: 78
Chunk: 79
Chunk: 80
Chunk: 81
Chunk: 82
Chunk: 83
Chunk: 84
Chunk: 85
Chunk: 86
Chunk: 87
Chunk: 88
Chunk: 89
Chunk: 90
Chunk: 91
Chunk: 92
Chunk: 93
Chunk: 94
Chunk: 95
Chunk: 96
Chunk: 97
Chunk: 98
Chunk: 99
Chunk: 100