# Data preprocessing Jules-Verne Project

In [31]:
import nltk 
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import re
import pprint

### Get raw data & remove heading

In [32]:
# ------------------------------
# Function get_and_split
# ------------------------------

def get_and_split(data_path, limit_str, encoding=None):
    """Read a text file and return the part that follows a marker string.

    Args:
        data_path: Path of the text file to read.
        limit_str: Marker string that closes the file's heading.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the platform's default encoding.

    Returns:
        Everything in the file after the first occurrence of ``limit_str``.

    Raises:
        ValueError: If ``limit_str`` is empty or does not occur in the file.
    """
    with open(data_path, "r", encoding=encoding) as file:
        contents = file.read()
    # partition() cuts at the first marker only, so a later repetition of the
    # marker inside the body does not truncate the returned text.
    _, marker, body = contents.partition(limit_str)
    if not marker:
        raise ValueError(
            "marker {!r} not found in {!r}".format(limit_str, data_path)
        )
    return body

In [33]:
# Each file's heading is closed by a "DEBUT DU FICHIER <id> ---..." marker;
# only the text that follows that marker is kept.
_HEADING_END = 'DEBUT DU FICHIER {} --------------------------------'

data_ballon = get_and_split('data/01_ballon.txt', _HEADING_END.format('ballon1'))
data_begum = get_and_split('data/02_begum.txt', _HEADING_END.format('begum2'))
data_blocus = get_and_split('data/03_blocus.txt', _HEADING_END.format('blocus3'))
data_bounty = get_and_split('data/04_bounty.txt', _HEADING_END.format('bounty1'))
data_robur = get_and_split('data/05_robur.txt', _HEADING_END.format('robur1'))
data_tdm80 = get_and_split('data/06_tdm80.txt', _HEADING_END.format('tdm80j2'))
data_terrelune = get_and_split('data/07_terrelune.txt', _HEADING_END.format('tlun3'))

In [44]:
# uncomment to check document
# print(data_ballon)
# print(data_begum)
# print(data_blocus)
# print(data_bounty)
# print(data_robur)
# print(data_tdm80)
# print(data_terrelune)

### Tokenizer

In [35]:
# ------------------------------
# Function tokenizer_punctuation
# ------------------------------

def tokenizer_punctuation(sample_text):
    """Split ``sample_text`` into word tokens, discarding punctuation.

    Args:
        sample_text: The string to tokenize.

    Returns:
        The list of tokens produced by an NLTK ``RegexpTokenizer`` that
        matches runs of word characters.
    """
    word_pattern = r'\w+'
    return nltk.RegexpTokenizer(word_pattern).tokenize(sample_text)

In [50]:
# Lower-case every text, then split it into punctuation-free word tokens.
(tokens_ballon, tokens_begum, tokens_blocus, tokens_bounty,
 tokens_robur, tokens_tdm80, tokens_terrelune) = [
    tokenizer_punctuation(raw_text.lower())
    for raw_text in (data_ballon, data_begum, data_blocus, data_bounty,
                     data_robur, data_tdm80, data_terrelune)
]

In [51]:
# Preview the first 10 tokens of one text; uncomment another line below to
# inspect a different text.
print(tokens_ballon[:10])
# print(tokens_begum[:10])
# print(tokens_blocus[:10])
# print(tokens_bounty[:10])
# print(tokens_robur[:10])
# print(tokens_tdm80[:10])
# print(tokens_terrelune[:10])

['jules', 'verne', 'cinq', 'semaines', 'en', 'ballon', 'voyage', 'de', 'découvertes', 'en']


In [37]:
# NLTK's French stopword list, stored as a set for fast membership tests.
french_stopwords = set(stopwords.words('french'))


def filtre_stopfr(text):
    """Return the tokens of ``text`` that are not French stopwords.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list, in the original order, of the tokens whose lower-cased form
        is not in ``french_stopwords``.
    """
    return [token for token in text if token.lower() not in french_stopwords]

In [54]:
# Remove French stopwords from each token list.
(swfree_ballon, swfree_begum, swfree_blocus, swfree_bounty,
 swfree_robur, swfree_tdm80, swfree_terrelune) = map(
    filtre_stopfr,
    (tokens_ballon, tokens_begum, tokens_blocus, tokens_bounty,
     tokens_robur, tokens_tdm80, tokens_terrelune),
)

### Word Frequencies

In [57]:
# Build one frequency distribution per text from its stopword-free tokens.
(freg_ballon, freg_begum, freg_blocus, freg_bounty,
 freg_robur, freg_tdm80, freg_terrelune) = (
    nltk.FreqDist(kept_tokens)
    for kept_tokens in (swfree_ballon, swfree_begum, swfree_blocus,
                        swfree_bounty, swfree_robur, swfree_tdm80,
                        swfree_terrelune)
)

In [58]:
# Show the 10 most frequent tokens of one text; uncomment another line below
# to inspect a different text.
# pprint.pprint(freg_ballon.most_common(10))
# pprint.pprint(freg_begum.most_common(10))
# pprint.pprint(freg_blocus.most_common(10))
# pprint.pprint(freg_bounty.most_common(10))
# pprint.pprint(freg_robur.most_common(10))
# pprint.pprint(freg_tdm80.most_common(10))
pprint.pprint(freg_terrelune.most_common(10))

[('plus', 294),
 ('barbicane', 250),
 ('cette', 228),
 ('a', 218),
 ('mille', 198),
 ('lune', 187),
 ('cent', 170),
 ('deux', 165),
 ('si', 147),
 ('maston', 139)]
