In [1]:
# Imports 
from glob import glob
import os
from joblib import Parallel, delayed

import numpy as np
import gensim
import nltk

from lib.WordVectors import language_filter
from settings import project_root

## Data Exploration and Analysis

In [2]:
# create a numpy array containing the absolute paths names of the input files.
rawfiles = os.path.join(project_root, 'tmp', 'test_files', '*')
file_array = np.array(glob(rawfiles))
print('Number of files: {}'.format(file_array.shape[0]))

Number of files: 1506602


In [3]:
# Print out a few sample texts for verification.
samples = np.random.choice(file_array, 10, replace=False)
for sam in samples:
    print (open (sam, 'r').read())

Cindy Vortex was you're ordinary eleven- year- old girl. She loved to talk on the phone, shop and hang out with her friend, Libby Folfax. But today she was on her bed crying her eyes out and the weather had fit her mood perfectly. It was raining. Hard. The weather man on the radio was saying that this was the most rain that Retroville got in fifty years. Cindy turned off the radio and laid out on her bed."Why did I have to be so stupid?" she asked herself and cried more on her pillow.You see, today was her birthday and... You know why don't I just let Cindy tell you about her day through flashback.Cindy woke up happily as the alarm clock went off. Today was Sunday, her birthday, and she was happy because she knew what she was going to do today. She and her family do this every year. They were going to go Retroland and ride every ride there twice and pig out on lots of junk food.She hurriedly took a shower, brushed her teeth and got dressed. She looked at her self in a mirror, she frown

In [4]:
def mean_file_length():
    lengths = []
    samples = np.random.choice(file_array, 100, replace=False)
    for sam in samples:
        lengths.append(len(open (sam, 'r').read()))
    lengths = np.array(lengths)
    return lengths.mean()

In [5]:
mean_lens = Parallel(n_jobs=8)(delayed(mean_file_length)() for _ in range(100))
mean_lens = np.array(mean_lens)

In [6]:
print('Approximate Average file length in chars : {}'.format(mean_lens.mean()))

Approximate Average file length in chars : 10125.0372


## Text Pre-processing

### Language Filter

In [7]:
# Use stop words to identify the language as English and filter texts based on that.
eng_filter = Parallel(n_jobs=8)(delayed(language_filter.is_english_nltk)(open(doc).read()) for doc in file_array)
non_english = file_array[~np.array(eng_filter)]

In [8]:
print('Number of non-english files : {}'.format(non_english.shape[0]))

Number of non-english files : 120740


In [9]:
# verify if the texts are identified as non-english correctly
non_samples = np.random.choice(non_english, 10, replace=False)
for n_sam in non_samples:
    print (open(n_sam, 'r').read())

Bueno, aquí os traigo el tercer capítulo, siento haber tardado un par de días mas, lo siento ! Aquí os dejo con el capítulo, que lo disfrutéis!

Capítulo 3. Antiguas amistades.Qué sorpresa el encontrarme a Rukawa allí, bueno más que una sorpresa era un decepción, no es que me cayera bien precisamente. He de reconocer que juega bien, pero es tan borde... Me senté al lado de unos arbustos a ver cómo entrenaba, no tenía otra cosa mejor que hacer.-¿Qué miras?-Miro como entrenas, pasaba por aquí y no tengo nada que hacer.-Hmp, vale, pero no molestes.-Descuida, no te molestaré-digo con sarcasmo.-Tu cara me suena. Eres la manager del Ryonan.-¿No me digas?-Oye tú, ve a llamar a Sendoh donde quiera que esté. Quiero jugar un uno contra uno con él.-Tengo nombre, me llamo Ayano, y no tengo porque hacerlo, no soy tu sirvienta.-Tsk...no te alteres.¡Kaede Rukawa... que borde que es! No sé como tiene un club de fans, no en serio no lo entiendo, si fuera como Sendoh vale, pero siendo tan borde, egocént

In [10]:
# Iteratively remove non_english texts from source
for item in non_english:
    os.remove(item)

In [11]:
# File list after filtering out non-english texts
file_array = np.array(glob(rawfiles))
print('Number of files left after filtering out the non-english-texts: {}'.format(file_array.shape[0]))

Number of files left after filtering out the non-english-texts: 1385862


In [12]:
# Print out a few sample texts for verification.
samples = np.random.choice(file_array, 10, replace=False)
for sam in samples:
    print (open (sam, 'r').read())

"What?" Daria was dumbfounded."I said, Daria, that you didn't make the team," Brittany repeated."Wh-why the hell not?" Daria stammered out.Brittany curled a lock of her hair around one finger. "Well, you don't have that special bouncity-bounce that a cheerleader should have, you know?""'Bouncity-bounce?'""And you're not very well coordinated, either, Daria...last year, when I was a freshman, one of the girls on the team lost her balance and fell. It was so scary!" Brittany trembled at the mere thought of it. "She broke one of her legs, and cracked a verte-thingy in her back. If she hit just a little harder, she'd be in a wheelchair!" Brittany was now openly crying, embarrassing Daria."Um...there there," Daria lamely tried to comfort Brittany by patting her on the shoulder."You don't hate me, do you, Daria? I know you really wanted to be a cheerleader, but please don't hate me!""I...of course I don't hate you, Brittany," Daria said, put off by the whole conversation. "And, well, it's no

In [13]:
sample = open(samples[0],'r').read()

### Tokenizing Text

In [14]:
%%timeit
tokened_sample_1 = gensim.utils.tokenize(sample,lowercase=False, deacc=False, errors='strict')

The slowest run took 14.20 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 3: 571 ns per loop


In [15]:
tokened_sample_1 = gensim.utils.tokenize(sample,lowercase=False, deacc=False, errors='strict')
print([token for token in tokened_sample_1])

[u'What', u'Daria', u'was', u'dumbfounded', u'I', u'said', u'Daria', u'that', u'you', u'didn', u't', u'make', u'the', u'team', u'Brittany', u'repeated', u'Wh', u'why', u'the', u'hell', u'not', u'Daria', u'stammered', u'out', u'Brittany', u'curled', u'a', u'lock', u'of', u'her', u'hair', u'around', u'one', u'finger', u'Well', u'you', u'don', u't', u'have', u'that', u'special', u'bouncity', u'bounce', u'that', u'a', u'cheerleader', u'should', u'have', u'you', u'know', u'Bouncity', u'bounce', u'And', u'you', u're', u'not', u'very', u'well', u'coordinated', u'either', u'Daria', u'last', u'year', u'when', u'I', u'was', u'a', u'freshman', u'one', u'of', u'the', u'girls', u'on', u'the', u'team', u'lost', u'her', u'balance', u'and', u'fell', u'It', u'was', u'so', u'scary', u'Brittany', u'trembled', u'at', u'the', u'mere', u'thought', u'of', u'it', u'She', u'broke', u'one', u'of', u'her', u'legs', u'and', u'cracked', u'a', u'verte', u'thingy', u'in', u'her', u'back', u'If', u'she', u'hit', u'ju

In [16]:
%%timeit
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')

1000000 loops, best of 3: 552 ns per loop


In [17]:
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
print([token for token in tokened_sample_2])

[u'what', u'daria', u'was', u'dumbfounded', u'i', u'said', u'daria', u'that', u'you', u'didn', u't', u'make', u'the', u'team', u'brittany', u'repeated', u'wh', u'why', u'the', u'hell', u'not', u'daria', u'stammered', u'out', u'brittany', u'curled', u'a', u'lock', u'of', u'her', u'hair', u'around', u'one', u'finger', u'well', u'you', u'don', u't', u'have', u'that', u'special', u'bouncity', u'bounce', u'that', u'a', u'cheerleader', u'should', u'have', u'you', u'know', u'bouncity', u'bounce', u'and', u'you', u're', u'not', u'very', u'well', u'coordinated', u'either', u'daria', u'last', u'year', u'when', u'i', u'was', u'a', u'freshman', u'one', u'of', u'the', u'girls', u'on', u'the', u'team', u'lost', u'her', u'balance', u'and', u'fell', u'it', u'was', u'so', u'scary', u'brittany', u'trembled', u'at', u'the', u'mere', u'thought', u'of', u'it', u'she', u'broke', u'one', u'of', u'her', u'legs', u'and', u'cracked', u'a', u'verte', u'thingy', u'in', u'her', u'back', u'if', u'she', u'hit', u'ju

### Stemming

In [18]:
%%timeit
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
stemmer = gensim.parsing.porter.PorterStemmer()
stemmed_sample = (stemmer.stem(token) for token in tokened_sample_2)

1000000 loops, best of 3: 1.59 µs per loop


In [20]:
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
stemmer = gensim.parsing.porter.PorterStemmer()
stemmed_sample = (stemmer.stem(token) for token in tokened_sample_2)
print([st for st in stemmed_sample])

[u'what', u'daria', u'wa', u'dumbfound', u'i', u'said', u'daria', u'that', u'you', u'didn', u't', u'make', u'the', u'team', u'brittani', u'repeat', u'wh', u'why', u'the', u'hell', u'not', u'daria', u'stammer', u'out', u'brittani', u'curl', u'a', u'lock', u'of', u'her', u'hair', u'around', u'on', u'finger', u'well', u'you', u'don', u't', u'have', u'that', u'special', u'bounciti', u'bounc', u'that', u'a', u'cheerlead', u'should', u'have', u'you', u'know', u'bounciti', u'bounc', u'and', u'you', u're', u'not', u'veri', u'well', u'coordin', u'either', u'daria', u'last', u'year', u'when', u'i', u'wa', u'a', u'freshman', u'on', u'of', u'the', u'girl', u'on', u'the', u'team', u'lost', u'her', u'balanc', u'and', u'fell', u'it', u'wa', u'so', u'scari', u'brittani', u'trembl', u'at', u'the', u'mere', u'thought', u'of', u'it', u'she', u'broke', u'on', u'of', u'her', u'leg', u'and', u'crack', u'a', u'vert', u'thingi', u'in', u'her', u'back', u'if', u'she', u'hit', u'just', u'a', u'littl', u'harder'

### Lemmatization

In [21]:
%%timeit
lemmatized = gensim.utils.lemmatize(sample)

1 loop, best of 3: 161 ms per loop


In [22]:
lemmatized = gensim.utils.lemmatize(sample)
print(lemmatized)

['daria/NN', 'be/VB', 'dumbfounded/JJ', 'say/VB', 'darium/NN', 'didn/VB', 'make/VB', 'team/NN', 'brittany/NN', 'repeat/VB', 'wh/VB', 'hell/NN', 'not/RB', 'darium/NN', 'stammer/VB', 'out/RB', 'brittany/NN', 'curl/VB', 'lock/NN', 'hair/NN', 'finger/NN', 'well/RB', 'don/VB', 'have/VB', 'special/JJ', 'bouncity/NN', 'cheerleader/NN', 'have/VB', 'know/VB', 'bouncity/NN', 'bounce/NN', 're/NN', 'not/RB', 'very/RB', 'well/RB', 'coordinate/VB', 'darium/NN', 'last/JJ', 'year/NN', 'be/VB', 'freshman/NN', 'girl/NN', 'team/NN', 'lose/VB', 'balance/NN', 'fall/VB', 'be/VB', 'so/RB', 'scary/JJ', 'brittany/NN', 'tremble/VB', 'mere/JJ', 'thought/NN', 'break/VB', 'leg/NN', 'crack/VB', 'verte/NN', 'thingy/NN', 'back/RB', 'hit/VB', 'just/RB', 'little/JJ', 'harder/NN', 'be/VB', 'wheelchair/NN', 'brittany/NN', 'be/VB', 'now/RB', 'openly/RB', 'cry/VB', 'embarrassing/NN', 'there/RB', 'darium/NN', 'lamely/RB', 'try/VB', 'comfort/NN', 'brittany/NN', 'pat/VB', 'shoulder/NN', 'don/VB', 'hate/VB', 'do/VB', 'daria/VB

### Stop word Removal

In [23]:
stop_words = set(nltk.corpus.stopwords.words('english'))
print (stop_words)

set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'more', u'himself', u'that', u'but', u'don', u'with', u'than', u'those', u'he', u'me', u'myself', u'ma', u'these', u'up', u'will', u'below', u'ain

In [24]:
%%timeit
stop_words = set(nltk.corpus.stopwords.words('english'))
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
non_stops = [token for token in tokened_sample_2 if token not in stop_words]
stemmed_sample = [stemmer.stem(token) for token in non_stops]

100 loops, best of 3: 9.99 ms per loop


In [25]:
stop_words = set(nltk.corpus.stopwords.words('english'))
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
non_stops = [token for token in tokened_sample_2 if token not in stop_words]
stemmed_sample = [stemmer.stem(token) for token in non_stops]
print[stemmed_sample]

[[u'daria', u'dumbfound', u'said', u'daria', u'make', u'team', u'brittani', u'repeat', u'wh', u'hell', u'daria', u'stammer', u'brittani', u'curl', u'lock', u'hair', u'around', u'on', u'finger', u'well', u'special', u'bounciti', u'bounc', u'cheerlead', u'know', u'bounciti', u'bounc', u'well', u'coordin', u'either', u'daria', u'last', u'year', u'freshman', u'on', u'girl', u'team', u'lost', u'balanc', u'fell', u'scari', u'brittani', u'trembl', u'mere', u'thought', u'broke', u'on', u'leg', u'crack', u'vert', u'thingi', u'back', u'hit', u'littl', u'harder', u'wheelchair', u'brittani', u'openli', u'cry', u'embarrass', u'daria', u'um', u'daria', u'lame', u'tri', u'comfort', u'brittani', u'pat', u'shoulder', u'hate', u'daria', u'know', u'realli', u'want', u'cheerlead', u'pleas', u'hate', u'cours', u'hate', u'brittani', u'daria', u'said', u'put', u'whole', u'convers', u'well', u'much', u'want', u'cheerlead', u'ms', u'li', u'want', u'take', u'extracirricular', u'would', u'let', u'show', u'face',

### Gensim Pre-Processing

In [26]:
%%timeit
gensim.parsing.preprocess_string(sample)

100 loops, best of 3: 7.89 ms per loop


In [27]:
preprocessed = gensim.parsing.preprocess_string(sample)
print(preprocessed)

[u'daria', u'dumbfound', u'said', u'daria', u'team', u'brittani', u'repeat', u'hell', u'daria', u'stammer', u'brittani', u'curl', u'lock', u'hair', u'finger', u'special', u'bounciti', u'bounc', u'cheerlead', u'know', u'bounciti', u'bounc', u'coordin', u'daria', u'year', u'freshman', u'girl', u'team', u'lost', u'balanc', u'fell', u'scari', u'brittani', u'trembl', u'mere', u'thought', u'broke', u'leg', u'crack', u'vert', u'thingi', u'hit', u'littl', u'harder', u'wheelchair', u'brittani', u'openli', u'cry', u'embarrass', u'daria', u'daria', u'lame', u'tri', u'comfort', u'brittani', u'pat', u'shoulder', u'hate', u'daria', u'know', u'want', u'cheerlead', u'hate', u'cours', u'hate', u'brittani', u'daria', u'said', u'convers', u'want', u'cheerlead', u'want', u'extracirricular', u'let', u'face', u'world', u'brittani', u'said', u'tear', u'final', u'run', u'dry', u'impuls', u'lean', u'forward', u'hug', u'daria', u'recoil', u'touch', u'hesit', u'arm', u'brittani', u'gentli', u'squeez', u'worri', 