In [1]:
# Imports 
from glob import glob
import os
from joblib import Parallel, delayed

import numpy as np
import gensim
import nltk

from lib.WordVectors import language_filter
from settings import project_root

## Data Exploration and Analysis

In [2]:
# Build a numpy array holding the path of every input file matched by the glob pattern.
rawfiles = os.path.join(project_root, 'tmp', 'test_files', '*')
file_array = np.asarray(glob(rawfiles))
print('Number of files: {}'.format(len(file_array)))

Number of files: 1506602


In [3]:
# Print out a few randomly chosen sample texts for verification.
samples = np.random.choice(file_array, 10, replace=False)
for sam in samples:
    # The context manager closes each file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(sam, 'r') as handle:
        print(handle.read())

Cindy Vortex was you're ordinary eleven- year- old girl. She loved to talk on the phone, shop and hang out with her friend, Libby Folfax. But today she was on her bed crying her eyes out and the weather had fit her mood perfectly. It was raining. Hard. The weather man on the radio was saying that this was the most rain that Retroville got in fifty years. Cindy turned off the radio and laid out on her bed."Why did I have to be so stupid?" she asked herself and cried more on her pillow.You see, today was her birthday and... You know why don't I just let Cindy tell you about her day through flashback.Cindy woke up happily as the alarm clock went off. Today was Sunday, her birthday, and she was happy because she knew what she was going to do today. She and her family do this every year. They were going to go Retroland and ride every ride there twice and pig out on lots of junk food.She hurriedly took a shower, brushed her teeth and got dressed. She looked at her self in a mirror, she frown

In [4]:
def mean_file_length(files=None, n_samples=100):
    """Estimate the mean file length (in characters) from a random sample.

    Parameters
    ----------
    files : sequence of str, optional
        Paths to sample from. Defaults to the notebook-level ``file_array``,
        which keeps the original zero-argument call working.
    n_samples : int, optional
        Number of distinct files to read (sampled without replacement).

    Returns
    -------
    float
        Mean of ``len(text)`` over the sampled files.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_samples`` exceeds the number
        of available files.
    """
    if files is None:
        files = file_array
    chosen = np.random.choice(files, n_samples, replace=False)
    lengths = []
    for path in chosen:
        # Close every handle explicitly; this function is called many times
        # and the original left each opened file to the garbage collector.
        with open(path, 'r') as handle:
            lengths.append(len(handle.read()))
    return np.array(lengths).mean()

In [5]:
# Repeat the sampled-mean estimate 100 times across 8 workers and collect the results in an array.
mean_lens = np.array(
    Parallel(n_jobs=8)(delayed(mean_file_length)() for _ in range(100))
)

In [None]:
# Report the mean of the per-run sample means.
print('Approximate Average file length in chars : {}'.format(np.mean(mean_lens)))

Approximate Average file length in chars : 10125.0372


## Text Pre-processing

### Language Filter

In [None]:
# Use stop words to identify the language as English and filter texts based on that.
def _read_text(path):
    """Return the full contents of the file at `path`, closing it afterwards."""
    with open(path) as handle:
        return handle.read()

eng_filter = Parallel(n_jobs=8)(delayed(language_filter.is_english_nltk)(_read_text(doc)) for doc in file_array)
# Force a boolean mask before negating: `~` is only a logical NOT on bool arrays
# (on ints it is a bitwise NOT, and on the float array built from an empty list it raises).
# NOTE(review): assumes is_english_nltk returns a truthy/falsy flag per document - confirm.
non_english = file_array[~np.array(eng_filter, dtype=bool)]

In [None]:
# Report how many files were flagged as non-English.
print('Number of non-english files : {}'.format(len(non_english)))

In [None]:
# Verify that the texts were correctly identified as non-English.
# Never ask for more samples than exist: with replace=False, np.random.choice
# raises ValueError when the requested size exceeds the population.
n_preview = min(10, non_english.shape[0])
non_samples = np.random.choice(non_english, n_preview, replace=False) if n_preview else non_english[:0]
for n_sam in non_samples:
    # Close each file right after reading it.
    with open(n_sam, 'r') as handle:
        print(handle.read())

In [None]:
# Iteratively remove non-English texts from the source directory.
# WARNING: this permanently deletes input files.
for item in non_english:
    # Skip paths that are already gone so the cell can be re-run safely
    # (e.g. after an interrupted run) without raising on missing files.
    if os.path.exists(item):
        os.remove(item)

In [None]:
# Re-list the input directory now that the non-English texts have been removed.
file_array = np.asarray(glob(rawfiles))
print('Number of files left after filtering out the non-english-texts: {}'.format(len(file_array)))

In [None]:
# Print out a few randomly chosen sample texts for verification.
samples = np.random.choice(file_array, 10, replace=False)
for sam in samples:
    # The context manager closes each file as soon as it has been read.
    with open(sam, 'r') as handle:
        print(handle.read())

In [None]:
# Keep the first sampled text as the running example for the pre-processing steps below.
# Read it through a context manager so the file handle is closed.
with open(samples[0], 'r') as handle:
    sample = handle.read()

### Tokenizing Text

In [None]:
%%timeit
tokened_sample_1 = gensim.utils.tokenize(sample,lowercase=False, deacc=False, errors='strict')

In [None]:
# Tokenize the sample without lowercasing or de-accenting, then print the tokens.
tokened_sample_1 = gensim.utils.tokenize(sample, lowercase=False, deacc=False, errors='strict')
print(list(tokened_sample_1))

In [None]:
%%timeit
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')

In [None]:
# Tokenize the sample with lowercasing and de-accenting enabled, then print the tokens.
tokened_sample_2 = gensim.utils.tokenize(sample, lowercase=True, deacc=True, errors='strict')
print(list(tokened_sample_2))

### Stemming

In [None]:
%%timeit
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
stemmer = gensim.parsing.porter.PorterStemmer()
stemmed_sample = (stemmer.stem(token) for token in tokened_sample_2)

In [None]:
# Stem the normalised tokens and print them.
# The stemmer is created here because names assigned inside a %%timeit cell are
# not kept in the notebook namespace, so relying on one raises NameError.
stemmer = gensim.parsing.porter.PorterStemmer()
tokened_sample_2 = gensim.utils.tokenize(sample, lowercase=True, deacc=True, errors='strict')
stemmed_sample = [stemmer.stem(token) for token in tokened_sample_2]
print(stemmed_sample)

### Lemmatization

In [None]:
%%timeit
# Time gensim's lemmatizer on the sample text.
# NOTE(review): gensim.utils.lemmatize appears to depend on the optional `pattern`
# package and is not present in gensim 4.x - confirm the installed version.
lemmatized = gensim.utils.lemmatize(sample)

In [None]:
# Lemmatize the sample text and print the result for inspection.
# NOTE(review): gensim.utils.lemmatize appears to depend on the optional `pattern`
# package and is not present in gensim 4.x - confirm the installed version.
lemmatized = gensim.utils.lemmatize(sample)
print(lemmatized)

### Stop word Removal

In [None]:
# Load NLTK's English stop-word list into a set (for fast membership tests) and display it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - confirm.
stop_words = set(nltk.corpus.stopwords.words('english'))
print (stop_words)

In [None]:
%%timeit
stop_words = set(nltk.corpus.stopwords.words('english'))
tokened_sample_2 = gensim.utils.tokenize(sample,lowercase=True, deacc=True, errors='strict')
non_stops = [token for token in tokened_sample_2 if token not in stop_words]
stemmed_sample = [stemmer.stem(token) for token in non_stops]

In [None]:
# Show the stemmed, stop-word-free tokens of the sample.
# The result is recomputed here because names assigned inside the %%timeit cell
# are not kept in the notebook namespace; `print[...]` is also replaced by a real call.
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = gensim.parsing.porter.PorterStemmer()
tokened_sample_2 = gensim.utils.tokenize(sample, lowercase=True, deacc=True, errors='strict')
stemmed_sample = [stemmer.stem(token) for token in tokened_sample_2 if token not in stop_words]
print(stemmed_sample)

### Gensim Pre-Processing

In [None]:
%%timeit
# Time gensim's preprocess_string on the sample, for comparison with the manual steps timed above.
gensim.parsing.preprocess_string(sample)

In [None]:
# Run gensim's preprocess_string on the sample and print the resulting tokens for inspection.
preprocessed = gensim.parsing.preprocess_string(sample)
print(preprocessed)