# Sprint 20

## Natural Language Processing

### Preparation

In [1]:
from sklearn.datasets import load_files

train_review = load_files('./aclImdb/train/', encoding='utf-8')
x_train, y_train = train_review.data, train_review.target

test_review = load_files('./aclImdb/test/', encoding='utf-8')
x_test, y_test = test_review.data, test_review.target

# Display of the correspondence between 0, 1 of the label and the meaning
print(train_review.target_names)

['neg', 'pos', 'unsup']


In [2]:
print("x : {}".format(x_train[0]))

x : Full of (then) unknown actors TSF is a great big cuddly romp of a film.<br /><br />The idea of a bunch of bored teenagers ripping off the local sink factory is odd enough, but add in the black humour that Forsyth & Co are so good at and your in for a real treat.<br /><br />The comatose van driver by itself worth seeing, and the canal side chase is just too real to be anything but funny.<br /><br />And for anyone who lived in Glasgow it's a great "Oh I know where that is" film.


### [Problem 1] Scratch implementation of BoW

In [18]:
import re
from collections import Counter
import pandas as pd
import functools

In [4]:
mini_dataset = \
  ['This movie is SOOOO funny!!!',
  'What a movie! I never',
  'best movie ever!!!!! this movie']

In [37]:
def ngram_extractor(documents, n_grams):
    docs_filtered = [re.sub('[^A-Za-z0-9\s]', '', doc.lower()) for doc in documents]
    docs_splited = [doc.split() for doc in docs_filtered]
    if n_grams == 1:
        return docs_splited
    else:
        return [
            [" ".join(doc_splited[i:i+n_grams]) for i in range(len(doc_splited) - n_grams + 1)]
            for doc_splited in docs_splited
        ]

def count_vectorizer(doc, all_words):
        d = dict()
        for word in all_words:
            d[word] = 0
        for word in doc:
            d[word] += 1
        return d

def bag_of_words(documents, n_grams=1):
    docs_splited = ngram_extractor(documents, n_grams)
    all_words = functools.reduce(lambda a, b: set(a).union(set(b)), docs_splited)
    count_matrix = [count_vectorizer(doc, all_words) for doc in docs_splited]
    count_df = pd.DataFrame(count_matrix)
    return count_df

In [38]:
bag_of_words(mini_dataset, n_grams=1)

Unnamed: 0,i,never,movie,soooo,best,is,ever,funny,a,this,what
0,0,0,1,1,0,1,0,1,0,1,0
1,1,1,1,0,0,0,0,0,1,0,1
2,0,0,2,0,1,0,1,0,0,1,0


In [39]:
bag_of_words(mini_dataset, n_grams=2)

Unnamed: 0,a movie,this movie,best movie,what a,movie ever,i never,movie i,is soooo,ever this,soooo funny,movie is
0,0,1,0,0,0,0,0,1,0,1,1
1,1,0,0,1,0,1,1,0,0,0,0
2,0,1,1,0,1,0,0,0,1,0,0


### [Problem 2] TF-IDF calculation

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = (vect.fit_transform(x_train))
vec = vect.get_feature_names()

vectorizer_test = TfidfVectorizer(stop_words='english',
                                  max_features=5000,
                                  vocabulary=vec)
X_test = vectorizer_test.fit_transform(x_test)
print(X_train.shape, X_test.shape)



(75000, 5000) (25000, 5000)


### [Problem 3] Learning using TF-IDF

In [54]:
X_train_binary = X_train[y_train != 2]
y_train_binary = y_train[y_train != 2]
X_train_binary, y_train_binary

(<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
 	with 1771306 stored elements in Compressed Sparse Row format>,
 array([1, 1, 0, ..., 1, 1, 1]))

In [57]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

model = LinearSVC()
model.fit(X_train_binary, y_train_binary)
y_pred = model.predict(X_test)
print(y_pred)

[1 0 1 ... 0 0 0]


In [106]:
y_test

array([1, 0, 1, ..., 0, 0, 0])

In [58]:
accuracy_score(y_test, y_pred)

0.86048

### [Problem 4] Scratch mounting of TF-IDF

In [88]:
import numpy as np

def tf_idf(documents, n_grams=1, applied_type='standard'):
    count_matrix = bag_of_words(documents, n_grams)
    tf = count_matrix.copy()
    idf = count_matrix.copy()
    if applied_type == 'standard':
        for i in range(len(tf)):
            tf.iloc[i, :] = tf.iloc[i, :] / tf.iloc[i, :].sum()
        for t in idf.columns:
            idf[t][:] = np.log(len(idf.columns) / idf[t].sum())
    elif applied_type == 'sklearn':
        for t in idf.columns:
            idf[t][:] = np.log((1 + len(idf.columns)) / (1 + idf[t].sum())) + 1
    else:
        raise ValueError()
    return tf * idf

In [89]:
tf_idf(mini_dataset, applied_type='standard')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,i,never,movie,soooo,best,is,ever,funny,a,this,what
0,0.0,0.0,0.20232,0.479579,0.0,0.479579,0.0,0.479579,0.0,0.34095,0.0
1,0.479579,0.479579,0.20232,0.0,0.0,0.0,0.0,0.0,0.479579,0.0,0.479579
2,0.0,0.0,0.40464,0.0,0.479579,0.0,0.479579,0.0,0.0,0.34095,0.0


In [90]:
tf_idf(mini_dataset, applied_type='sklearn')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,i,never,movie,soooo,best,is,ever,funny,a,this,what
0,0.0,0.0,1.875469,2.791759,0.0,2.791759,0.0,2.791759,0.0,2.386294,0.0
1,2.791759,2.791759,1.875469,0.0,0.0,0.0,0.0,0.0,2.791759,0.0,2.791759
2,0.0,0.0,3.750937,0.0,2.791759,0.0,2.791759,0.0,0.0,2.386294,0.0


### [Problem 5] Corpus pretreatment

In [94]:
URL_PATTERN = "^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$"

def preprocess(doc):
    doc = re.sub(URL_PATTERN, "", doc)
    doc = re.sub('[^A-Za-z0-9\s]', "", doc)
    doc = doc.lower().split()
    return doc

In [95]:
x_train_processed = [preprocess(doc) for doc in x_train]

In [96]:
x_train[0]

'Full of (then) unknown actors TSF is a great big cuddly romp of a film.<br /><br />The idea of a bunch of bored teenagers ripping off the local sink factory is odd enough, but add in the black humour that Forsyth & Co are so good at and your in for a real treat.<br /><br />The comatose van driver by itself worth seeing, and the canal side chase is just too real to be anything but funny.<br /><br />And for anyone who lived in Glasgow it\'s a great "Oh I know where that is" film.'

In [98]:
x_train_processed[0][:20]

['full',
 'of',
 'then',
 'unknown',
 'actors',
 'tsf',
 'is',
 'a',
 'great',
 'big',
 'cuddly',
 'romp',
 'of',
 'a',
 'filmbr',
 'br',
 'the',
 'idea',
 'of',
 'a']

### [Problem 6] Learning Word2Vec

In [99]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.1.2-cp37-cp37m-win_amd64.whl (24.0 MB)
Collecting Cython==0.29.23
  Downloading Cython-0.29.23-cp37-cp37m-win_amd64.whl (1.6 MB)
Collecting smart-open>=1.8.1
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Installing collected packages: smart-open, Cython, gensim
Successfully installed Cython-0.29.23 gensim-4.1.2 smart-open-5.2.1


In [105]:
from gensim.models import Word2Vec
model = Word2Vec(min_count=1, vector_size=10) 
model.build_vocab(x_train_processed) 
model.train(x_train_processed, total_examples=model.corpus_count, epochs=2)

(26503000, 34961872)