<a href="https://colab.research.google.com/github/davidclizbe/datascience/blob/master/ClizbeFeatureEngineering3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Train your own word2vec representations as we did in the first example in the checkpoint, but this time experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [None]:
pip install gensim



In [None]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell output short, but it also hides deprecation notices.
warnings.filterwarnings("ignore")

# Fetch the NLTK Gutenberg corpus; the novels are read from it further down.
nltk.download('gutenberg')
# Download the English spaCy model through the shell; the later
# spacy.load('en') call depends on this step having run.
!python -m spacy download en

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
# utility function for standard text cleaning
def text_cleaner(text):
    """Remove markup and numbers that get in the way of sentence parsing.

    Steps, applied in order:
      1. replace every double dash '--' with a space,
      2. delete bracketed spans such as '[Illustration]' (non-greedy, and
         limited to a single line because '.' does not match a newline),
      3. replace standalone integers/decimals (optionally preceded by
         whitespace and a minus sign) with a space,
      4. collapse all runs of whitespace into single spaces and trim the ends.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string: the same pattern written as a plain string literal relies on
    # the invalid escape sequences '\[' and '\]'.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    return ' '.join(text.split())

In [None]:
# Pull the raw text of both novels out of the NLTK Gutenberg corpus.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# Each book marks its chapters differently, so each gets its own pattern;
# the heading-free text then goes straight through the shared cleaner.
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))

In [None]:
# Parse the cleaned novels with spaCy. This can take a bit.
# The model is loaded by its package name rather than through the 'en'
# shortcut link: the package name is valid wherever the model is installed,
# whereas shortcut links are not available in newer spaCy releases.
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [None]:
# Pair every spaCy sentence span with the name of its author.
alice_sents = [[span, "Carroll"] for span in alice_doc.sents]
persuasion_sents = [[span, "Austen"] for span in persuasion_doc.sents]

# Stack both novels (Carroll first, then Austen) into a single data frame.
labelled_rows = alice_sents + persuasion_sents
sentences = pd.DataFrame(labelled_rows, columns=["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [None]:
# Get rid of stop words and punctuation, and lemmatize the remaining tokens.
def _to_lemmas(sentence):
    """Return the lemmas of one spaCy sentence, skipping punctuation and stop words.

    A value that is already a list is returned unchanged. Without this guard a
    second run of the cell would iterate over plain strings and fail with an
    AttributeError when it looks up `is_punct` on them.
    """
    if isinstance(sentence, list):
        return sentence
    return [token.lemma_ for token in sentence
            if not token.is_punct and not token.is_stop]


# Build the whole column in one go and align it on the frame's own index,
# instead of writing list values cell by cell through `.loc[i, ...]`, which
# treats the loop counter as an index label.
sentences["text"] = pd.Series(
    [_to_lemmas(sentence) for sentence in sentences["text"]],
    index=sentences.index,
)

AttributeError: ignored

In [None]:
# Train word2vec on the lemmatized sentences.
# gensim 4 renamed the `size` argument to `vector_size`; pick whichever keyword
# the installed release understands so the cell runs on both 3.x and 4.x.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,      # number of training threads
    min_count=1,    # keep every lemma, even ones that occur only once
    window=6,       # context window size
    sg=0,           # 0 selects CBOW, 1 would select skip-gram
    sample=1e-3,    # down-sampling threshold for frequent words
    hs=1,           # 1 turns on hierarchical softmax
    **{_dim_kwarg: 100}  # dimensionality of the word vectors
)

In [None]:
# Sanity-check the trained vectors. The queries go through `model.wv` (the
# KeyedVectors object): the same methods called directly on the model are
# deprecated in gensim 3.x and no longer exist in gensim 4.
word_vectors = model.wv
print(word_vectors.most_similar(positive=['lady', 'man'], negative=['woman'], topn=5))
print(word_vectors.doesnt_match("dad dinner mom aunt uncle".split()))
print(word_vectors.similarity('woman', 'man'))
print(word_vectors.similarity('horse', 'cat'))

[('head', 0.9982106685638428), ('sort', 0.9980904459953308), ('meet', 0.9976276755332947), ('aunt', 0.997494637966156), ('handsome', 0.9972043633460999)]
dinner
0.99766195
0.90391624


In [None]:
# Represent every sentence by the mean of its word vectors.
# The width comes from the trained model instead of a hard-coded 100, so the
# cell keeps working when the vector size hyperparameter is changed.
n_dims = model.wv.vector_size
word2vec_arr = np.full((sentences.shape[0], n_dims), np.nan)

for i, sentence in enumerate(sentences["text"]):
    # A sentence with no lemmas left keeps its NaN row and is dropped below.
    if len(sentence) > 0:
        word2vec_arr[i, :] = np.mean([model.wv[lemma] for lemma in sentence], axis=0)

# Give the vectors the same index as `sentences` so that the column-wise
# concat pairs each vector with its own sentence even if the index has gaps.
word2vec_arr = pd.DataFrame(word2vec_arr, index=sentences.index)
sentences = pd.concat([sentences[["author", "text"]], word2vec_arr], axis=1)
# Drop sentences without a vector and renumber the rows, so later cells that
# build a fresh 0..n-1 frame stay aligned with this one.
sentences = sentences.dropna().reset_index(drop=True)

sentences.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,Carroll,"[Alice, begin, tired, sit, sister, bank, have,...",0.156557,0.504342,-0.014885,0.31283,0.135841,0.052049,-0.431068,0.028523,0.272996,0.282735,-0.37808,0.067731,-0.031333,-0.053178,-0.099557,-0.107361,-0.398297,0.211715,0.36761,0.073099,-0.284089,0.439602,-0.084738,0.176117,0.003873,-0.007618,-0.475505,0.095376,-0.110146,-0.442957,-0.297493,0.045424,0.030883,0.215378,-0.294835,-0.11627,0.095659,-0.417053,...,0.507816,-0.240877,0.608371,-0.363942,0.272643,-0.150568,0.148696,0.008605,0.048195,-0.160269,0.475473,0.109559,0.313951,-0.166453,-0.134322,-0.15902,0.284053,0.159858,-0.301681,-0.053141,0.371239,-0.263162,-0.173714,0.017265,0.316261,-0.142782,-0.111847,-0.524318,-0.032784,0.370151,0.134086,0.125074,0.032564,0.191429,-0.062701,0.006702,-0.090056,0.283728,-0.000987,0.210049
1,Carroll,"[consider, mind, hot, day, feel, sleepy, stupi...",0.116939,0.401289,0.008777,0.2621,0.137854,0.043824,-0.355057,0.034746,0.207341,0.213598,-0.324004,0.044033,-0.039625,-0.060718,-0.087796,-0.068523,-0.322759,0.183491,0.29984,0.05252,-0.243587,0.374562,-0.066973,0.161563,0.025756,-0.027119,-0.354464,0.085367,-0.093945,-0.35864,-0.239173,0.050123,0.039068,0.164698,-0.233838,-0.093474,0.063087,-0.331597,...,0.422756,-0.197529,0.478835,-0.287188,0.205328,-0.121597,0.126237,-0.003829,0.03256,-0.122114,0.40367,0.089553,0.273978,-0.104055,-0.095724,-0.144091,0.234705,0.116129,-0.240797,-0.034599,0.301703,-0.231728,-0.144601,0.023942,0.259804,-0.134262,-0.073277,-0.417739,-0.039294,0.30503,0.10721,0.119562,0.018667,0.169521,-0.040365,0.002672,-0.089513,0.224778,0.009853,0.154534
2,Carroll,"[remarkable, Alice, think, way, hear, Rabbit, ...",0.167634,0.576445,-0.023443,0.350662,0.156439,0.046915,-0.511667,0.015956,0.297355,0.32501,-0.433335,0.081311,-0.05128,-0.058653,-0.120802,-0.133634,-0.473012,0.226878,0.415087,0.063371,-0.317654,0.513487,-0.121112,0.207148,0.011231,-0.004099,-0.545946,0.111926,-0.133873,-0.509538,-0.349998,0.055461,0.052263,0.24919,-0.344722,-0.113718,0.115739,-0.490992,...,0.609013,-0.2722,0.697808,-0.431249,0.324589,-0.165976,0.177741,0.002685,0.063447,-0.193253,0.559225,0.154242,0.357273,-0.176755,-0.141429,-0.18765,0.319641,0.187974,-0.345561,-0.057221,0.437245,-0.30359,-0.212949,0.025946,0.366633,-0.166108,-0.135306,-0.611266,-0.018768,0.437321,0.136005,0.135738,0.033673,0.192804,-0.053868,0.004824,-0.103954,0.334612,0.01602,0.255659
3,Carroll,"[oh, dear]",0.117552,0.453083,-0.012553,0.288845,0.145011,0.037289,-0.414303,0.021134,0.216287,0.246997,-0.351296,0.038186,-0.026018,-0.058227,-0.084984,-0.095092,-0.370731,0.177488,0.337987,0.080276,-0.228769,0.39213,-0.075595,0.185601,0.022833,-0.019594,-0.415328,0.114959,-0.093607,-0.400744,-0.266566,0.041614,0.060701,0.230814,-0.299062,-0.11834,0.096874,-0.422802,...,0.537509,-0.253867,0.620693,-0.392961,0.276104,-0.159166,0.176203,-0.013564,0.041083,-0.170205,0.52704,0.14876,0.335442,-0.142431,-0.126843,-0.188578,0.297223,0.149271,-0.295473,-0.036655,0.38908,-0.299601,-0.207005,0.03314,0.315626,-0.152214,-0.097407,-0.526059,-0.04396,0.388996,0.134394,0.144853,0.039601,0.194676,-0.072663,-0.015563,-0.118429,0.273512,0.019187,0.218594
4,Carroll,"[shall, late]",0.132013,0.389604,-0.01497,0.223953,0.091411,0.036065,-0.31899,0.005912,0.221112,0.2272,-0.282236,0.065079,-0.031074,-0.027161,-0.079436,-0.089805,-0.303489,0.160611,0.281339,0.032045,-0.227189,0.342244,-0.069774,0.126075,-0.023212,0.014454,-0.387502,0.057436,-0.088997,-0.336785,-0.240506,0.031752,0.001222,0.15688,-0.220549,-0.083942,0.072426,-0.320512,...,0.37972,-0.172435,0.444954,-0.267936,0.219412,-0.106071,0.097951,0.015859,0.042354,-0.122185,0.336965,0.073037,0.22974,-0.135299,-0.095866,-0.113266,0.202954,0.134579,-0.235676,-0.044369,0.281121,-0.177154,-0.113152,0.003199,0.237271,-0.101208,-0.114352,-0.401953,-0.013157,0.273492,0.097854,0.076843,0.018238,0.122903,-0.038498,0.014136,-0.041507,0.224709,0.005873,0.165873


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Target: the author label. Features: every remaining (word-vector) column.
Y = sentences['author']
# `columns=` is spelled out because passing the axis positionally is not
# accepted by newer pandas releases.
X = np.array(sentences.drop(columns=['text', 'author']))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models. The tree ensembles are seeded so the reported scores can be reproduced.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=123)
gbc = GradientBoostingClassifier(random_state=123)

# Fit each model and report its accuracy on the train and test splits.
for model_name, classifier in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    classifier.fit(X_train, y_train)
    print("----------------------{} Scores----------------------".format(model_name))
    print('Training set score:', classifier.score(X_train, y_train))
    print('\nTest set score:', classifier.score(X_test, y_test))

----------------------Logistic Regression Scores----------------------
Training set score: 0.750076103500761

Test set score: 0.7611872146118721
----------------------Random Forest Scores----------------------
Training set score: 0.9917808219178083

Test set score: 0.8068493150684931
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8864535768645357

Test set score: 0.8027397260273973


In [None]:
# Load Google's pre-trained Word2Vec vectors (binary word2vec format)
# straight from the remote archive.
GOOGLE_NEWS_VECTORS_URL = 'https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz'
model_pretrained = gensim.models.KeyedVectors.load_word2vec_format(
    GOOGLE_NEWS_VECTORS_URL,
    binary=True,
)

In [None]:
# Represent every sentence by the mean of its pre-trained word vectors.
n_dims = model_pretrained.vector_size
word2vec_arr = np.zeros((sentences.shape[0], n_dims))

for i, sentence in enumerate(sentences["text"]):
    try:
        word2vec_arr[i, :] = np.mean([model_pretrained[lemma] for lemma in sentence], axis=0)
    except KeyError:
        # At least one lemma is missing from the pre-trained vocabulary:
        # mark the whole sentence as NaN so that it is dropped below.
        word2vec_arr[i, :] = np.nan

# `sentences` may carry a gapped index (rows were dropped earlier), while a
# frame built from a bare array would be numbered 0..n-1. Reusing the index of
# `sentences` keeps each vector attached to its own sentence in the concat.
word2vec_arr = pd.DataFrame(word2vec_arr, index=sentences.index)
sentences = pd.concat([sentences[["author", "text"]], word2vec_arr], axis=1)
sentences = sentences.dropna()

print("Shape of the dataset: {}".format(sentences.shape))
sentences.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Target: the author label. Features: every remaining (word-vector) column.
Y = sentences['author']
# `columns=` is spelled out because passing the axis positionally is not
# accepted by newer pandas releases.
X = np.array(sentences.drop(columns=['text', 'author']))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models. The tree ensembles are seeded so the reported scores can be reproduced.
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=123)
gbc = GradientBoostingClassifier(random_state=123)

# Fit each model and report its accuracy on the train and test splits.
for model_name, classifier in (
    ("Logistic Regression", lr),
    ("Random Forest", rfc),
    ("Gradient Boosting", gbc),
):
    classifier.fit(X_train, y_train)
    print("----------------------{} Scores----------------------".format(model_name))
    print('Training set score:', classifier.score(X_train, y_train))
    print('\nTest set score:', classifier.score(X_test, y_test))