# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [3]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  after removing the cwd from sys.path.
  


In [4]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [5]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[ 0.00386943,  0.00391001, -0.00428015, ..., -0.00496123,
        -0.003332  , -0.00262884],
       [ 0.00386943,  0.00391001, -0.00428015, ..., -0.00496123,
        -0.003332  , -0.00262884],
       [ 0.00386943,  0.00391001, -0.00428015, ..., -0.00496123,
        -0.003332  , -0.00262884],
       ...,
       [-0.00244511,  0.00364336,  0.00209125, ...,  0.00168716,
         0.00290445,  0.00245746],
       [-0.00244511,  0.00364336,  0.00209125, ...,  0.00168716,
         0.00290445,  0.00245746],
       [-0.00244511,  0.00364336,  0.00209125, ...,  0.00168716,
         0.00290445,  0.00245746]], dtype=float32)

In [6]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([ 8.0709215e-05,  3.7500190e-03, -4.5730913e-04, -3.0654238e-03,
       -2.5739593e-03,  2.4147021e-05,  1.7513217e-03,  2.8955305e-03,
       -1.9904219e-03,  6.4477354e-05,  3.0167524e-03, -1.3283795e-03,
        7.6610292e-04, -9.5437100e-04,  2.2420948e-03, -2.4105597e-03,
       -2.0040127e-03,  2.6805636e-03, -2.1485405e-03, -4.5694304e-03,
       -9.7207294e-06,  3.0931921e-04,  1.5777916e-03,  1.7524722e-03,
       -2.6262908e-03, -2.0592778e-03,  6.7556270e-05,  1.0915068e-03,
       -4.1276671e-04,  4.1850368e-03,  7.7684701e-04, -2.6342305e-03,
       -2.4871805e-03, -1.6799330e-03,  1.3087607e-03, -3.2300602e-03,
        7.9952169e-04, -1.7930425e-03,  8.6586049e-04, -3.5678450e-04,
       -1.4278393e-03, -1.3900355e-03, -2.2837145e-03,  5.7547842e-04,
        3.6918945e-03, -3.7255639e-03, -3.5738098e-04,  3.6798113e-03,
       -5.6050584e-04, -1.2699770e-03,  1.7114364e-03, -5.2409462e-04,
        1.1174598e-03, -3.6701427e-03, -1.3466806e-03, -1.6766565e-03,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [7]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())



In [8]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.647 / Recall: 0.219 / Accuracy: 0.878
