## Word2vec Model

In [12]:
import gensim
import numpy as np
import pandas as pd

pd.set_option('display.max_colwidth', 100)

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['You', 'like']"
1,"['I', 'agree', 'So', 'stop', 'thinkin', 'ipad', 'Can', 'please', 'ask', 'macho', 'question']"
2,"['Kkwhere', 'youhow', 'performed']"
3,"['Moji', 'informed', 'saved', 'lives', 'Thanks']"
4,"['Its', 'okcome', 'home', 'vl', 'nice', 'meet', 'v', 'chat']"


### Creating Word2vec Vectors

In [3]:
# Training a basic word2vec Model

w2v_model = gensim.models.Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [4]:
# Replace the word in text message with the learned word vector
# Index_to_key is an arttribute which shows the words model has learned

words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_train['clean_text']])

X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words]) for ls in X_test['clean_text']])

In [5]:
# Average the word vectors for each sentence (and assign vector of zeros if the model did not learn anything during training)

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis = 0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis = 0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype = float))

In [6]:
X_train_vect[0]

array([[-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419371e-03,
         7.4669169e-03, -6.1676763e-03,  1.1056137e-03,  6.0472824e-03,
        -2.8400517e-03, -6.1735227e-03, -4.1022300e-04, -8.3689503e-03,
        -5.6000138e-03,  7.1045374e-03,  3.3525396e-03,  7.2256685e-03,
         6.8002464e-03,  7.5307419e-03, -3.7891555e-03, -5.6180713e-04,
         2.3483753e-03, -4.5190332e-03,  8.3887316e-03, -9.8581649e-03,
         6.7646410e-03,  2.9144168e-03, -4.9328329e-03,  4.3981862e-03,
        -1.7395759e-03,  6.7113829e-03,  9.9648498e-03, -4.3624449e-03,
        -5.9933902e-04, -5.6956387e-03,  3.8508223e-03,  2.7866268e-03,
         6.8910765e-03,  6.1010956e-03,  9.5384959e-03,  9.2734173e-03,
         7.8980681e-03, -6.9895051e-03, -9.1558648e-03, -3.5575390e-04,
        -3.0998420e-03,  7.8943158e-03,  5.9385728e-03, -1.5456629e-03,
         1.5109634e-03,  1.7900396e-03,  7.8175711e-03, -9.5101884e-03,
        -2.0553112e-04,  3.4691954e-03, -9.3897345e-04,  8.38177

In [7]:
X_train_vect_avg[0]

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419371e-03,
        7.4669169e-03, -6.1676763e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400517e-03, -6.1735227e-03, -4.1022300e-04, -8.3689503e-03,
       -5.6000138e-03,  7.1045374e-03,  3.3525396e-03,  7.2256685e-03,
        6.8002464e-03,  7.5307419e-03, -3.7891555e-03, -5.6180713e-04,
        2.3483753e-03, -4.5190332e-03,  8.3887316e-03, -9.8581649e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328329e-03,  4.3981862e-03,
       -1.7395759e-03,  6.7113829e-03,  9.9648498e-03, -4.3624449e-03,
       -5.9933902e-04, -5.6956387e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384959e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895051e-03, -9.1558648e-03, -3.5575390e-04,
       -3.0998420e-03,  7.8943158e-03,  5.9385728e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900396e-03,  7.8175711e-03, -9.5101884e-03,
       -2.0553112e-04,  3.4691954e-03, -9.3897345e-04,  8.3817719e-03,
      

### Building the Model

In [9]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [11]:
y_pred = rf_model.predict(X_test_vect_avg)

In [14]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precision: {} \nRecall: {} \nAccuracy: {}".format(precision,
                                                        recall, 
                                                        (y_test['label'] == y_pred).sum() / len(y_pred)))

Precision: 0.5454545454545454 
Recall: 0.0851063829787234 
Accuracy: 0.8753363228699551
