### Doc2vec Model

In [1]:
import numpy as np
import pandas as pd
import gensim
from sklearn.metrics import precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['You', 'like']"
1,"['I', 'agree', 'So', 'stop', 'thinkin', 'ipad'..."
2,"['Kkwhere', 'youhow', 'performed']"
3,"['Moji', 'informed', 'saved', 'lives', 'Thanks']"
4,"['Its', 'okcome', 'home', 'vl', 'nice', 'meet'..."


In [3]:
# Create tagged document vectors for each text message in the training and test datasets

tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train['clean_text'])]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_test['clean_text'])]

In [4]:
tagged_docs_train[:10]

[TaggedDocument(words="['You', 'like']", tags=[0]),
 TaggedDocument(words="['I', 'agree', 'So', 'stop', 'thinkin', 'ipad', 'Can', 'please', 'ask', 'macho', 'question']", tags=[1]),
 TaggedDocument(words="['Kkwhere', 'youhow', 'performed']", tags=[2]),
 TaggedDocument(words="['Moji', 'informed', 'saved', 'lives', 'Thanks']", tags=[3]),
 TaggedDocument(words="['Its', 'okcome', 'home', 'vl', 'nice', 'meet', 'v', 'chat']", tags=[4]),
 TaggedDocument(words="['Evening', 'v', 'good', 'somewhat', 'event', 'laden', 'Will', 'fill', 'dont', 'worry', 'Û', 'Head', 'ok', 'throat', 'wrecked', 'See', 'six']", tags=[5]),
 TaggedDocument(words="['My', 'love', 'How', 'come', 'took', 'long', 'leave', 'Zahers', 'I', 'got', 'words', 'ym', 'happy', 'see', 'sad', 'left', 'I', 'miss']", tags=[6]),
 TaggedDocument(words="['Gettin', 'rdy', 'ship', 'comp']", tags=[7]),
 TaggedDocument(words="['Sorry', 'da', 'thangam', 'sorry', 'held', 'prasad']", tags=[8]),
 TaggedDocument(words="['You', 'supposed', 'wake', 'ME',

In [5]:
# Training a basic doc2vec model

d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                 vector_size=100,
                                 window=5,
                                 min_count=2)

In [6]:
# Infer the vectors to be used in training and testing

train_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_test]

### Building Model

In [7]:
rf = RandomForestClassifier()
rf_model = rf.fit(train_vectors, y_train.values.ravel())

In [8]:
y_pred = rf_model.predict(test_vectors)

In [9]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print("Precision: {} \nRecall: {} \nAccuracy: {}".format(precision,
                                                        recall,
                                                        (y_test['label'] == y_pred).sum() / len(y_pred)))

Precision: 0.8933333333333333 
Recall: 0.475177304964539 
Accuracy: 0.9264573991031391
