## TFIDF Vectorizer and Random Forest Model

In [1]:
# Importing all the required libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score

# Let long token lists show up to 100 characters per cell when frames are displayed
pd.options.display.max_colwidth = 100

In [2]:
# Read the pre-split feature and label tables from the working directory

X_train, X_test = pd.read_csv('X_train.csv'), pd.read_csv('X_test.csv')
y_train, y_test = pd.read_csv('y_train.csv'), pd.read_csv('y_test.csv')

# Preview the first few training rows
X_train.head()

Unnamed: 0,clean_text
0,"['You', 'like']"
1,"['I', 'agree', 'So', 'stop', 'thinkin', 'ipad', 'Can', 'please', 'ask', 'macho', 'question']"
2,"['Kkwhere', 'youhow', 'performed']"
3,"['Moji', 'informed', 'saved', 'lives', 'Thanks']"
4,"['Its', 'okcome', 'home', 'vl', 'nice', 'meet', 'v', 'chat']"


## Vectorization

In [3]:
# `clean_text` holds stringified token lists (e.g. "['You', 'like']"). The
# default TfidfVectorizer analyzer lower-cases the text and keeps only word
# tokens of two or more characters, so the brackets, quotes and commas are
# discarded -- and so are one-letter tokens such as 'I' or 'v'.
tfidf_vect = TfidfVectorizer()

# Learn the vocabulary / IDF weights and vectorize the training text in a
# single pass (same result as fit() followed by transform(), without
# analysing the training corpus twice).
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])

# The test set is only transformed: it must reuse the vocabulary and IDF
# weights learned from the training data.
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [4]:
# Inspect the fitted vocabulary: each lower-cased token maps to its column index in the TF-IDF matrix

tfidf_vect.vocabulary_

{'you': 8347,
 'like': 4432,
 'agree': 955,
 'so': 6736,
 'stop': 6973,
 'thinkin': 7360,
 'ipad': 3996,
 'can': 1769,
 'please': 5660,
 'ask': 1200,
 'macho': 4628,
 'question': 5950,
 'kkwhere': 4260,
 'youhow': 8353,
 'performed': 5558,
 'moji': 4909,
 'informed': 3934,
 'saved': 6367,
 'lives': 4476,
 'thanks': 7302,
 'its': 4042,
 'okcome': 5324,
 'home': 3707,
 'vl': 7860,
 'nice': 5138,
 'meet': 4757,
 'chat': 1898,
 'evening': 2844,
 'good': 3387,
 'somewhat': 6769,
 'event': 2846,
 'laden': 4308,
 'will': 8086,
 'fill': 3026,
 'dont': 2586,
 'worry': 8182,
 'head': 3586,
 'ok': 5322,
 'throat': 7389,
 'wrecked': 8200,
 'see': 6430,
 'six': 6652,
 'my': 5031,
 'love': 4553,
 'how': 3759,
 'come': 2067,
 'took': 7502,
 'long': 4513,
 'leave': 4379,
 'zahers': 8390,
 'got': 3411,
 'words': 8164,
 'ym': 8339,
 'happy': 3555,
 'sad': 6317,
 'left': 4384,
 'miss': 4861,
 'gettin': 3320,
 'rdy': 6016,
 'ship': 6544,
 'comp': 2079,
 'sorry': 6791,
 'da': 2291,
 'thangam': 7299,
 'held

In [5]:
# Inspecting the first test document's TF-IDF row (a 1 x vocabulary-size sparse matrix)

X_test_vect[0]

<1x8415 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [6]:
# Densify that same row to view it as a regular NumPy array (most entries are zero)
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

## Building the Model

In [7]:
# Random forests are stochastic; pin the seed so that re-running the notebook
# reproduces the same model and the same metrics.
rf = RandomForestClassifier(random_state=42)

# ravel() flattens the label frame's values into the 1-D array that fit()
# expects (assumes y_train holds a single label column -- TODO confirm).
# fit() returns the estimator itself, so rf_model refers to the same object as rf.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [8]:
# Predict a label for every row of the vectorized test set
y_pred = rf_model.predict(X_test_vect)

In [9]:
# Evaluating the Model

# Use the label column as one 1-D Series for every metric, instead of passing
# the whole DataFrame to some metrics and the column to others.
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# Accuracy = share of test rows whose predicted label equals the true label.
accuracy = (y_true == y_pred).mean()

# Round all three metrics the same way so the report is consistent.
print("Precision: {} -- Recall: {} -- Accuracy: {}".format(round(precision, 3),
                                                          round(recall, 3),
                                                          round(accuracy, 3)))

Precision: 1.0 -- Recall: 0.872 -- Accuracy: 0.984
