# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['ive', 'reached', 'sch', 'already']"
1,"['wan2', 'win', 'meetgreet', 'westlife', '4', ..."
2,"['oh', 'right', 'ok', 'ill', 'make', 'sure', '..."
3,"['uncle', 'abbey', 'happy', 'new', 'year', 'ab..."
4,"['hard', 'true', 'much', 'show', 'amp', 'expre..."


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'ive': 3991,
 'reached': 5976,
 'sch': 6341,
 'already': 1005,
 'wan2': 7861,
 'win': 8017,
 'meetgreet': 4705,
 'westlife': 7963,
 'm8': 4574,
 'currently': 2228,
 'tour': 7474,
 '1unbreakable': 369,
 '2untamed': 456,
 '3unkempt': 518,
 'text': 7240,
 '12': 273,
 '83049': 725,
 'cost': 2136,
 '50p': 593,
 'std': 6900,
 'oh': 5266,
 'right': 6175,
 'ok': 5273,
 'ill': 3818,
 'make': 4611,
 'sure': 7067,
 'loads': 4438,
 'work': 8092,
 'day': 2300,
 'got': 3357,
 'really': 5994,
 'nasty': 5011,
 'cough': 2144,
 'today': 7403,
 'dry': 2607,
 'shot': 6528,
 'help': 3566,
 'uncle': 7609,
 'abbey': 816,
 'happy': 3503,
 'new': 5070,
 'year': 8242,
 'abiola': 823,
 'hard': 3504,
 'true': 7524,
 'much': 4947,
 'show': 6536,
 'amp': 1028,
 'express': 2865,
 'love': 4503,
 'someonethat': 6716,
 'hurt': 3766,
 'leave': 4329,
 'get': 3263,
 'seperated': 6429,
 'ud': 7587,
 'evening': 2796,
 'orange': 5345,
 'brings': 1597,
 'ringtones': 6186,
 'time': 7363,
 'chart': 1846,
 'heroes': 3583,
 'fre

In [4]:
# How are these vectors stored?
X_test_vect[0]

<1x8342 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.794 / Accuracy: 0.971
