# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [21]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Features and labels were split and saved ahead of time; each CSV is read
# back into its own DataFrame.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Only the final expression of a cell is rendered, so the cell ends with the
# single view we want to inspect: the raw values, where every message is
# stored as the string form of a token list.
X_train.values

array([["['wake', 'ltgt', 'morning']"],
       ["['cant', 'u', 'try', 'new', 'invention', 'flyim', 'joking']"],
       ["['hi', 'hope', 'u', 'get', 'txtjourney', 'hasnt', 'gdnow', '50', 'mins', 'late', 'think']"],
       ...,
       ["['great', 'comedycant', 'stop', 'laughing', 'da']"],
       ["['pick', 'drop', 'carso', 'problem']"],
       ["['say', 'slowly', 'godi', 'love', 'amp', 'need', 'youclean', 'heart', 'bloodsend', 'ten', 'special', 'people', 'amp', 'u', 'c', 'miracle', 'tomorrow', 'itplspls']"]],
      dtype=object)

### Create TF-IDF Vectors

In [18]:
# Confirm how a single cleaned message is stored (row label 0 of 'clean_text')
type(X_train.loc[0, 'clean_text'])

str

In [2]:
# Learn the vocabulary and IDF weights from the training messages only and
# vectorize them in the same pass (fit_transform gives the same result as
# fit + transform without tokenizing the training text twice). The test
# messages are then projected onto that already-fitted vocabulary.
# NOTE: 'clean_text' holds stringified token lists (e.g. "['wake', 'ltgt']").
# The vectorizer's default tokenization still recovers the words, but
# one-character tokens such as 'u' do not make it into the vocabulary.
tfidf_vect = TfidfVectorizer()
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
# vocabulary_ is a dict mapping each learned term to an integer index
# (the position of that term's column in the TF-IDF matrix).
tfidf_vect.vocabulary_

{'wake': 7766,
 'ltgt': 4505,
 'morning': 4866,
 'cant': 1718,
 'try': 7462,
 'new': 5030,
 'invention': 3920,
 'flyim': 3049,
 'joking': 4054,
 'hi': 3582,
 'hope': 3662,
 'get': 3254,
 'txtjourney': 7504,
 'hasnt': 3499,
 'gdnow': 3231,
 '50': 577,
 'mins': 4767,
 'late': 4263,
 'think': 7240,
 'thats': 7204,
 'good': 3328,
 'need': 5003,
 'drugs': 2592,
 'congratulations': 2064,
 'thanks': 7193,
 'friend': 3143,
 '2000': 370,
 'xmas': 8125,
 'prize': 5743,
 'claim': 1914,
 'easy': 2638,
 'call': 1674,
 '08718726978': 130,
 '10p': 258,
 'per': 5458,
 'minute': 4773,
 'btnationalrate': 1602,
 'job': 4037,
 'like': 4354,
 'entrepreneurs': 2745,
 'uhhhhrmm': 7522,
 'isnt': 3952,
 'tb': 7109,
 'test': 7169,
 'bad': 1250,
 'youre': 8213,
 'sick': 6477,
 'ill': 3799,
 'evening': 2789,
 'ideas': 3781,
 'aiyo': 944,
 'always': 987,
 'ex': 2816,
 'one': 5244,
 'dunno': 2614,
 'abt': 813,
 'mei': 4690,
 'reply': 6044,
 'first': 3008,
 'time': 7297,
 'fast': 2911,
 'lucky': 4512,
 'workin': 801

In [4]:
# How are these vectors stored?
# Look at the first vectorized test message: a single row of a sparse matrix
# that keeps only the non-zero TF-IDF weights.
X_test_vect[0]

<1x8262 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
# Take the first test row, then expand it into a dense array
first_test_row = X_test_vect[0]
first_test_row.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest is randomized (bootstrap samples, feature sub-sampling);
# pinning random_state makes the fitted model and the reported metrics
# repeatable from one run to the next.
rf = RandomForestClassifier(random_state=42)
# y_train was loaded as a DataFrame; .values.ravel() flattens it to the 1-D
# label array passed to fit().
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data
# (X_test_vect was produced by the vectorizer fitted on the training
# messages, so it shares the training matrix's vocabulary/columns)
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Score the hold-out predictions: precision, recall and plain accuracy
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy = number of predictions equal to the true label / number of predictions
accuracy = (y_pred==y_test['label']).sum()/len(y_pred)

report_template = 'Precision: {} / Recall: {} / Accuracy: {}'
print(report_template.format(round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.827 / Accuracy: 0.976
