## Train word2vect model


In [69]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [70]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [71]:
messages['label']=messages['label'].map({'ham':1,'spam':0})

In [72]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

In [73]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [74]:
#  it represents all of the words that our Word2Vec model learned a vector for. 
# Or put another way, it's all of the words that appeared in the training data at least twice. So you can exp
w2v_model.wv.index_to_key 

['you',
 'to',
 'the',
 'and',
 'in',
 'is',
 'me',
 'it',
 'my',
 'your',
 'for',
 'of',
 'call',
 'that',
 'have',
 'on',
 'are',
 'now',
 'can',
 'so',
 'not',
 'or',
 'but',
 'we',
 'do',
 'at',
 'ur',
 'get',
 'be',
 'if',
 'no',
 'just',
 'with',
 'will',
 'lt',
 'gt',
 'this',
 'up',
 'how',
 'ok',
 'when',
 'from',
 'go',
 'what',
 'free',
 'out',
 'll',
 'all',
 'know',
 'like',
 'then',
 'day',
 'got',
 'was',
 'good',
 'am',
 'come',
 'there',
 'he',
 'time',
 'only',
 'its',
 'love',
 'send',
 'txt',
 'text',
 'as',
 'by',
 'want',
 'going',
 'home',
 'don',
 'stop',
 'about',
 'one',
 'see',
 'need',
 'she',
 'lor',
 'today',
 'sorry',
 'still',
 'our',
 'reply',
 'tell',
 'back',
 'they',
 'mobile',
 'da',
 'later',
 'dont',
 'think',
 'take',
 'please',
 'did',
 'new',
 'any',
 'pls',
 'where',
 'hi',
 'week',
 'been',
 'ì_',
 'some',
 'phone',
 'an',
 'her',
 'much',
 'who',
 're',
 'here',
 'night',
 'more',
 'claim',
 'has',
 'oh',
 'well',
 'msg',
 'dear',
 'hey',
 '

In [75]:
# Find the most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('stuff', 0.995053231716156),
 ('always', 0.9949039220809937),
 ('night', 0.9946726560592651),
 ('may', 0.9946215152740479),
 ('an', 0.9946091175079346),
 ('more', 0.9946038722991943),
 ('many', 0.994602382183075),
 ('over', 0.9945899844169617),
 ('dun', 0.9945518970489502),
 ('would', 0.9945430159568787)]

In [76]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  """
  import sys


In [77]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(X_train_vect):
    print(len(X_train.iloc[i]), len(v))

17 17
4 3
13 13
12 11
4 4
25 25
9 7
6 6
8 6
6 6
12 12
6 6
13 12
6 6
4 4
16 16
9 9
7 6
4 4
3 3
6 6
21 21
7 7
7 7
12 9
5 5
17 13
17 17
16 16
13 12
7 7
16 12
7 7
22 20
19 16
10 10
6 6
21 21
5 4
20 19
2 2
28 25
26 26
15 15
22 16
22 19
9 8
7 7
7 7
26 26
20 17
16 16
23 22
7 7
5 5
28 28
21 20
31 31
14 14
24 23
16 16
7 7
10 10
5 4
24 22
8 8
7 7
4 4
3 3
9 9
26 26
5 4
9 9
34 33
6 6
15 15
25 25
18 16
19 16
4 3
11 10
25 25
21 21
17 17
30 30
4 4
13 12
7 7
9 9
10 10
14 12
23 22
10 9
2 2
6 6
10 10
21 19
21 21
13 13
7 7
17 14
21 20
25 25
16 14
14 14
17 16
3 3
11 11
6 6
1 1
10 10
8 7
5 5
26 24
5 5
22 20
12 12
5 5
5 5
6 5
14 14
10 9
40 35
16 16
8 8
7 7
15 15
5 4
23 22
11 11
4 3
13 13
11 11
9 8
24 23
11 11
4 4
21 21
4 3
17 16
13 13
8 7
22 20
24 24
8 8
24 21
11 11
10 9
3 3
4 4
17 16
9 9
9 9
8 8
25 25
8 8
17 17
21 21
8 7
5 5
15 15
10 9
12 12
13 11
5 5
5 3
23 23
20 20
6 6
7 7
7 7
29 28
8 8
4 4
6 5
4 4
7 7
5 5
10 9
8 8
14 14
9 9
32 31
26 26
5 5
18 17
8 7
18 17
22 22
24 23
9 8
9 9
19 19
2 2
6 6
32 29
30 30
5 

10 9
7 7
29 28
11 11
19 19
13 12
12 12
7 6
27 25
18 17
26 25
7 7
7 6
3 3
14 14
23 23
4 4
3 2
21 9
5 5
13 13
8 8
21 20
7 5
26 25
7 5
12 12
14 14
9 8
7 7
21 20
7 7
7 6
13 13
5 5
14 14
6 6
22 22
16 16
12 12
31 29
20 20
6 6
28 28
7 7
10 9
10 8
11 11
11 10
17 15
19 18
7 7
5 5
8 8
12 12
27 26
3 3
11 10
9 9
34 31
20 20
29 29
15 15
22 22
24 23
13 13
23 23
29 27
18 18
7 6
15 15
30 28
4 4
22 21
23 22
15 15
10 10
5 5
34 33
16 16
18 18
5 4
5 3
7 7
16 12
6 5
9 9
17 16
6 6
24 20
1 0
6 5
12 11
2 2
7 7
12 11
10 8
17 16
27 27
35 35
9 9
4 3
12 12
7 7
8 8
27 27
8 6
25 25
12 12
26 23
19 14
4 4
10 8
5 5
4 4
5 5
17 17
5 5
7 7
26 24
22 21
5 4
26 22
12 12
7 6
7 7
14 14
9 9
19 17
31 28
5 5
29 29
21 20
20 20
13 11
24 24
18 17
23 20
27 25
5 5
17 17
12 12
13 13
16 14
1 1
23 23
11 9
14 11
6 6
15 14
6 6
13 12
4 4
6 6
4 3
20 19
15 15
23 21
25 25
8 8
21 21
9 9
9 8
23 19
28 27
8 8
30 30
13 13
25 16
36 34
9 9
13 13
22 18
21 19
19 19
13 13
21 21
3 2
17 16
18 18
9 9
25 24
8 8
5 5
4 4
29 29
15 10
5 4
13 13
9 9
14 14
10 10

4 4
6 6
3 2
8 7
9 9
7 7
16 14
15 14
9 4
23 23
19 17
11 10
5 5
5 5
29 29
9 9
11 11
8 7
29 29
16 16
23 22
6 6
9 9
26 26
2 2
8 8
4 4
9 9
10 10
24 22
10 8
11 11
26 26
20 20
7 7
14 14
18 18
9 7
10 10
4 3
11 11
5 5
16 15
19 16
4 4
26 24
11 10
16 16
18 17
5 5
27 24
13 12
26 26
10 9
3 3
28 28
2 2
21 21
17 17
12 12
11 11
8 8
17 14
26 24
24 22
7 7
17 15
8 8
6 6
25 25
4 4
5 4
5 5
11 11
25 24
2 2
16 14
7 7
15 14
5 4
1 1
4 4
11 10
9 8
6 0
6 5
2 2
26 19
28 27
19 17
14 10
32 32
15 15
9 9
11 10
5 5
22 20
19 19
55 53
12 12
20 17
8 8
16 14
17 17
4 3
3 2
30 30
27 27
8 8
7 7
9 9
24 23
6 5
5 3
6 6
6 6
8 7
16 14
26 26
8 7
8 8
19 15
28 27
6 5
8 8
20 20
23 18
23 23
18 17
11 10
12 12
25 25
7 7
3 3
4 3
6 6
6 6
15 15
23 23
5 4
10 10
8 8
6 6
4 3
17 14
24 24
14 13
18 17
4 3
23 21
8 7
8 8
33 31
6 6
8 8
24 23
11 11
14 14
12 12
5 4
14 11
18 17
20 20
60 60
9 9
6 6
6 6
12 11
9 9
22 18
11 9
29 29
4 4
22 16
27 27
6 6
18 18
9 9
5 5
10 9
11 11
21 21
9 9
25 25
15 12
6 6
16 15
7 7
5 3
8 8
4 4
17 17
11 10
15 15
8 8
10 10
28 2

In [78]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [79]:
# Are our sentence vector lengths consistent?
for i, v in enumerate(X_train_vect_avg):
    print(len(X_train.iloc[i]), len(v))

17 100
4 100
13 100
12 100
4 100
25 100
9 100
6 100
8 100
6 100
12 100
6 100
13 100
6 100
4 100
16 100
9 100
7 100
4 100
3 100
6 100
21 100
7 100
7 100
12 100
5 100
17 100
17 100
16 100
13 100
7 100
16 100
7 100
22 100
19 100
10 100
6 100
21 100
5 100
20 100
2 100
28 100
26 100
15 100
22 100
22 100
9 100
7 100
7 100
26 100
20 100
16 100
23 100
7 100
5 100
28 100
21 100
31 100
14 100
24 100
16 100
7 100
10 100
5 100
24 100
8 100
7 100
4 100
3 100
9 100
26 100
5 100
9 100
34 100
6 100
15 100
25 100
18 100
19 100
4 100
11 100
25 100
21 100
17 100
30 100
4 100
13 100
7 100
9 100
10 100
14 100
23 100
10 100
2 100
6 100
10 100
21 100
21 100
13 100
7 100
17 100
21 100
25 100
16 100
14 100
17 100
3 100
11 100
6 100
1 100
10 100
8 100
5 100
26 100
5 100
22 100
12 100
5 100
5 100
6 100
14 100
10 100
40 100
16 100
8 100
7 100
15 100
5 100
23 100
11 100
4 100
13 100
11 100
9 100
24 100
11 100
4 100
21 100
4 100
17 100
13 100
8 100
22 100
24 100
8 100
24 100
11 100
10 100
3 100
4 100
17 100
9 100
9

8 100
25 100
17 100
14 100
3 100
5 100
6 100
26 100
4 100
36 100
6 100
9 100
8 100
20 100
4 100
7 100
5 100
6 100
20 100
8 100
6 100
6 100
14 100
6 100
16 100
6 100
5 100
12 100
25 100
22 100
11 100
9 100
4 100
16 100
4 100
14 100
11 100
8 100
8 100
28 100
21 100
10 100
22 100
29 100
6 100
19 100
8 100
15 100
9 100
19 100
11 100
14 100
24 100
28 100
29 100
26 100
20 100
68 100
30 100
25 100
8 100
9 100
1 100
11 100
44 100
4 100
22 100
5 100
10 100
12 100
10 100
9 100
29 100
25 100
4 100
12 100
22 100
9 100
7 100
9 100
30 100
24 100
8 100
6 100
26 100
13 100
11 100
13 100
9 100
6 100
28 100
6 100
4 100
25 100
6 100
12 100
6 100
19 100
32 100
4 100
30 100
5 100
29 100
24 100
6 100
31 100
7 100
32 100
10 100
9 100
31 100
7 100
9 100
28 100
8 100
18 100
9 100
10 100
25 100
29 100
21 100
12 100
17 100
23 100
9 100
8 100
8 100
27 100
7 100
9 100
9 100
8 100
7 100
9 100
24 100
10 100
9 100
5 100
12 100
10 100
5 100
5 100
12 100
16 100
9 100
17 100
4 100
21 100
67 100
11 100
16 100
6 100
7 100

6 100
24 100
30 100
12 100
8 100
31 100
10 100
12 100
9 100
31 100
26 100
15 100
46 100
25 100
10 100
6 100
8 100
24 100
17 100
23 100
4 100
22 100
17 100
15 100
9 100
6 100
8 100
21 100
8 100
19 100
9 100
15 100
16 100
5 100
5 100
6 100
25 100
7 100
26 100
8 100
12 100
9 100
14 100
19 100
2 100
15 100
24 100
10 100
4 100
6 100
4 100
19 100
24 100
5 100
22 100
29 100
3 100
10 100
14 100
18 100
22 100
14 100
24 100
9 100
9 100
26 100
5 100
17 100
11 100
16 100
8 100
5 100
5 100
6 100
2 100
5 100
7 100
9 100
81 100
8 100
1 100
6 100
27 100
7 100
9 100
22 100
24 100
16 100
25 100
27 100
11 100
14 100
23 100
22 100
35 100
8 100
5 100
23 100
31 100
10 100
7 100
14 100
23 100
9 100
4 100
6 100
27 100
17 100
17 100
12 100
50 100
12 100
13 100
17 100
8 100
12 100
5 100
12 100
5 100
18 100
4 100
17 100
28 100
15 100
38 100
9 100
8 100
37 100
26 100
6 100
8 100
3 100
30 100
9 100
16 100
17 100
2 100
15 100
24 100
6 100
5 100
8 100
24 100
7 100
101 100
4 100
15 100
26 100
6 100
12 100
9 100
13 10

8 100
8 100
3 100
5 100
6 100
4 100
26 100
10 100
29 100
21 100
23 100
13 100
4 100
24 100
12 100
8 100
10 100
5 100
13 100
25 100
77 100
11 100
4 100
26 100
32 100
22 100
6 100
24 100
15 100
44 100
6 100
7 100
31 100
9 100
5 100
25 100
6 100
25 100
5 100
7 100
10 100


## Fit RandomForestClassifier On Top Of Word Vectors

In [80]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [81]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [84]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.972 / Recall: 0.993 / Accuracy: 0.969
