# Bag Of Words

In [535]:
import pandas as pd

In [536]:
messages = pd.read_csv('./spam_or_not_spam.csv')

In [537]:
messages.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [538]:
messages['label'].value_counts()

label
0    2500
1     500
Name: count, dtype: int64

In [539]:
messages[messages['label'] == 1].head(10)

Unnamed: 0,email,label
2500,save up to NUMBER on life insurance why spend...,1
2501,NUMBER fight the risk of cancer URL NUMBER sli...,1
2502,NUMBER fight the risk of cancer URL NUMBER sli...,1
2503,adult club offers free membership instant acc...,1
2504,i thought you might like these NUMBER slim dow...,1
2505,a powerhouse gifting program you don t want to...,1
2506,help wanted we are a NUMBER year old fortune N...,1
2507,hyperlink life can change in an instant that ...,1
2508,tired of the bull out there want to stop losin...,1
2509,dear ricardoNUMBER cost effective direct email...,1


In [540]:
## Data Cleaning and Preprocessing
import re
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diegoagd10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/diegoagd10/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/diegoagd10/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [541]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [542]:
ps = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [543]:
stopwords_set = set(stopwords.words('english'))

In [544]:
def clean_text(text):
    new_text = text.lower()
    new_text = re.sub('[^a-zA-z]', ' ', new_text)
    new_text = new_text.split()
    new_text = [lemmatizer.lemmatize(word) for word in new_text]
    new_text = ' '.join(new_text)
    return new_text

In [545]:
corpus = []

for i in range(0, len(messages)):
    text = str(messages['email'][i])
    corpus.append(clean_text(text))

corpus

['date wed number aug number number number number number from chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com i can t reproduce this error for me it is very repeatable like every time without fail this is the debug log of the pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number ftoc_pickmsgs number hit number number number marking number hit number number number tkerror syntax error in expression int note if i run the pick command by hand delta pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number hit that s where the number hit come from obviously the version of nmh i m using is delta pick version pick nmh number number number compiled on url at sun mar number number nu

In [546]:
def find_empty_sentences_from(corpus):
    # Calculate lengths of elements in corpus
    lengths = list(map(len, corpus))
    # Combine lengths, corpus elements, and email messages
    combined = zip(lengths, corpus, messages['email'], range(len(corpus)))

    return [[length, element, email, index] for length, element, email, index in combined if length < 1]

empty_sentences = find_empty_sentences_from(corpus)
for length, element, email, index in empty_sentences:
    print(f"Index: {index}")
    print(f"Length: {length}")
    print(f"Element: {element}")
    print(f"Email: {email}")
    print()

Index: 2806
Length: 0
Element: 
Email:  

Index: 2828
Length: 0
Element: 
Email:  



In [547]:
# Shows the records from messages that have empty sentences based the index
print("Email for index 2806: " + (messages.loc[2806]['email']))
print("Email for index 2828: " + (messages.loc[2806]['email']))

Email for index 2806:  
Email for index 2828:  


In [548]:
len(corpus)

3000

In [549]:
## Create Word2Vec Model
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [550]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

In [551]:
vec_king = wv['king']

In [552]:
vec_king.shape

(300,)

In [553]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [554]:
words = []

for sentence in corpus:
    sent_token = sent_tokenize(sentence)
    for s in sent_token:
        words.append(simple_preprocess(s))

In [555]:
words

[['date',
  'wed',
  'number',
  'aug',
  'number',
  'number',
  'number',
  'number',
  'number',
  'from',
  'chris',
  'garrigues',
  'cwg',
  'dated',
  'number',
  'numberfanumberd',
  'deepeddy',
  'com',
  'message',
  'id',
  'number',
  'number',
  'tmda',
  'deepeddy',
  'vircio',
  'com',
  'can',
  'reproduce',
  'this',
  'error',
  'for',
  'me',
  'it',
  'is',
  'very',
  'repeatable',
  'like',
  'every',
  'time',
  'without',
  'fail',
  'this',
  'is',
  'the',
  'debug',
  'log',
  'of',
  'the',
  'pick',
  'happening',
  'number',
  'number',
  'number',
  'pick_it',
  'exec',
  'pick',
  'inbox',
  'list',
  'lbrace',
  'lbrace',
  'subject',
  'ftp',
  'rbrace',
  'rbrace',
  'number',
  'number',
  'sequence',
  'mercury',
  'number',
  'number',
  'number',
  'exec',
  'pick',
  'inbox',
  'list',
  'lbrace',
  'lbrace',
  'subject',
  'ftp',
  'rbrace',
  'rbrace',
  'number',
  'number',
  'sequence',
  'mercury',
  'number',
  'number',
  'number',
  'fto

In [556]:
len(words)

2998

In [557]:
## Lets train Word2Vec from Scratch
## words is an array of sentences where each sentence is an array of words
## super important for training and creating our custom word2vec model
model = Word2Vec(words, vector_size=100)

In [558]:
## To get all the Vocabulary
model.wv.index_to_key

['number',
 'the',
 'to',
 'and',
 'of',
 'in',
 'url',
 'is',
 'it',
 'that',
 'you',
 'for',
 'this',
 'on',
 'with',
 'be',
 'have',
 'are',
 'from',
 'not',
 'or',
 'your',
 'at',
 'by',
 'if',
 'but',
 'we',
 'can',
 'wa',
 'list',
 'an',
 'will',
 'all',
 'my',
 'they',
 'so',
 'one',
 'ha',
 'get',
 'do',
 'more',
 'there',
 'time',
 'our',
 'no',
 'just',
 'out',
 'about',
 'what',
 'people',
 'which',
 'email',
 'their',
 'like',
 'up',
 'message',
 'use',
 'would',
 'only',
 'new',
 'he',
 'who',
 'mail',
 'any',
 'free',
 'now',
 'some',
 'me',
 'when',
 'don',
 'other',
 'make',
 'here',
 'been',
 'user',
 'rpm',
 'mailing',
 'how',
 'than',
 'them',
 'year',
 'wrote',
 'also',
 'work',
 'then',
 'date',
 'state',
 'way',
 'world',
 'said',
 'had',
 'file',
 'his',
 'because',
 'hyperlink',
 're',
 'into',
 'group',
 'over',
 'spamassassin',
 'want',
 'think',
 'were',
 'money',
 'first',
 'could',
 'spam',
 'company',
 'right',
 'information',
 'these',
 'know',
 'should',

In [559]:
model.corpus_count

2998

In [560]:
# The bigger the result, the better the model
model.epochs

5

In [561]:
model.wv.similar_by_word('kid')

[('themselves', 0.9365184903144836),
 ('kill', 0.9333265423774719),
 ('throwing', 0.9314576983451843),
 ('job', 0.9277185797691345),
 ('trouble', 0.9267697334289551),
 ('big', 0.9224991202354431),
 ('memory', 0.9216810464859009),
 ('hurry', 0.9197338819503784),
 ('serious', 0.9196022152900696),
 ('playing', 0.9136622548103333)]

In [562]:
model.wv.similar_by_word('good')

[('too', 0.9321675300598145),
 ('pretty', 0.925357460975647),
 ('enough', 0.917788028717041),
 ('hard', 0.9118716716766357),
 ('bit', 0.9093148708343506),
 ('doing', 0.9062215685844421),
 ('very', 0.8981969356536865),
 ('happy', 0.8958097696304321),
 ('little', 0.8949092626571655),
 ('true', 0.8937450647354126)]

In [563]:
model.wv['good']

array([-0.21759136,  0.92434233,  0.40097708,  0.00812275,  0.15764795,
       -1.0850613 ,  0.51430476,  0.25574172, -0.26278394, -0.9876489 ,
       -0.00964991, -1.667435  , -0.24816841, -0.30012614,  0.39193076,
        0.10786973,  0.14564629, -0.5773297 ,  0.15219085, -0.93797493,
       -0.27207193,  0.41299903, -0.31644472, -0.5112705 , -0.44338644,
       -0.06626895, -0.15870355, -0.5889502 , -0.5280234 ,  0.36527252,
        1.0154167 ,  0.32747838,  0.14103061, -0.6467631 , -1.211581  ,
        1.5153797 ,  0.59544647,  0.31371316, -0.03848867, -0.85754657,
        0.3722029 , -0.2031957 ,  0.12743399, -0.01253018,  0.7917396 ,
       -0.4164    , -0.47366437, -0.20052934,  0.57347393,  0.06476499,
        0.0382398 , -0.70309025,  0.1465494 , -0.27000463,  0.05382624,
        0.4396956 ,  0.2759535 ,  0.37022534, -0.14766674, -0.32740477,
       -0.25020105, -0.2476809 , -0.08192757, -0.12723437, -0.6349376 ,
        0.7824039 , -0.6406218 ,  1.122959  , -0.76669455,  0.14

In [564]:
model.wv['good'].shape

(100,)

In [565]:
# For each word from the following sentence, we will get a vector of 100 dimensions
# The reason of why we need to use Avg Word2Vec is because we need to reduce the dimensionality of the data to one dimension
# Doing the above, will help the classification model to work better
print((len(words[0]), words[0]))
print((len(words[1]), words[1]))

(244, ['date', 'wed', 'number', 'aug', 'number', 'number', 'number', 'number', 'number', 'from', 'chris', 'garrigues', 'cwg', 'dated', 'number', 'numberfanumberd', 'deepeddy', 'com', 'message', 'id', 'number', 'number', 'tmda', 'deepeddy', 'vircio', 'com', 'can', 'reproduce', 'this', 'error', 'for', 'me', 'it', 'is', 'very', 'repeatable', 'like', 'every', 'time', 'without', 'fail', 'this', 'is', 'the', 'debug', 'log', 'of', 'the', 'pick', 'happening', 'number', 'number', 'number', 'pick_it', 'exec', 'pick', 'inbox', 'list', 'lbrace', 'lbrace', 'subject', 'ftp', 'rbrace', 'rbrace', 'number', 'number', 'sequence', 'mercury', 'number', 'number', 'number', 'exec', 'pick', 'inbox', 'list', 'lbrace', 'lbrace', 'subject', 'ftp', 'rbrace', 'rbrace', 'number', 'number', 'sequence', 'mercury', 'number', 'number', 'number', 'ftoc_pickmsgs', 'number', 'hit', 'number', 'number', 'number', 'marking', 'number', 'hit', 'number', 'number', 'number', 'tkerror', 'syntax', 'error', 'in', 'expression', 'in

In [566]:
import numpy as np

In [567]:
def avg_word2vec(sentence):
    # Remove out-of-vocabulary words
    return np.mean([model.wv[word] for word in sentence if word in model.wv.index_to_key], axis=0)

In [568]:
from tqdm import tqdm

In [569]:
# Apply for the entire sentences
X = []
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 2998/2998 [00:05<00:00, 552.75it/s]


In [570]:
print((len(X[0]), X[0]))
print((len(X[1]), X[1]))

(100, array([ 0.27230164, -0.19708706, -0.18299757,  0.07671565,  0.20000283,
       -0.48290333,  0.15485485,  0.773242  , -0.5045478 , -0.18433802,
       -0.04198749, -0.63903487,  0.25392106,  0.4857368 ,  0.29855844,
       -0.2549604 ,  0.27026126, -0.06273282, -0.57225424, -0.51182115,
        0.00399922,  0.13520636,  0.19457841, -0.13812047, -0.21076173,
        0.0522388 ,  0.03014749, -0.0232875 , -0.5465294 , -0.40806505,
        0.44815513, -0.67220527,  0.02874167, -0.52109694, -0.19235049,
        0.01214188,  0.10739995, -0.04025365, -0.45711216, -0.4475957 ,
        0.01719715, -0.10500908, -0.11999488,  0.23162884,  0.24894848,
        0.08035748,  0.07522559, -0.7036122 ,  0.18602717, -0.12085649,
       -0.11936861, -0.15252762,  0.15615612, -0.06410437,  0.3424995 ,
        0.20009132, -0.32884002,  0.05303133, -0.10554087, -0.16668938,
        0.31954992, -0.3054448 ,  0.397221  ,  0.09841405, -0.5413983 ,
        0.22323573, -0.24936926, -0.27698475, -0.13892207,

In [571]:
## Creating the dataset of independent features
df = pd.DataFrame()
for i in range(0, len(X)):
    df = pd.concat([df, pd.DataFrame(X[i].reshape(1, -1))], ignore_index=True)

  df = pd.concat([df, pd.DataFrame(X[i].reshape(1, -1))], ignore_index=True)


In [572]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.272302,-0.197087,-0.182998,0.076716,0.200003,-0.482903,0.154855,0.773242,-0.504548,-0.184338,...,0.929289,0.494869,0.220386,0.138945,0.67787,0.458252,0.06783,-0.584729,-0.15535,-0.03595
1,0.16673,-0.067104,-0.151777,-0.125419,0.249559,-0.216651,0.099244,0.575571,-0.239864,-0.108591,...,0.670422,0.321869,0.024745,0.192151,0.386894,0.250816,0.162697,-0.409683,-0.043148,-0.138142
2,-0.100122,0.139754,-0.032526,-0.024459,0.200787,-0.226884,0.089388,0.569608,-0.256445,-0.078069,...,0.545069,0.361499,0.115754,0.080459,0.508423,0.239047,0.138792,-0.199424,0.057092,0.095856
3,-0.087691,0.171064,-0.067928,-0.105919,0.250657,-0.397355,0.107915,0.560022,-0.307983,-0.225278,...,0.677527,0.373427,0.13088,0.038752,0.497552,0.297525,0.16264,-0.0914,0.05954,0.060017
4,0.142702,0.024575,-0.12024,-0.237234,0.227771,-0.21999,0.064732,0.481951,-0.294686,-0.124393,...,0.677094,0.332466,-0.026518,0.080214,0.256001,0.201324,0.094469,-0.190074,0.095,-0.125822


In [573]:
df[df.isnull().any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2964,,,,,,,,,,,...,,,,,,,,,,


In [574]:
## Defininf the independent features
df = df.dropna()
X = df

In [575]:
## Defining the dependent feature
y = messages['label'].drop([2806, 2828, 2964])

In [576]:
(X.shape, y.shape)

((2997, 100), (2997,))

In [577]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [578]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
2209,0.420582,-0.318184,-0.173862,0.195708,0.230799,-0.839922,0.259359,1.04394,-0.61049,-0.345927,...,1.391226,0.659759,0.455877,0.167499,1.123897,0.74666,0.113126,-1.08589,-0.116484,-0.151311
271,0.002057,0.17789,-0.033088,-0.230874,0.201172,-0.424468,0.063085,0.367899,-0.347912,-0.245497,...,0.708187,0.395934,-0.015924,0.064497,0.293299,0.366312,0.138109,-0.053558,0.156107,-0.125759
2903,0.178882,-0.052806,-0.26944,-0.218841,0.591731,-0.293446,-0.366161,0.371905,-0.409454,0.077624,...,0.747455,0.374647,0.159313,-0.038695,-0.40471,0.573792,0.509584,-0.646865,-0.068035,-0.695694
369,0.235479,0.279606,-0.04616,-0.273805,-0.015244,-0.473421,0.143587,0.429343,-0.428854,-0.24741,...,0.696356,0.311281,0.004887,-0.006863,0.151549,0.248013,-0.043816,-0.052832,0.366294,-0.166044
2886,-0.038231,0.211354,-0.044837,-0.102517,0.247231,-0.264182,0.127905,0.550689,-0.285333,-0.157863,...,0.617304,0.322971,0.090883,-0.005705,0.442574,0.255229,0.216484,-0.051725,0.125124,0.020951


In [579]:
y_train

2209    0
271     0
2905    1
369     0
2888    1
       ..
763     0
835     0
1653    0
2607    1
2732    1
Name: label, Length: 2397, dtype: int64

In [580]:
## Training model using Naive bayes classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [581]:
def print_metrics(train, predicted, title):
    accuracy = accuracy_score(train, predicted)
    precision = precision_score(train, predicted)
    recall = recall_score(train, predicted)
    f1 = f1_score(train, predicted)
    roc_auc = roc_auc_score(train, predicted)
    cm = confusion_matrix(train, predicted)

    print('************************************************\n')
    print(title)
    print('- Accuracy: {:.4f}'.format(accuracy))
    print('- Precision: {:4f}'.format(precision))
    print('- Recall: {:4f}'.format(recall))
    print('- F1 Score: {:.4f}'.format(f1))
    print('- ROC AUC Score: {:4f}'.format(roc_auc))

    print('\nConfusion Matrix:\n')
    print(cm)
    print('\n************************************************')

In [582]:
rfc_model = RandomForestClassifier().fit(X_train, y_train)

y_train_pred = rfc_model.predict(X_train)
y_test_pred = rfc_model.predict(X_test)

print_metrics(y_train, y_train_pred, 'Model performance on Training set')
print_metrics(y_test, y_test_pred, 'Model performance on Test set')

************************************************

Model performance on Training set
- Accuracy: 1.0000
- Precision: 1.000000
- Recall: 1.000000
- F1 Score: 1.0000
- ROC AUC Score: 1.000000

Confusion Matrix:

[[1993    0]
 [   0  404]]

************************************************
************************************************

Model performance on Test set
- Accuracy: 0.9783
- Precision: 0.976190
- Recall: 0.881720
- F1 Score: 0.9266
- ROC AUC Score: 0.938888

Confusion Matrix:

[[505   2]
 [ 11  82]]

************************************************


In [None]:
## Predicting the new emails
new_emails = [
    'You have won a lottery. Claim your prize now. A lot of money is waiting for you.',
    'I love to see the sky with you',
    'I have photos of you which could compromise you. Send me money or I will publish them.',
    'Hi I am Peter, I am a recruiter at XL company. I have a job offer for you. Please contact me.',
    'Dragon Ball new episode is out. Watch it now.',
    'Give me money or you are dead.',
    'Click to claim your prize now.'
]

In [637]:
## Cleaning the new emails
new_emails_cleaned = [clean_text(email) for email in new_emails]
new_emails_cleaned

['you have won a lottery claim your prize now a lot of money is waiting for you',
 'i love to see the sky with you',
 'i have photo of you which could compromise you send me money or i will publish them',
 'hi i am peter i am a recruiter at xl company i have a job offer for you please contact me',
 'dragon ball new episode is out watch it now',
 'give me money or you are dead',
 'click to claim your prize now',
 'dear diego another update but this one is not good news yesterday a federal judge issued a decision in the arnold case brought by the kusk firm akiva cohen dismissing their breach of contract claim based on the merger agreement and third party beneficiary status the same claim we won on just before thanksgiving that i last updated you about we informed the court in our federal case the cornet case of our winning decision on this claim and we have been informing arbitrator in our case about our decision and asking them to apply re judicata meaning asking them to rule that we wi

In [638]:
## Creating the new email tokens
email_words = []

for sentence in new_emails_cleaned:
    sent_token = sent_tokenize(sentence)
    for s in sent_token:
        email_words.append(simple_preprocess(s))

In [639]:
email_words

[['you',
  'have',
  'won',
  'lottery',
  'claim',
  'your',
  'prize',
  'now',
  'lot',
  'of',
  'money',
  'is',
  'waiting',
  'for',
  'you'],
 ['love', 'to', 'see', 'the', 'sky', 'with', 'you'],
 ['have',
  'photo',
  'of',
  'you',
  'which',
  'could',
  'compromise',
  'you',
  'send',
  'me',
  'money',
  'or',
  'will',
  'publish',
  'them'],
 ['hi',
  'am',
  'peter',
  'am',
  'recruiter',
  'at',
  'xl',
  'company',
  'have',
  'job',
  'offer',
  'for',
  'you',
  'please',
  'contact',
  'me'],
 ['dragon', 'ball', 'new', 'episode', 'is', 'out', 'watch', 'it', 'now'],
 ['give', 'me', 'money', 'or', 'you', 'are', 'dead'],
 ['click', 'to', 'claim', 'your', 'prize', 'now'],
 ['dear',
  'diego',
  'another',
  'update',
  'but',
  'this',
  'one',
  'is',
  'not',
  'good',
  'news',
  'yesterday',
  'federal',
  'judge',
  'issued',
  'decision',
  'in',
  'the',
  'arnold',
  'case',
  'brought',
  'by',
  'the',
  'kusk',
  'firm',
  'akiva',
  'cohen',
  'dismissing'

In [640]:
# Creating the vectors for the email tokens
email_vectors = []
for i in tqdm(range(len(email_words))):
    email_vectors.append(avg_word2vec(email_words[i]))

100%|██████████| 8/8 [00:00<00:00, 392.85it/s]


In [641]:
len(email_vectors)

8

In [642]:
len(email_vectors[0])

100

In [643]:
email_df = pd.DataFrame()
for i in range(0, len(email_vectors)):
    email_df = pd.concat([email_df, pd.DataFrame(email_vectors[i].reshape(1, -1))], ignore_index=True)

In [644]:
email_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.096837,0.414765,0.033691,-0.211865,0.216738,-0.487738,0.097366,0.550636,-0.423987,-0.466896,...,0.712139,0.141733,0.104442,-0.110756,0.03077,0.366481,0.152978,0.069235,0.155589,-0.245317
1,0.058013,0.493837,-0.164177,-0.436578,0.079214,-0.182713,-0.261067,0.3098,-0.474343,-0.234131,...,0.819841,0.308234,0.105611,-0.277737,0.005252,0.292143,0.19779,0.180926,0.367477,-0.232635
2,0.232369,0.364751,-0.077383,-0.496725,0.346455,-0.467359,0.149779,0.495998,-0.486275,-0.406587,...,0.875153,0.249604,0.167613,-0.26606,0.136731,0.454553,0.208313,0.111061,0.210109,-0.314387
3,0.425356,-0.159849,-0.163848,-0.025298,0.140732,-0.507427,0.011793,0.30967,-0.455259,-0.298492,...,0.805771,0.245823,-0.101107,-0.008474,0.169988,0.37137,0.029684,-0.287102,0.096431,-0.210783
4,-0.031821,0.358707,0.137682,0.349653,0.037942,-0.074437,0.140508,0.220187,-0.338521,-0.248838,...,0.15993,0.06605,-0.057661,0.139035,-0.037277,0.155324,0.004115,-0.188764,0.260445,-0.125953
5,0.519028,0.737833,-0.010139,-0.30488,0.157412,-0.633056,0.205207,0.374634,-0.546786,-0.604688,...,0.518924,0.269178,0.097535,-0.135013,0.109965,0.513076,0.088959,-0.113394,0.309195,-0.433556
6,0.042278,0.262427,-0.394848,0.104958,0.529808,-0.372014,-0.177473,0.483946,-0.274418,0.210406,...,0.538246,0.142679,0.309568,-0.230457,-0.474483,0.274461,0.437586,-0.324751,-0.071477,-0.774519
7,-0.068018,0.32905,0.022676,-0.284673,0.240802,-0.396368,0.046458,0.420903,-0.318502,-0.254842,...,0.690852,0.307296,0.062559,-0.107525,0.235389,0.258218,0.202103,0.174424,0.269049,-0.062727


In [645]:
## Make predictions
predictions = rfc_model.predict(email_df)
predictions

array([1, 0, 1, 0, 0, 0, 1, 0])