In [1]:
import numpy as np
import pandas as pd

In [20]:
data = pd.read_csv("/Users/VikrantMehla/EmailClassificationProject/emails.csv")

In [21]:
df1 = pd.DataFrame(data)

In [22]:
df1

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [23]:
Y = data.spam

In [24]:
Y = np.array(Y)
Y

array([1, 1, 1, ..., 0, 0, 0])

In [25]:
X = data.text
X

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object

In [26]:
X = np.array(X)
X

array(["Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : y

## Data Cleaning

In [32]:
from nltk.corpus import stopwords

import string 

stop = stopwords.words("english")
punc = list(string.punctuation)

stop = stop + punc

stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [33]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [34]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stop:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return " ".join(output_words)


In [35]:
from nltk.tokenize import word_tokenize

X = [ clean_review(word_tokenize(w)) for w in X]

In [36]:
X

['subject naturally irresistible corporate identity lt really hard recollect company market full suqgestions information isoverwhelminq good catchy logo stylish statlonery outstanding website make task much easy promise havinq order iogo company automaticaily become world ieader isguite ciear without good product effective business organization practicable aim hotat nowadays market promise marketing effort become much effective list clear benefit creativeness hand make original logo specially do reflect distinctive company image convenience logo stationery provide format easy use content management system letsyou change website content even structure promptness see logo draft within three business day affordability marketing break make gap budget 100 satisfaction guaranteed provide unlimited amount change extra fee surethat love result collaboration look portfolio interested',
 'subject stock trading gunslinger fanny merrill muzo colza attainder penultimate like esmark perspicuous ramb

## Data Splitting

In [37]:
from sklearn.model_selection import train_test_split



In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer


count_vec = CountVectorizer(max_features = 3000 , ngram_range=(1,3)) 

X_train_features = count_vec.fit_transform(X_train)
X_test_features = count_vec.transform(X_test)

In [41]:
count_vec.get_feature_names()



['00',
 '00 00',
 '00 pm',
 '000',
 '000 00',
 '000 000',
 '0000',
 '01',
 '01 2000',
 '01 2001',
 '01 pm',
 '02',
 '02 2000',
 '02 2001',
 '03',
 '03 08',
 '03 2000',
 '04',
 '04 10',
 '04 11',
 '04 19',
 '04 2000',
 '04 2001',
 '04 24',
 '05',
 '05 01',
 '05 02',
 '05 2000',
 '05 2001',
 '0500',
 '06',
 '06 2000',
 '0600',
 '07',
 '07 2000',
 '08',
 '08 2000',
 '09',
 '09 2000',
 '09 2001',
 '09 pm',
 '10',
 '10 00',
 '10 000',
 '10 11',
 '10 2000',
 '10 2001',
 '10 23',
 '10 24',
 '10 30',
 '10 57',
 '10 pm',
 '100',
 '100 000',
 '1092',
 '1092 fax',
 '1092 fax martin',
 '11',
 '11 00',
 '11 2000',
 '11 2001',
 '11 30',
 '12',
 '12 00',
 '12 2000',
 '12 2001',
 '12 month',
 '12 pm',
 '13',
 '13 2000',
 '14',
 '14 2000',
 '14 pm',
 '140',
 '1400',
 '1400 smith',
 '1400 smith street',
 '15',
 '15 2000',
 '15 th',
 '150',
 '16',
 '16 2000',
 '16 2001',
 '16 th',
 '17',
 '17 2000',
 '18',
 '18 2000',
 '18 pm',
 '19',
 '19 2000',
 '19 2001',
 '19 jul',
 '19 jul 2005',
 '19 th',
 '1997',


In [43]:
from sklearn.ensemble import RandomForestClassifier

rf1 = RandomForestClassifier()

In [44]:
rf1.fit(X_train_features, Y_train)

In [46]:
Y_pred = rf1.predict(X_test_features)

In [47]:
Y_pred

array([0, 0, 1, ..., 0, 0, 0])

In [48]:
np.savetxt("pred.csv" , Y_pred , fmt="%s")


In [51]:
from sklearn import metrics

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [52]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[1089    6]
 [  13  324]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1095
           1       0.98      0.96      0.97       337

    accuracy                           0.99      1432
   macro avg       0.99      0.98      0.98      1432
weighted avg       0.99      0.99      0.99      1432

