### Import Libraries

In [1]:
import nltk
import re
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
# Fetch the NLTK resources used later in the notebook: the English stop-word
# list, WordNet (plus omw-1.4) for the lemmatizer, and the Punkt models that
# word_tokenize relies on.
# NLTK 3.9+ loads 'punkt_tab' instead of the pickled 'punkt' models, so both
# are requested to keep tokenization working on old and new NLTK versions.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Data Extraction

In [3]:
# Attach Google Drive to the Colab runtime so the CSV files stored there can be read
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Load the labelled training emails from Drive and preview the first rows
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/Spam Classification Competition/train.csv'
df = pd.read_csv(train_csv_path)

df.head()

Unnamed: 0,label,text,EmailID
0,Ham,eric called me last night and i am happy to co...,0
1,Spam,tr what is funnygadget com funnygadget com ca...,1
2,Ham,"ricky sent me the nom early - 87 , 000 for the...",2
3,Ham,these are the numbers that are being exported ...,3
4,Spam,authentic replica rolex and other watches for ...,4


### Text Tokenization

In [5]:
# Convert column text to lowercase.
# Missing email bodies (NaN after read_csv) are replaced by an empty string
# first, so they neither crash the lowercasing nor reach the tokenizer as floats.
# The vectorized .str accessor replaces the per-row Python lambda.
df['text'] = df['text'].fillna('').str.lower()
df['text']

0        eric called me last night and i am happy to co...
1         tr what is funnygadget com funnygadget com ca...
2        ricky sent me the nom early - 87 , 000 for the...
3        these are the numbers that are being exported ...
4        authentic replica rolex and other watches for ...
                               ...                        
46519    thanks mo original message white stacey w sent...
46520    hello , welcome to the medzo flagstaff nline\n...
46521    here are different kinds of drugs for hair l o...
46522    dear customer do you shop for medications on t...
46523    yap international , inc . ( ypil )\nvoip techn...
Name: text, Length: 46524, dtype: object

In [6]:
# Split every email into a list of word and punctuation tokens
df['text'] = df['text'].apply(word_tokenize)
df['text']

0        [eric, called, me, last, night, and, i, am, ha...
1        [tr, what, is, funnygadget, com, funnygadget, ...
2        [ricky, sent, me, the, nom, early, -, 87, ,, 0...
3        [these, are, the, numbers, that, are, being, e...
4        [authentic, replica, rolex, and, other, watche...
                               ...                        
46519    [thanks, mo, original, message, white, stacey,...
46520    [hello, ,, welcome, to, the, medzo, flagstaff,...
46521    [here, are, different, kinds, of, drugs, for, ...
46522    [dear, customer, do, you, shop, for, medicatio...
46523    [yap, international, ,, inc, ., (, ypil, ), vo...
Name: text, Length: 46524, dtype: object

### Text Normalization

In [7]:
# Drop punctuation, numbers and any other token that is not purely alphabetic.
# NOTE: str.isalpha() also accepts non-ASCII letters, which the substitution
# below then strips, so such tokens can come out shortened or empty.
non_ascii_letter = re.compile(r'[^a-zA-Z]')


def _alphabetic_only(tokens):
    """Return the alphabetic tokens of one email, reduced to a-z/A-Z characters."""
    return [non_ascii_letter.sub('', tok) for tok in tokens if tok.isalpha()]


df['text'] = df['text'].apply(_alphabetic_only)
df['text']

0        [eric, called, me, last, night, and, i, am, ha...
1        [tr, what, is, funnygadget, com, funnygadget, ...
2        [ricky, sent, me, the, nom, early, for, the, w...
3        [these, are, the, numbers, that, are, being, e...
4        [authentic, replica, rolex, and, other, watche...
                               ...                        
46519    [thanks, mo, original, message, white, stacey,...
46520    [hello, welcome, to, the, medzo, flagstaff, nl...
46521    [here, are, different, kinds, of, drugs, for, ...
46522    [dear, customer, do, you, shop, for, medicatio...
46523    [yap, international, inc, ypil, voip, technolo...
Name: text, Length: 46524, dtype: object

In [8]:
# Filter out English stop words; a set gives constant-time membership checks
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens of one email that are not English stop words."""
    return [tok for tok in tokens if tok not in stop_words]


df['text'] = df['text'].apply(_drop_stop_words)
df['text']

0        [eric, called, last, night, happy, cover, atte...
1        [tr, funnygadget, com, funnygadget, com, provi...
2        [ricky, sent, nom, early, weekend, new, number...
3        [numbers, exported, dpr, please, review, notif...
4        [authentic, replica, rolex, watches, men, ladi...
                               ...                        
46519    [thanks, mo, original, message, white, stacey,...
46520    [hello, welcome, medzo, flagstaff, nline, onli...
46521    [different, kinds, drugs, hair, l, pain, relie...
46522    [dear, customer, shop, medications, web, know,...
46523    [yap, international, inc, ypil, voip, technolo...
Name: text, Length: 46524, dtype: object

In [9]:
# Reduce every remaining token to its WordNet lemma
lemmatizer = WordNetLemmatizer()


def _lemmatize_all(tokens):
    """Return the lemmatized form of each token of one email, order preserved."""
    return list(map(lemmatizer.lemmatize, tokens))


df['text'] = df['text'].apply(_lemmatize_all)
df['text']

0        [eric, called, last, night, happy, cover, atte...
1        [tr, funnygadget, com, funnygadget, com, provi...
2        [ricky, sent, nom, early, weekend, new, number...
3        [number, exported, dpr, please, review, notify...
4        [authentic, replica, rolex, watch, men, lady, ...
                               ...                        
46519    [thanks, mo, original, message, white, stacey,...
46520    [hello, welcome, medzo, flagstaff, nline, onli...
46521    [different, kind, drug, hair, l, pain, relief,...
46522    [dear, customer, shop, medication, web, know, ...
46523    [yap, international, inc, ypil, voip, technolo...
Name: text, Length: 46524, dtype: object

In [10]:
# Rebuild one space-separated string per email from its cleaned token list
df['text'] = df['text'].apply(' '.join)
df['text']

0        eric called last night happy cover attend wast...
1        tr funnygadget com funnygadget com provide lat...
2        ricky sent nom early weekend new number thanks...
3        number exported dpr please review notify chang...
4        authentic replica rolex watch men lady escapen...
                               ...                        
46519    thanks mo original message white stacey w sent...
46520    hello welcome medzo flagstaff nline online pha...
46521    different kind drug hair l pain relief antidep...
46522    dear customer shop medication web know escapen...
46523    yap international inc ypil voip technology req...
Name: text, Length: 46524, dtype: object

### Train/Test Split of Data

In [11]:
# Target variable: encode the string labels in df['label'] as integers.
# The fitted encoder `l` is kept so that predictions can later be mapped
# back to the original label strings with inverse_transform.
l = LabelEncoder()
y = l.fit_transform(df['label'])

In [12]:
# Separating the decision variable: the normalized email text,
# one space-joined string per row
x = df['text']

In [13]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%;
# the fixed random_state keeps the split reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Convert data into TF-IDF vectors, capped at 10,000 features.
# The vectorizer is fitted on the training split only; the held-out split is
# only transformed, so its text does not influence the learned vocabulary.
vectorizer = TfidfVectorizer(max_features=10000)
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)

### Creating Predictive Models

#### Naive Bayes Model

In [15]:
# Create the Naive Bayes model (multinomial variant, default settings)
classifier = MultinomialNB()

In [16]:
# Train the Naive Bayes model on the TF-IDF features of the training split
classifier.fit(x_train_tf,y_train)

In [17]:
# Evaluate Naive Bayes on the held-out split: confusion matrix first, then F1
y_pred = classifier.predict(x_test_tf)
nb_confusion = confusion_matrix(y_test, y_pred)
nb_f1 = f1_score(y_test, y_pred)
print(nb_confusion)
print("F1 Score: ", nb_f1)

[[4765  165]
 [ 369 4006]]
F1 Score:  0.9375146267259535


#### Support Vector Machine Model

In [18]:
# Create the Support Vector Machine classifier: RBF kernel, gamma=0.5, C=1.0
svmModel = SVC(kernel="rbf", gamma=0.5, C=1.0)

In [19]:
# Train the SVM on the same TF-IDF training features used for Naive Bayes
svmModel.fit(x_train_tf,y_train)

In [20]:
# Evaluate the SVM on the held-out split: confusion matrix first, then F1
y_pred2 = svmModel.predict(x_test_tf)
svm_confusion = confusion_matrix(y_test, y_pred2)
svm_f1 = f1_score(y_test, y_pred2)
print(svm_confusion)
print("F1 Score: ", svm_f1)

[[4800  130]
 [  97 4278]]
F1 Score:  0.9741546168735057


### Making Predictions with the Models

In [21]:
# Load the unlabelled competition emails from Drive and preview the first rows
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/Spam Classification Competition/test.csv'
df_test = pd.read_csv(test_csv_path)

df_test.head()

Unnamed: 0,text,EmailID
0,monika try calling ron heller escapenumber esc...,46524
1,"dear all ,\nattached is a paper describing the...",46525
2,arm inc e yo xual des spe ume reas ur se ire r...,46526
3,you are the man http ourmix hk,46527
4,kay mann enron com bmm b b b b b b b escapenum...,46528


In [22]:
# Pull out the model input (email text) and the email IDs that the
# submission file is keyed on
x2_test, emailID = df_test['text'], df_test['EmailID']

In [23]:
def normalize_text(text):
    """Normalize one raw email string the same way the training text was prepared.

    Steps, in order: lowercase, tokenize with NLTK, keep purely alphabetic
    tokens (then strip any character outside a-z/A-Z), drop English stop
    words, lemmatize, and join the result with single spaces.

    Uses the notebook-level `stop_words` set and `lemmatizer` created in the
    training cells, so the test text goes through identical resources.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        Space-separated normalized tokens (empty string if nothing survives).
    """
    tokens = nltk.word_tokenize(text.lower())
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Normalize x in a single pass per email instead of six separate passes.
# Missing bodies (NaN after read_csv) become empty strings so every EmailID
# still gets a prediction instead of crashing the pipeline.
x2_test = x2_test.fillna('').apply(normalize_text)

x2_test

0        monika try calling ron heller escapenumber esc...
1        dear attached paper describing proposed method...
2        arm inc e yo xual de spe ume reas ur se ire rm...
3                                       man http ourmix hk
4        kay mann enron com bmm b b b b b b b escapenum...
                               ...                        
11626    start date hourahead hour ancillary schedule a...
11627    customer service rep please contact customer p...
11628    legal operating system quarter price office x ...
11629    plan conference call escapenumberpm sunday dis...
11630    original message thomas padron enron sent mond...
Name: text, Length: 11631, dtype: object

In [24]:
# Vectorize x with the TF-IDF vectorizer already fitted on the training split
# (transform only, so the features line up with what the models were trained on)
x2_test_tf = vectorizer.transform(x2_test)

In [None]:
# Make the predictions using the Naive Bayes Model.
# These predictions are not the ones exported to submission.csv below;
# the SVM predictions are used there.
y2_pred = classifier.predict(x2_test_tf)      # The first Kaggle Submission

In [25]:
# Make the predictions using the Support Vector Machine Model.
# The result is still integer-encoded; it is mapped back to label strings
# when the submission DataFrame is built.
y2_pred2 = svmModel.predict(x2_test_tf)       # The second Kaggle Submission

In [26]:
# Build the submission from the SVM predictions (chosen for its better F1
# score), mapping the encoded classes back to their original label strings
predicted_labels = l.inverse_transform(y2_pred2)
df_sub = pd.DataFrame({'EmailID': emailID, 'Label': predicted_labels})

# Write the submission file without the DataFrame index column
df_sub.to_csv('submission.csv', index=False)