<p> The goal of this first cell is to check that everything works and to take a look at the training and test data. </p>

In [25]:
import pandas as pd

# Load the labelled training set and the unlabelled test set; both files are
# decoded as latin-1.
train_data = pd.read_csv(
    "kg_train.csv/kg_train.csv",
    encoding="latin-1",
)
test_data = pd.read_csv(
    "kg_test.csv/kg_test.csv",
    encoding="latin-1",
)

# Sanity check: report the dimensions first, then preview the first rows.
print("Training data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)
print(train_data.head(), "\n")
print(test_data.head())

Training data shape:  (5964, 2)
Test data shape:  (5964, 1)
                                                text  label
0  DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...      1
1                                           Will do.      0
2  Nora--Cheryl has emailed dozens of memos about...      0
3  Dear Sir=2FMadam=2C I know that this proposal ...      1
4                                                fyi      0 

                                                text
0  usiness is for the fact that the deceased man ...
1  They are happy to adjust to the afternoon. I a...
2  Lael Brainard was confirmed 78-19 this afterno...
3  H <hrod17@clintonemail.com>Friday March 26 201...
4  n;"> Dear Good Friend,<br><br><br>I am happy t...


<p> Let us start with the pre-processing </p>

In [26]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Peek at the characters Python treats as punctuation and at a small slice of
# NLTK's English stop-word list.
punctuation_chars = string.punctuation
english_stopwords = stopwords.words("english")
print(punctuation_chars)
print(english_stopwords[100:110])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each']


In [27]:
import re
def clean_text(text):
    """Normalise a raw message into lowercase, space-separated word tokens.

    Steps, in order:
      1. every non-word character (punctuation, markup, whitespace) becomes a space;
      2. single ASCII letters standing alone between whitespace are dropped;
      3. a single ASCII letter at the very start of the text is dropped;
      4. runs of whitespace collapse to one space and the ends are trimmed;
      5. the result is lowercased.

    Parameters
    ----------
    text : object
        The raw message. It is converted with ``str()`` first, so non-string
        values are cleaned via their string representation.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Replace everything that is not a letter, digit or underscore with a space.
    processed_feature = re.sub(r'\W', ' ', str(text))

    # Drop isolated single letters. Lookarounds are used instead of consuming
    # the surrounding whitespace so that *consecutive* single letters
    # ("b c d") are all removed, not just every other one.
    processed_feature = re.sub(r'(?<=\s)[a-zA-Z](?=\s)', ' ', processed_feature)

    # Drop a single letter at the very start of the text. The anchor must be an
    # unescaped '^'; an escaped '\^' would look for a literal caret, which can
    # never be present after the substitution above. This also covers the
    # leading 'b' left behind by the repr of a bytes object (b'...').
    processed_feature = re.sub(r'^[a-zA-Z]\s+', '', processed_feature)

    # Collapse whitespace runs and trim the ends so no stray spaces remain.
    processed_feature = re.sub(r'\s+', ' ', processed_feature).strip()

    return processed_feature.lower()

<p> The cleaning above may be more aggressive than necessary (for example, it removes all punctuation and isolated single letters), so it might need refining later; for now I believe it is adequate. </p>

In [28]:
# Add a cleaned version of the raw text to both data sets. The second line
# previously repeated the training-set assignment, so the test set never
# received its 'preprocessed_text' column.
train_data['preprocessed_text'] = train_data['text'].apply(clean_text)
test_data['preprocessed_text'] = test_data['text'].apply(clean_text)

In [29]:
# Show the raw text, the label and the cleaned text side by side for the first 20 rows.
train_data.head(20)

Unnamed: 0,text,label,preprocessed_text
0,"DEAR SIR, STRICTLY A PRIVATE BUSINESS PROPOSAL...",1,dear sir strictly private business proposal am...
1,Will do.,0,will do
2,Nora--Cheryl has emailed dozens of memos about...,0,nora cheryl has emailed dozens of memos about ...
3,Dear Sir=2FMadam=2C I know that this proposal ...,1,dear sir 2fmadam 2c know that this proposal mi...
4,fyi,0,fyi
5,sure -- bottom line - you need a special secur...,0,sure bottom line you need special security cod...
6,"Dear Sir,I am Engr. Ugo Nzego with the Enginee...",1,dear sir am engr ugo nzego with the engineerin...
7,Abedin Huma <AbedinH@state.gov>Saturday Novemb...,0,abedin huma abedinh state gov saturday novembe...
8,There is an Oct 16th George Marshall event at ...,0,there is an oct 16th george marshall event at ...
9,<P>1 25% for you as the account owner <BR>2 65...,1,1 25 for you as the account owner br 2 65 for...


In [30]:
from collections import Counter

# Split the training set by label: 0 marks HAM messages, 1 marks SPAM messages.
data_ham = train_data.loc[train_data['label'] == 0].copy()
data_spam = train_data.loc[train_data['label'] == 1].copy()

# Cleaned message text for each class.
words_data_ham = data_ham['preprocessed_text']
words_data_spam = data_spam['preprocessed_text']

# Flatten every message into one long list of whitespace-separated words per class.
list_ham_words = [token for message in words_data_ham for token in message.split()]
list_spam_words = [token for message in words_data_spam for token in message.split()]

# Word -> occurrence count for each class.
c_ham = Counter(list_ham_words)
c_spam = Counter(list_spam_words)

# The ten most frequent words of each class as (word, count) tables.
top10_columns = ['word', 'count']
df_hamwords_top10 = pd.DataFrame(c_ham.most_common(10), columns=top10_columns)
df_spamwords_top10 = pd.DataFrame(c_spam.most_common(10), columns=top10_columns)

<p> From now on, we can't change the code </p>

In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled data for validation (fixed seed for repeatability).
data_train, data_val, label_train, label_val = train_test_split(
    train_data,
    train_data["label"],
    test_size=0.3,
    random_state=5,
)

# Build the bag-of-words vocabulary from the training split only, then encode
# both splits with that same vocabulary.
bow_transformer = CountVectorizer()
bow_transformer.fit(data_train['preprocessed_text'])

X_train = bow_transformer.transform(data_train['preprocessed_text'])
X_val = bow_transformer.transform(data_val['preprocessed_text'])

print(X_train.shape)
print(X_val.shape)

# Fit the Naive Bayes classifier on the training split.
clf = MultinomialNB()
clf.fit(X_train, label_train)

# Score the validation split.
pred_val = clf.predict(X_val)
accuracy = accuracy_score(label_val, pred_val)
print(accuracy)

# Displayed as the cell output.
confusion_matrix(label_val, pred_val)

(4174, 65759)
(1790, 65759)
0.9597765363128492


array([[946,  58],
       [ 14, 772]])

In [46]:
# Predict labels for the unlabelled test set and write the submission file.
data_test = pd.read_csv("kg_test.csv/kg_test.csv", encoding='latin-1')

# Apply the same cleaning and the already-fitted vocabulary used for training.
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)

# Take the Ids from the same frame the predictions were computed on, rather
# than from a separately loaded frame left over from an earlier cell, so the
# rows and Ids cannot drift apart.
submission_file = pd.DataFrame({'Id': data_test.index, 'Category': pred_text})
submission_file.to_csv('to_submit.csv', index=False)