In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_data = pd.read_csv("fraudulent-e-mails-spam-or-ham/kg_train.csv/kg_train.csv",encoding='latin-1')

sub_data_train, sub_data_val, sub_label_train, sub_label_val = train_test_split(train_data, train_data["label"], test_size=0.3, random_state=5)

In [2]:
import string
import re

In [3]:
import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

patterns = [
re.compile(r'(?!:\()[^a-zA-Z0-9 ](?<!:\()'),
re.compile(r'\s+[a-zA-Z]\s+'),
re.compile(r'\s+\d+[a-zA-Z]+|[a-zA-Z]+\d+\s+|\s+[a-zA-Z]+\d+[a-zA-Z]+\s+'),
re.compile(r'\^[a-zA-Z]\s+')]

regexHTML = re.compile(r'<[^>]*?>')

wordnet_lemma  = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(doc):
    list_words = []
    for item in doc.split():
        list_words.append(wordnet_lemma.lemmatize(item,get_wordnet_pos(item)))
    return(' '.join(map(str, list_words)))


def clean_text(text):
    
    # remove all HTML tags
    processed_feature = re.sub(regexHTML, ' ', str(text))
    
    # tags since comments can contain '>' characters
    processed_feature = re.sub(r"(?s)<!--(.*?)-->[\n]?", ' ', processed_feature)
    
    # remove numbers attached to strings and single numbers
    processed_feature= re.sub(patterns[2], ' ', processed_feature)
    
    #Remove all the special characters except smileys
    processed_feature = re.sub(patterns[0], ' ', processed_feature)
    
    # remove all single characters
    processed_feature= re.sub(patterns[1], ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(patterns[3], ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)


    # Converting to Lowercase
    processed_feature = processed_feature.lower()
    
    return lemmatize_text(processed_feature)

In [4]:
sub_data_train['processed_text'] = sub_data_train['text'].apply(clean_text)
sub_data_val['processed_text'] = sub_data_val['text'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data_train['processed_text'] = sub_data_train['text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_data_val['processed_text'] = sub_data_val['text'].apply(clean_text)


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction import text

#TFIDF For the whole corpus
count_vectorizer = CountVectorizer(min_df=1) 
term_freq_matrix = count_vectorizer.fit_transform(sub_data_train['processed_text'])
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(term_freq_matrix)
tf_idf_matrix = tfidf.transform(term_freq_matrix)
# print idf values
df_idf = pd.DataFrame(tfidf.idf_, index=count_vectorizer.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf_sorted = df_idf.sort_values(by=['idf_weights'])

In [6]:
sub_data_train_ham = sub_data_train.loc[sub_data_train['label']==0, 'processed_text']
sub_data_train_spam = sub_data_train.loc[sub_data_train['label']==1, 'processed_text']

#TFIDF for HAM
count_vectorizer = CountVectorizer(min_df=1) 
term_freq_matrix_ham = count_vectorizer.fit_transform(sub_data_train_ham)
tfidf_ham = TfidfTransformer(norm="l2")
tfidf_ham.fit(term_freq_matrix_ham)
tf_idf_matrix_ham = tfidf_ham.transform(term_freq_matrix_ham)
# print idf values
df_idf_ham = pd.DataFrame(tfidf_ham.idf_, index=count_vectorizer.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf_sorted_ham = df_idf_ham.sort_values(by=['idf_weights'])

#TFIDF for SPAM
term_freq_matrix_spam = count_vectorizer.fit_transform(sub_data_train_spam)
tfidf_spam = TfidfTransformer(norm="l2")
tfidf_spam.fit(term_freq_matrix_spam)
tf_idf_matrix_spam = tfidf_spam.transform(term_freq_matrix_spam)
# print idf values
df_idf_spam = pd.DataFrame(tfidf_spam.idf_, index=count_vectorizer.get_feature_names(),columns=["idf_weights"])
 
# sort ascending
df_idf_sorted_spam = df_idf_spam.sort_values(by=['idf_weights'])

#Restrict to 30% most common words that appear both in ham TFIDF and spam TFIDF
df_idf_sorted_top = df_idf_sorted[:int(df_idf_sorted.size*0.33)].index.tolist()
spam_top = df_idf_sorted_spam[:int(df_idf_sorted_spam.size*0.33)].index.tolist()
ham_top = df_idf_sorted_ham[:int(df_idf_sorted_ham.size*0.33)].index.tolist()

stopwords_top=[]
for word in df_idf_sorted_top:
    if (word in spam_top) and (word in ham_top):
        stopwords_top.append(word)

stopwords_top = set(stopwords_top)

In [7]:
#Apart from getting rid of the most common words in the english language, we also get rid of the most common words in both spam and ham
bow_transformer = CountVectorizer(strip_accents = 'unicode', stop_words = text.ENGLISH_STOP_WORDS.union(stopwords_top)).fit(sub_data_train['processed_text'])

X_train = bow_transformer.transform(sub_data_train['processed_text'])
X_val  = bow_transformer.transform(sub_data_val['processed_text'])

#Learn Classifier
clf = MultinomialNB().fit(X_train, sub_label_train)
#Predict Val data
pred_val = clf.predict(X_val)

accuracy = accuracy_score(sub_label_val,pred_val)
print(accuracy)

0.9849162011173185


In [None]:
data_test = pd.read_csv("fraudulent-e-mails-spam-or-ham/kg_test.csv/kg_test.csv",encoding='latin-1')
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)
submission_file = pd.DataFrame({'Id': data_test.index,'Category':pred_text})
submission_file.to_csv('to_submit.csv',index=False)