# In [None]:
import pandas as pd

# Load the training set; the file is decoded as latin-1.
train_data = pd.read_csv(
    "kg_train.csv/kg_train.csv",
    encoding="latin-1",
)

import re
def clean_text(text):
    """Normalise a raw message into lowercase, space-separated tokens.

    The input is converted with ``str()`` first, so non-string values are
    cleaned through their string form.

    Steps, applied in this order:

    1. Replace every non-word character (anything other than letters,
       digits and underscore) with a space.
    2. Drop single ASCII letters that have whitespace on both sides.
    3. Replace each run of digits, plus adjacent whitespace, with a space.
    4. Replace every occurrence of ``www`` with a space.
    5. Drop a single ASCII letter at the very start of the text.
    6. Collapse whitespace runs into a single space.
    7. Drop a leading standalone ``b``.
    8. Lowercase the result.

    The result is not stripped: a single leading or trailing space may
    remain.

    Args:
        text: The raw message; any object accepted by ``str()``.

    Returns:
        The cleaned, lowercased string.
    """
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(text))

    # Remove all single characters surrounded by whitespace
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove all single numbers, or all numbers attached to a string
    processed_feature = re.sub(r'\s*([0-9])+\s*', ' ', processed_feature)

    # Remove all www's
    processed_feature = re.sub(r'www', ' ', processed_feature)

    # Remove a single character from the start. The caret must be an
    # unescaped anchor: an escaped one looks for a literal '^', which the
    # special-character step above has already removed. Substituting ''
    # (not ' ') avoids introducing a leading space.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', '', processed_feature)

    # Substitute multiple spaces with a single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Remove a prefixed 'b' that surfaced after the steps above
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Convert to lowercase
    return processed_feature.lower()

# Clean every training message once and keep the result in its own column.
train_data.loc[:, 'preprocessed_text'] = train_data['text'].apply(clean_text)

from collections import Counter

# Split the training rows by label: 0 marks HAM, 1 marks SPAM.
data_ham = train_data.loc[train_data['label'] == 0].copy()
data_spam = train_data.loc[train_data['label'] == 1].copy()

# Cleaned message strings for each class.
words_data_ham = data_ham['preprocessed_text']
words_data_spam = data_spam['preprocessed_text']

# Flatten each class's messages into one list of whitespace-separated words.
list_ham_words = [word for message in words_data_ham for word in message.split()]
list_spam_words = [word for message in words_data_spam for word in message.split()]

# Per-class word frequencies.
c_ham = Counter(list_ham_words)
c_spam = Counter(list_spam_words)

#df_hamwords_top10  = pd.DataFrame(c_ham.most_common(10),  columns=['word', 'count']) #save the top10 most common words in list_ham_words
#df_spamwords_top10 = pd.DataFrame(c_spam.most_common(10), columns=['word', 'count']) #save the top10 most common words in list_spam_words

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 40% of the labelled data for validation; the seed is fixed so
# the split is reproducible.
sub_data_train, sub_data_val, sub_label_train, sub_label_val = train_test_split(
    train_data, train_data["label"], test_size=0.4, random_state=5
)

# Learn the vocabulary on the training split only and vectorise it in the
# same pass; the validation split is encoded with that fitted vocabulary.
bow_transformer = CountVectorizer()
X_train = bow_transformer.fit_transform(sub_data_train['preprocessed_text'])
X_val = bow_transformer.transform(sub_data_val['preprocessed_text'])

print(X_train.shape)
print(X_val.shape)

# Learn Classifier
clf = MultinomialNB().fit(X_train, sub_label_train)
# Predict Val data
pred_val = clf.predict(X_val)

accuracy = accuracy_score(sub_label_val, pred_val)
print(accuracy)
# Print explicitly: a bare expression is only displayed when it is the
# last statement of a notebook cell, and is silently dropped otherwise.
print(confusion_matrix(sub_label_val, pred_val))

# Load the test set and apply the same cleaning and fitted vocabulary that
# were used for training.
data_test = pd.read_csv("kg_test.csv/kg_test.csv", encoding='latin-1')
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)

# Use the test frame's own index as the Id column (the name is data_test,
# not test_data, which is undefined in this file).
submission_file = pd.DataFrame({'Id': data_test.index, 'Category': pred_text})
submission_file.to_csv('to_submit.csv', index=False)