In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_data = pd.read_csv("kg_train.csv/kg_train.csv",encoding='latin-1')

sub_data_train, sub_data_val, sub_label_train, sub_label_val = train_test_split(train_data, train_data["label"], test_size=0.3, random_state=5)

In [3]:
import string
import re
print(string.punctuation)
print (re.escape(string.punctuation))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
!"\#\$%\&'\(\)\*\+,\-\./:;<=>\?@\[\\\]\^_`\{\|\}\~


In [8]:
# from spellchecker import SpellChecker
from textblob import TextBlob

import nltk
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

from nltk.corpus import stopwords


patterns = [
re.compile(r'[^\w\d\s\:\)\(]'),
re.compile(r'\s+[a-zA-Z]\s+'),
#re.compile(r'\s+\d+[a-zA-Z]+|[a-zA-Z]+\d+\s+|\s+[a-zA-Z]+\d+[a-zA-Z]+\s+'),
re.compile(r'\s*([0-9])+\s*'),
re.compile(r'\^[a-zA-Z]\s+')]

regexHTML = re.compile(r'<[^>]*?>')

wordnet_lemma  = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    
    return tag_dict.get(tag, wordnet.NOUN)

def lemmatize_text(doc):
    list_words = []
    for item in doc.split():
        list_words.append(wordnet_lemma.lemmatize(item,get_wordnet_pos(item)))
    return(' '.join(map(str, list_words)))


def bag_of_words_not_in_set(words, badwords):
    return bag_of_words(set(words) - set(badwords))

def bag_of_non_stopwords(words, stopfile = 'english'):
    badwords = stopwords.words(stopfile)
    return bag_of_words_not_in_set(words, badwords)


def clean_text(text):
    
    # remove all HTML tags
    processed_feature = re.sub(regexHTML, ' ', str(text))
    
    # tags since comments can contain '>' characters
    processed_feature = re.sub(r"(?s)<!--(.*?)-->[\n]?", ' ', processed_feature)
    
    #Remove all the special characters except :, ) and (
    processed_feature = re.sub(patterns[0], ' ', processed_feature)
    
    # remove numbers attached to strings and single numbers
    processed_feature= re.sub(patterns[2], ' ', processed_feature)
    
    # remove all single characters
    #processed_feature= re.sub(patterns[1], ' ', processed_feature)

    # Remove single characters from the start
    #processed_feature = re.sub(patterns[3], ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    
    ######### Correct spelling and grammar: Using Textblob
    #blob = TextBlob(processed_feature)
    #processed_feature = str(blob.correct())

    # Converting to Lowercase
    processed_feature = processed_feature.lower()
    
#     return lemmatize_text(processed_feature)
    return processed_feature




In [None]:
sub_data_train['processed_text'] = sub_data_train['text'].apply(clean_text)

In [14]:
sub_data_train_ham = sub_data_train[sub_data_train['label'] == 0]['processed_text']
sub_data_train_spam = sub_data_train[sub_data_train['label'] == 1]['processed_text']

5650                                                   ok
213     the current revcon document includes a paragra...
2084                                                  fyi
4785                    sorry to confirm they are not in 
4314    from israel radioby shmuel tal prime minister ...
623                                   b release in partb 
2741    h monday january : am kellyc state gov re: ven...
2009    fyi:usibc launches new education initiative to...
4930    i will be in and out of the office this week b...
3498    report: north korea willing to hold talks with...
2151    i ll wait to see him in person next week do we...
3539           fyi background for call sheet you will get
5364                                       yes i will do 
3212                                                  fyi
823      is latest figure based on end fy data (pakist...
2879    jim hoge says that he really would like it by ...
5483    ok i m forwarding you more background from sha...
4061          

In [128]:
# clean_text(train_data['text'][2915])

'mill cheryl d monday january 18 2010 2:26 pmh doug band justin cooper cheryl millsmills cheryl dre: glad you re therediane reynolds mmezvinskyjust leave the hospital really rough and sad mark paul and others do the lord plus 10 worki sent keen an email re no security troop just arrive so the hopefully they will be able to keepb doctor there tonight around the clock cdmsent: mon jan 18 14:22:08 : glad you re therepi give me periodic update about what you re see and do love to all'

In [20]:
from collections import Counter

def build_lexicon(corpus): # define a set with all possible words included in all the sentences or "corpus"
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split()])
    return lexicon

def freq(term, document):
    return document.split().count(term)

def tf(term, document):
    return freq(term, document)


vocabulary = build_lexicon(sub_data_train_ham)

ham_term_matrix = []
# print ('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']\n')

for doc in sub_data_train_ham:    
#     print ('The doc is "' + doc + '"')
    ham_tf_vector = [tf(word, doc) for word in vocabulary]
    ham_tf_vector_string = ', '.join(format(freq, 'd') for freq in ham_tf_vector)
    
#     print ('The tf vector for Document %d is [%s]|\n' % ((sub_data_train_ham.index(doc)+1), ham_tf_vector_string))
    ham_term_matrix.append(ham_tf_vector)

# print ('\nAll combined, here is our master document term matrix: ')
print (ham_term_matrix)

KeyboardInterrupt: 

In [130]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


# sub_data_train, sub_data_val, sub_label_train, sub_label_val = train_test_split(train_data, train_data["label"], test_size=0.3, random_state=5)

bow_transformer = CountVectorizer().fit(sub_data_train['processed_text'])

X_train = bow_transformer.transform(sub_data_train['processed_text'])
X_val  = bow_transformer.transform(sub_data_val['processed_text'])

print(X_train.shape)
print(X_val.shape)

sub_data_train['processed_text']

(4174, 52203)
(1790, 52203)


5650                                                   ok
213     the current revcon document include a paragrap...
2084                                                  fyi
4785                      sorry to confirm they be not in
4314    from israel radioby shmuel tal prime minister ...
                              ...                        
3046                   we have italready work on schedule
1725    abedin huma friday may 22 2009 4:48 pmcalls ca...
4079    5 rider haggard close jo borg south africa con...
2254    some cat and dogs:1 talk to denis about eu sum...
2915    mill cheryl d monday january 18 2010 2:26 pmh ...
Name: processed_text, Length: 4174, dtype: object

In [131]:
#Learn Classifier
clf = MultinomialNB().fit(X_train, sub_label_train)
#Predict Val data
pred_val = clf.predict(X_val)

accuracy = accuracy_score(sub_label_val,pred_val)
print(accuracy)
confusion_matrix(sub_label_val, pred_val)

0.9553072625698324


array([[939,  65],
       [ 15, 771]])

In [133]:
data_test = pd.read_csv("kg_test.csv/kg_test.csv",encoding='latin-1')
X_test = bow_transformer.transform(data_test['text'].apply(clean_text))
pred_text = clf.predict(X_test)
submission_file = pd.DataFrame({'Id': data_test.index,'Category':pred_text})
submission_file.to_csv('to_submit.csv',index=False)

# To do's

In [None]:
#FIRST: Lemmatizing (DONE)

In [None]:
#SECOND: Spelling and grammar correction

In [None]:
#THIRD: Implement formula to compute efficiency-accuracy ratio

In [None]:
#FOURTH: Weird a's show up in dictionary