TODO: 
Clean Strings
...


In [1]:
import sys
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import time, random, re, pprint
from itertools import islice, chain
pd.set_option('display.max_columns', None)
print(sys.version_info)

sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)


## Reading Data and Cleaning

In [2]:
%%time
start = time.time()
train_input = pd.read_csv('train_input.csv')
train_output = pd.read_csv('train_output.csv')

CPU times: user 1.04 s, sys: 177 ms, total: 1.22 s
Wall time: 1.33 s


In [3]:
print(train_input.shape)
train_input.head(2)

(165000, 2)


Unnamed: 0,id,conversation
0,0,<speaker_1> seaworld ceo steps down amid tanki...
1,1,<speaker_1> strickland chargers owner dean spa...


In [4]:
delchars = ''.join(c for c in map(chr, range(256)) if not c.isalpha())
delchars = ''.join(ch for ch in delchars if ch !=' ')

def clean_text(paragraph):
    output = re.sub('\<.*?\>','', paragraph) #delete tags (between <..>)
    output = re.sub('\@.*?\s','', output) #delete usernames (words following @)
    output = re.sub('\n','', output) #delete anything following a slash
    output = output.translate(str.maketrans('','',delchars))
    return output

def rem_stopwords(word_list): 
    return list(set(word_list) - set(stopwords.words('English')))

count_stops = Counter(stopwords.words("English")*100)

def rem_stopwords_count(words_list):
    return Counter(words_list) - count_stops

wnl = WordNetLemmatizer()
def lemma_map(word_list):
    return list(map(lambda y: wnl.lemmatize(y), word_list))

def generate_features(dataframe):
    dataframe["text"] = dataframe["conversation"].apply(lambda x: clean_text(x))
    dataframe["words"] = dataframe["text"].apply(lambda x: x.split()) 
#    dataframe["words_lems"] = dataframe["words"].apply(lambda x: lemma_map(x)) # If we want to do lemmatization..
#    dataframe["words_count"] = dataframe["words_lems"].apply(lambda x: rem_stopwords_count(x))
    dataframe["words_count"] = dataframe["words"].apply(lambda x: rem_stopwords_count(x))
    return dataframe

In [5]:
%%time
train_input = generate_features(train_input)
train_input["output"] = train_output["category"]
#strip out all common words (stopwords, which we import from the nltk package)

CPU times: user 21 s, sys: 750 ms, total: 21.7 s
Wall time: 21.9 s


In [6]:
train_input.head()

Unnamed: 0,id,conversation,text,words,words_count,output
0,0,<speaker_1> seaworld ceo steps down amid tanki...,seaworld ceo steps down amid tanking revenues...,"[seaworld, ceo, steps, down, amid, tanking, re...","{'propaganda': 1, 'sway': 1, 'coupled': 1, 'ta...",news
1,1,<speaker_1> strickland chargers owner dean spa...,strickland chargers owner dean spanos and gol...,"[strickland, chargers, owner, dean, spanos, an...","{'utkevinacee': 1, 'acee': 1, 'spanos': 2, 'in...",nfl
2,2,<speaker_1> iniesta plays keepy uppy with one ...,iniesta plays keepy uppy with one leg man yo...,"[iniesta, plays, keepy, uppy, with, one, leg, ...","{'one': 2, 'let': 1, 'find': 1, 'youtube': 1, ...",soccer
3,3,<speaker_1> chappie trailer #1 <number> hugh j...,chappie trailer hugh jackman sci fi comedy...,"[chappie, trailer, hugh, jackman, sci, fi, com...","{'described': 1, 'anyone': 1, 'youtu': 1, 'tow...",movies
4,4,<speaker_1> why the church of satan may get to...,why the church of satan may get to open your ...,"[why, the, church, of, satan, may, get, to, op...","{'one': 1, 'long': 1, 'religious': 1, 'worship...",politics


# Now, Learn the Naive Bayes Model

In [7]:
train_input.groupby("output").size()
#Size of each of the groups

output
hockey       20861
movies       22409
nba          18422
news         21057
nfl          20106
politics     19694
soccer       21363
worldnews    21088
dtype: int64

In [8]:
##Recursive algorithm to efficiently sum a list of counters
def mergeSum(Counter_list):
    if len(Counter_list) < 2:
        return Counter_list.iloc[0]
    
    else: 
        mid = len(Counter_list)//2

        lefthalf = mergeSum(Counter_list[:mid])
        righthalf = mergeSum(Counter_list[mid:])

        my_sum = lefthalf + righthalf
        return my_sum

In [9]:
%%time
#Total word Counter
total_counter = mergeSum(train_input["words_count"])

CPU times: user 1min 11s, sys: 2.22 s, total: 1min 14s
Wall time: 1min 15s


In [10]:
%%time 
#Word Counter for each group
groups_counter = train_input.groupby("output")["words_count"].apply(lambda x: mergeSum(x))

CPU times: user 1min 13s, sys: 2.15 s, total: 1min 15s
Wall time: 1min 16s


In [12]:
len(total_counter)

107517

In [13]:
total_wordcount = sum(total_counter.values())
#Total number of words (not unique)
total_wordcount

6863865

In [14]:
%%time 
total_word_freq = Counter({k:v/total_wordcount for k,v in total_counter.items()})

CPU times: user 103 ms, sys: 396 ms, total: 499 ms
Wall time: 585 ms


In [15]:
#%%time
group_word_freq = {}
groups_freq = groups_counter
for label in train_input["output"].unique():
    group_wordcount = sum(groups_counter[label])
    group_word_freq[label] = Counter({k:v/group_wordcount for k,v in groups_freq[label].items()})

In [16]:
%%time
def get_value_dict(my_dict,key):
    if key in my_dict: 
        return my_dict[key]
    else:
        return 0
group_word_laplace = {}
total_words = len(total_counter)
for label in train_input["output"].unique():
    temp_group = groups_counter[label]
    group_wordcount = sum(groups_counter[label])
    #Conditional probability calculation with laplace smoothing
    group_word_laplace[label] = Counter({k:(get_value_dict(temp_group,k)+1)/(v + total_words)for k,v in total_counter.items()})

CPU times: user 4.9 s, sys: 75.5 ms, total: 4.97 s
Wall time: 5.05 s


In [17]:
%%time
#Build a counter for the IDFs of each word in our corpus
words_doc_list = [list(counter.keys()) for counter in list(train_input["words_count"].values)]
words_doc_counter = Counter(chain.from_iterable(set(x) for x in words_doc_list))
num_docs = len(train_input)
total_words_idf = Counter({k: np.log(num_docs / words_doc_counter[k]) for k in total_counter.keys()})

CPU times: user 3.81 s, sys: 151 ms, total: 3.96 s
Wall time: 3.96 s


In [18]:
%%time
def get_value_dict(my_dict,key):
    if key in my_dict: 
        return my_dict[key]
    else:
        return 0
group_word_idf = {}
total_words = len(total_counter)
for label in train_input["output"].unique():
    temp_group = groups_counter[label]
    group_wordcount = sum(groups_counter[label])
    #Conditional probability calculation with laplace smoothing
    group_word_idf[label] = Counter({k:((get_value_dict(temp_group,k) + 1) / (v + total_words) * total_words_idf[k] )for k,v in total_counter.items()})

CPU times: user 5.17 s, sys: 58 ms, total: 5.23 s
Wall time: 5.24 s


# Predict on New Data

In [19]:
test_input = pd.read_csv('test_input.csv')
print(len(test_input))

53218


In [20]:
%%time
test_input = generate_features(test_input)

CPU times: user 6.34 s, sys: 228 ms, total: 6.56 s
Wall time: 6.59 s


In [21]:
test_input.head(2)

Unnamed: 0,id,conversation,text,words,words_count
0,0,<speaker_1> philadelphia is decriminalizing ma...,philadelphia is decriminalizing marijuana pos...,"[philadelphia, is, decriminalizing, marijuana,...","{'vice': 1, 'congress': 1, 'sway': 1, 'hopes':..."
1,1,<speaker_1> david cameron pushes for repeal of...,david cameron pushes for repeal of u k s hu...,"[david, cameron, pushes, for, repeal, of, u, k...","{'harper': 1, 'rights': 1, 'k': 1, 'australia'..."


In [22]:
#Now, create a function to predict class for each text snippet 
categories = train_input["output"].unique()
class_priors = {}
for category in categories:
    class_priors[category] = train_input.groupby("output").size()[category] / len(train_input)
    
total_words = len(total_counter)

def get_conditional(word, category): 
    #function that gets around cases where we haven't seen the word before
    if word in group_word_laplace[category]:
        return group_word_laplace[category][word]
    else: 
        return (1 / total_words)
    
def get_conditional_idf(word, category): 
    #function that gets around cases where we haven't seen the word before
    if word in group_word_idf[category]:
        return group_word_idf[category][word]
    else: 
        return (1 / total_words)

def predict_class(word_counter):
    classes_prob = {}
    for category in categories:
        classes_prob[category] = 1
        for k, v in word_counter.items():
            classes_prob[category] *= (get_conditional(k,category) ** v)
        classes_prob[category] *= class_priors[category]
        #update with the prior class probability 
    return max(classes_prob, key = classes_prob.get) 

def predict_class_idf(word_counter):
    classes_prob = {}
    for category in categories:
        classes_prob[category] = 1
        for k, v in word_counter.items():
            classes_prob[category] *= (get_conditional_idf(k,category) ** v)
        classes_prob[category] *= class_priors[category]
        #update with the prior class probability 
    return max(classes_prob, key = classes_prob.get)  

def predict_class_dict(word_counter):
    classes_prob = {}
    for category in categories:
        classes_prob[category] = 1
        for k, v in word_counter.items():
            classes_prob[category] *= (get_conditional(k,category) ** v)
        classes_prob[category] *= class_priors[category]
        #update with the prior class probability 
    return classes_prob

In [23]:
predict_class_dict(test_input["words_count"][100])

{'hockey': 2.849403069937408e-157,
 'movies': 9.3257795979085874e-156,
 'nba': 2.9819659321157092e-160,
 'news': 3.971654743780464e-156,
 'nfl': 4.541527357186871e-156,
 'politics': 6.2294448461937019e-163,
 'soccer': 1.2404713992481133e-142,
 'worldnews': 2.2744387194795351e-158}

In [24]:
%%time
test_output_series = test_input["words_count"].apply(lambda x: predict_class(x))

CPU times: user 17.4 s, sys: 77.7 ms, total: 17.4 s
Wall time: 17.5 s


In [25]:
%%time
test_output_series_idf = test_input["words_count"].apply(lambda x: predict_class_idf(x))

CPU times: user 17.6 s, sys: 37.3 ms, total: 17.6 s
Wall time: 17.7 s


In [28]:
len(test_input) - sum(test_output_series == test_output_series_idf)

199

In [27]:
test_output = pd.DataFrame(test_input["id"])
test_output["category"] = test_output_series
test_output.to_csv("naive_bayes_prediction.csv", index = False)