Constructing a Naive Bayes Classifier
 
Combine all the preprocessing techniques and create a dictionary of words and each word’s count in training data.

Calculate probability for each word in a text and filter the words which have a probability less than threshold probability. Words with probability less than threshold probability are irrelevant.
Then for each word in the dictionary, create a probability of that word being in insincere questions and its probability insincere questions. Then finding the conditional probability to use in naive Bayes classifier.
Prediction using conditional probabilities.

In [53]:
import numpy as np 
import pandas as pd
import os

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [54]:
train.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [55]:
print ('Shape of train ',train.shape)
print ('Shape of test ',test.shape)

Shape of train  (1306122, 3)
Shape of test  (375806, 2)


In [56]:
print ('Taking a look at Sincere Questions')
train.loc[train['target'] == 0].sample(5)['question_text']

print ('Taking a look at Insincere Questions')
train.loc[train['target'] == 1].sample(5)['question_text']

Taking a look at Sincere Questions
Taking a look at Insincere Questions


471755    Why is America so racist compared to western E...
748477      What kind of moron designed the flag of Cyprus?
104709    Why do feminists oppose men's rights groups be...
651101          Do black people hate white people nowadays?
432337    Can Indian Muslims with support of neighbourin...
Name: question_text, dtype: object

In [57]:
samp = train.sample(1)
sentence = samp.iloc[0]['question_text']
print (sentence)

What are the limitations of private security? Do they have the ability to pull people over on a public road?


In [58]:
import re
sentence = re.sub(r'\d+','',sentence)
print ('Sentence After removing numbers\n',sentence)

#Removing Punctuations in a string.

import string
sentence = sentence.translate(sentence.maketrans("","",string.punctuation))
print ('Sentence After Removing Punctuations\n',sentence)

Sentence After removing numbers
 What are the limitations of private security? Do they have the ability to pull people over on a public road?
Sentence After Removing Punctuations
 What are the limitations of private security Do they have the ability to pull people over on a public road


In [59]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))
words_in_sentence = list(set(sentence.split(' ')) - stop_words)
print (words_in_sentence)

['road', 'security', 'private', 'public', 'limitations', 'people', 'ability', 'Do', 'pull', 'What']


[nltk_data] Downloading package punkt to /Users/Admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
from nltk.stem import PorterStemmer
nltk.download('wordnet')
stemmer= PorterStemmer()
for i,word in enumerate(words_in_sentence):
    words_in_sentence[i] = stemmer.stem(word)
print (words_in_sentence)    

#Lemmatization of Words
#Lemmatisation is the process of grouping together the different inflected forms of a word so they can be analysed as a single item. Ex: dogs -> dog. I am not clear with difference between lemmatization and stemming. In most of the tutorials, I found them both and I could not understand the clear difference between the two.

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
words = []
for i,word in enumerate(words_in_sentence):
    words_in_sentence[i] = lemmatizer.lemmatize(word)
print (words_in_sentence)

['road', 'secur', 'privat', 'public', 'limit', 'peopl', 'abil', 'do', 'pull', 'what']
['road', 'secur', 'privat', 'public', 'limit', 'peopl', 'abil', 'do', 'pull', 'what']


[nltk_data] Downloading package wordnet to /Users/Admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(train, test_size=0.2)

In [62]:
word_count = {}
word_count_sincere = {}
word_count_insincere = {}
sincere  = 0
insincere = 0 

import re
import string
import nltk
stop_words = set(nltk.corpus.stopwords.words('english'))
from nltk.stem import PorterStemmer
stemmer= PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [63]:
row_count = train.shape[0]
for row in range(0,row_count):
    insincere += train.iloc[row]['target']
    sincere += (1 - train.iloc[row]['target'])
    sentence = train.iloc[row]['question_text']
    sentence = re.sub(r'\d+','',sentence)
    sentence = sentence.translate(sentence.maketrans("","",string.punctuation))
    words_in_sentence = list(set(sentence.split(' ')) - stop_words)
    for index,word in enumerate(words_in_sentence):
        word = stemmer.stem(word)
        words_in_sentence[index] = lemmatizer.lemmatize(word)
    for word in words_in_sentence:
        if train.iloc[row]['target'] == 0:   #Sincere Words
            if word in word_count_sincere.keys():
                word_count_sincere[word]+=1
            else:
                word_count_sincere[word] = 1
        elif train.iloc[row]['target'] == 1: #Insincere Words
            if word in word_count_insincere.keys():
                word_count_insincere[word]+=1
            else:
                word_count_insincere[word] = 1
        if word in word_count.keys():        #For all words. I use this to compute probability.
            word_count[word]+=1
        else:
            word_count[word]=1

print('Done')

Done


In [64]:
word_probability = {}
total_words = 0
for i in word_count:
    total_words += word_count[i]
for i in word_count:
    word_probability[i] = word_count[i] / total_words

#Eliminating words which are insignificant. Insignificant words are words which have a probability of occurence less than 0.0001.
print ('Total words ',len(word_probability))
print ('Minimum probability ',min (word_probability.values()))
threshold_p = 0.0001
for i in list(word_probability):
    if word_probability[i] < threshold_p:
        del word_probability[i]
        if i in list(word_count_sincere):   #list(dict) return it;s key elements
            del word_count_sincere[i]
        if i in list(word_count_insincere):  
            del word_count_insincere[i]
print ('Total words ',len(word_probability))

Total words  163880
Minimum probability  1.2684068024149451e-07
Total words  1583


In [65]:
total_sincere_words = sum(word_count_sincere.values())
cp_sincere = {}  #Conditional Probability
for i in list(word_count_sincere):
    cp_sincere[i] = word_count_sincere[i] / total_sincere_words

total_insincere_words = sum(word_count_insincere.values())
cp_insincere = {}  #Conditional Probability
for i in list(word_count_insincere):
    cp_insincere[i] = word_count_insincere[i] / total_insincere_words


In [67]:
row_count = test.shape[0]

p_insincere = insincere / (sincere + insincere)
p_sincere = sincere / (sincere + insincere)
accuracy = 0

for row in range(0,row_count):
    sentence = test.iloc[row]['question_text']
    target = test.iloc[row]['target']
    sentence = re.sub(r'\d+','',sentence)
    sentence = sentence.translate(sentence.maketrans("","",string.punctuation))
    words_in_sentence = list(set(sentence.split(' ')) - stop_words)
    for index,word in enumerate(words_in_sentence):
        word = stemmer.stem(word)
        words_in_sentence[index] = lemmatizer.lemmatize(word)
    insincere_term = p_insincere
    sincere_term = p_sincere
    
    sincere_M = len(cp_sincere.keys())
    insincere_M = len(cp_insincere.keys())
    for word in words_in_sentence:
        if word not in cp_insincere.keys():
            insincere_M +=1
        if word not in cp_sincere.keys():
            sincere_M += 1
         
    for word in words_in_sentence:
        if word in cp_insincere.keys():
            insincere_term *= (cp_insincere[word] + (1/insincere_M))
        else:
            insincere_term *= (1/insincere_M)
        if word in cp_sincere.keys():
            sincere_term *= (cp_sincere[word] + (1/sincere_M))
        else:
            sincere_term *= (1/sincere_M)
        
    if insincere_term/(insincere_term + sincere_term) > 0.5:
        response = 1
    else:
        response = 0
    if target == response:
        accuracy += 1
    
print ('Accuracy is ',accuracy/row_count*100)

Accuracy is  94.16020671834625
