In [1]:
from sklearn import model_selection, naive_bayes, metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [2]:
dataPath = '../data/'
trainFilePath = dataPath + 'train.csv'

In [3]:
trainDF = pd.read_csv(trainFilePath)
trainDF.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [4]:
# Split 80% - 20% train validation
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['question_text'], trainDF['target'], test_size=0.2, random_state=42)

train_y = train_y.values
valid_y = valid_y.values

In [5]:
trainDF.shape, train_x.shape, valid_x.shape

((1306122, 3), (1044897,), (261225,))

In [6]:
#Analysing Positive and Negative Samples in train.csv
#Will Help us understand the accuracy if we predict all output as 0
trainDF[['question_text','target']].groupby('target').count()

Unnamed: 0_level_0,question_text
target,Unnamed: 1_level_1
0,1225312
1,80810


In [7]:
#Negative ratio for the entire train data
positiveCount = 80810
negativeCount = 1225312
negativeRatio = negativeCount/ (negativeCount + positiveCount)
negativeRatio

0.9381298224821265

In [8]:
#Negative ratio for the validation set. We get accuracy = 0.939 if we predict 0 for all inputs
1 - sum(valid_y)/len(valid_y)

0.9393013685520145

In [9]:
# Naive Bayes for different sized vocabulary controlled by xtrain_tfidf and xvalid_tfidf
# train_y and valid_y remain the same
def naiveBayes(xtrain_tfidf, xvalid_tfidf):
    classifier = naive_bayes.MultinomialNB()
    classifier.fit(xtrain_tfidf, train_y)
    predictions = classifier.predict(xvalid_tfidf)
    accuracy = metrics.accuracy_score(predictions, valid_y)
    f1 = metrics.f1_score(predictions, valid_y)
    return {'accuracy':accuracy, 'f1': f1}

In [10]:
#tf-idf on all the words
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}')
tfidf_vect.fit(trainDF['question_text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

vocabularySize = xtrain_tfidf.shape[1]
print("Total words in Vocabulary: ", vocabularySize)

print(naiveBayes(xtrain_tfidf, xvalid_tfidf))

Total words in Vocabulary:  195390
{'f1': 0.10676946440044391, 'accuracy': 0.9414565987175806}


In [11]:
def naiveBayesVocabPercentage(percentage):
    # tf-idf on percentage% of the vocabulary
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', \
                                 max_features = int((percentage/100)*vocabularySize))
    tfidf_vect.fit(trainDF['question_text'])
    xtrain_tfidf =  tfidf_vect.transform(train_x)
    xvalid_tfidf =  tfidf_vect.transform(valid_x)
    return(naiveBayes(xtrain_tfidf, xvalid_tfidf))

In [12]:
percentageList = [70,75,80,85,90,95]
f1List = []
accuracyList = []
for percentage in percentageList:
    f1List.append(naiveBayesVocabPercentage(percentage)['f1'])
    accuracyList.append(naiveBayesVocabPercentage(percentage)['accuracy'])
f1List, accuracyList

([0.1820991060589339,
  0.16553034536716218,
  0.1505656108597285,
  0.1376434976297904,
  0.12552590628782204,
  0.11592689295039164],
 [0.9432596420710115,
  0.9428385491434587,
  0.9425093310364628,
  0.942199253517083,
  0.9419159728203655,
  0.9416709732988803])