<a href="https://colab.research.google.com/github/deanhoperobertson/NLP-Text-Classifiation-/blob/master/Naive_Bayes_(Model_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Naive Bayes

In [0]:
import urllib.request
import pandas as pd
import urllib
from sklearn import model_selection, preprocessing

#Vizualization
from matplotlib import pyplot
%matplotlib inline

#Feature Engineering
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn import model_selection, preprocessing

#keras
from keras.preprocessing.text import Tokenizer

#Model Building
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

In [0]:
#import data from my github repo
url = "https://raw.githubusercontent.com/deanhoperobertson/NLP-Text-Classifiation-/master/corpus.txt"
data = urllib.request.urlopen(url).read()
data = data.decode('utf-8')

In [0]:
labels, texts = [], []
for i, line in enumerate(str(data).split("\n")):
    content = line.replace("\\","").split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pd.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

In [0]:
#split into train and test set (default is 25%)
#set seed for consistency
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'], test_size=0.25, random_state=1)

# encode target labels into intergers between 0 - no.classes
# label 2 = 1
# label 1 = 0
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

### Count Vectorization

In [0]:
# instantiate a count vectorizer object to tokenize on a word level
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', strip_accents ="ascii", stop_words=None)
count_vect.fit(trainDF['text'])

#see the unique words being tokenized 
count_vect.get_feature_names()[1:10]

#create a document term matrix using the 
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

### TDIF Vectorization

In [0]:
# vectorizing on a word level
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

### N-Gram Vectorization

In [0]:

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

## 1. Build Model

In [0]:
#Define the model build as a function

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

### 1.1 Naive Bayes

In [35]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: %s %s" %(accuracy,"%"))


# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF:  %s %s" %(accuracy,"%"))

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors:  %s %s" %(accuracy,"%"))


NB, Count Vectors: 0.8336 %
NB, WordLevel TF-IDF:  0.8388 %
NB, N-Gram Vectors:  0.8412 %
