#### Naive Bayes Classifier - Implementation from scratch 
The notbook contains the code to implement the Naive Bayes Classifier from Scratch.

### Importing required libraries

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re

### Importing the dataset

In [13]:
inp_dataset = pd.read_csv("C:\\Ujjwal\\Analytics\\Datasets\\News Classification\\News_train.csv")
inp_dataset.head(2)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business


### Cleaning the articles to remove the unwanted characters

In [14]:
def text_clean(text_series):
    clean_1 = text_series.str.replace(r"[^a-zA-Z\s]","")
    clean_2 = clean_1.str.replace(r"\s+", " ")
    return clean_2

In [15]:
inp_dataset["Text_Clean"] = text_clean(inp_dataset["Text"])

In [53]:
Y = inp_dataset['Category']
train_x, test_x, train_y, test_y  = train_test_split(inp_dataset,Y,random_state = 8)
train_x.reset_index(inplace = True, drop = True)

### Creating Bag of Words

In [55]:
Cnt_Vec = CountVectorizer()
BOW = Cnt_Vec.fit_transform(train_x["Text_Clean"]).toarray()
BOW_Df = pd.DataFrame(BOW, columns=Cnt_Vec.get_feature_names())

### Merging Dataframes & Creating training and test set

In [56]:
inp_dataset_final = pd.merge(train_x, BOW_Df, left_index=True, right_index=True, how = "left")

### Consolidating Bag of Words on the basis of News Categories

In [58]:
cons_dataset = inp_dataset_final.groupby("Category").agg({col:"count" if col == "ArticleId" else "sum" for col in inp_dataset_final.columns})
cols = [col for col in cons_dataset.columns if col not in ["Text", "Category", "Text_Clean"]]
cons_dataset_updated = cons_dataset[cols].copy()
cons_dataset_updated.reset_index(inplace=True)
cons_dataset_updated["sum"] = cons_dataset_updated.iloc[:,2:].apply(lambda x: np.sum(x), axis=1)

### Creating Probability Table

In [59]:
alpha = 1
prob_table = pd.DataFrame()
prob_table["Category"] = cons_dataset_updated["Category"]
prob_table["p_C"] = cons_dataset_updated["ArticleId"]/cons_dataset_updated.shape[0]
cols = [col for col in cons_dataset_updated.columns if col not in ["Category", "ArticleId", "sum"]]
no_of_cols = len(cols)
for col in cols:
    prob_table[col] = (cons_dataset_updated[col]+alpha)/(cons_dataset_updated["sum"] + (alpha*no_of_cols))

### Creating a word tokenizer

In [30]:
def wt(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+"," ", text)
    list_of_words = text.split(" ")
    return list_of_words

### Testing the classification

In [60]:
test_x.reset_index(drop=True, inplace=True)

In [61]:
correct = 0
incorrect = 0
for i in range(0,test_x.shape[0]-200):
    text = test_x.loc[i,"Text_Clean"]
    prob = pd.DataFrame()
    prob["Category"] = prob_table["Category"]
    prob["prob"] = prob_table['p_C']
    for val in wt(text):
        if val in prob_table.columns:
            prob["prob"] = prob["prob"] * prob_table[val] * 1000
        else:
            prob["prob"] = prob["prob"] * 1
    prob["probability"] = prob["prob"]/prob["prob"].sum()
    prob.sort_values("probability",ascending = False, inplace=True)
    if test_x.loc[i,'Category'] == prob.iloc[0,0]:
        correct += 1
    else:
        incorrect +=1
print(correct,"-",incorrect)

165 - 8
