## Importing Important Libraries

In [37]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

## Importing Dataset

In [2]:
# Read 'SalaryData_Test.csv' into data1 and preview its first rows.
# Note: data1 holds the file named "Test", not the training file.
data1 = pd.read_csv('SalaryData_Test.csv',sep=',')
data1.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [3]:
# Read 'SalaryData_Train.csv' into data2 and preview its first rows.
# Note: data2 holds the file named "Train".
data2 = pd.read_csv('SalaryData_Train.csv',sep=',')
data2.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Stack the two files (data1 first, then data2) into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the row
# labels of both inputs are kept and therefore repeat, which makes any later
# label-based lookup (.loc) ambiguous.
data3 = pd.concat([data1, data2], ignore_index=True)
data3.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


## Data Understanding

In [5]:
# (rows, columns) of the combined frame.
data3.shape

(45221, 14)

In [6]:
# (rows, columns) of the frame loaded from 'SalaryData_Test.csv'.
data1.shape

(15060, 14)

In [7]:
# (rows, columns) of the frame loaded from 'SalaryData_Train.csv'.
data2.shape

(30161, 14)

In [8]:
# Count missing values per column of the combined frame.
data3.isnull().sum()

age              0
workclass        0
education        0
educationno      0
maritalstatus    0
occupation       0
relationship     0
race             0
sex              0
capitalgain      0
capitalloss      0
hoursperweek     0
native           0
Salary           0
dtype: int64

In [9]:
# Inspect column dtypes to see which columns are numeric and which are object (string).
data3.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

In [10]:
# Number of distinct values in the 'education' column.
data3.education.nunique()

16

In [11]:
# Distinct values of the numeric 'educationno' column.
data3.educationno.unique()

array([ 7,  9, 12, 10,  6, 15,  4, 13, 14,  3, 11,  5, 16,  8,  2,  1],
      dtype=int64)

In [12]:
# Distinct labels of the 'education' column (to compare against 'educationno').
data3.education.unique()

array([' 11th', ' HS-grad', ' Assoc-acdm', ' Some-college', ' 10th',
       ' Prof-school', ' 7th-8th', ' Bachelors', ' Masters', ' 5th-6th',
       ' Assoc-voc', ' 9th', ' Doctorate', ' 12th', ' 1st-4th',
       ' Preschool'], dtype=object)

In [13]:
# Number of distinct 'education' labels (same check as run earlier in this section).
data3.education.nunique()

16

### Note: For the 'education' column, an 'educationno' column is provided that already holds its numerical encoding. We can either keep 'educationno' and drop 'education', or drop 'educationno' and transform 'education' using a label encoder.

## 4. Data Preparation

In [16]:
# Integer-encode every object-dtype column, including the target 'Salary'.
# One LabelEncoder instance is re-fitted for each column in turn, so once the
# loop finishes `le` holds the mapping of the last column processed ('Salary').
le = LabelEncoder()
categorical_columns = [
    'workclass', 'education', 'maritalstatus', 'occupation',
    'relationship', 'race', 'sex', 'native', 'Salary',
]
for column in categorical_columns:
    data3[column] = le.fit_transform(data3[column])

In [17]:
# Remove 'educationno'; the encoded 'education' column is kept instead.
data3 = data3.drop(columns=['educationno'])

In [18]:
# Confirm that every remaining column is now an integer dtype after encoding.
data3.dtypes

age              int64
workclass        int32
education        int32
maritalstatus    int32
occupation       int32
relationship     int32
race             int32
sex              int32
capitalgain      int64
capitalloss      int64
hoursperweek     int64
native           int32
Salary           int32
dtype: object

In [19]:
# Preview the first 10 rows of the encoded frame.
data3.head(10)

Unnamed: 0,age,workclass,education,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,2,1,4,6,3,2,1,0,0,40,37,0
1,38,2,11,2,4,0,4,1,0,0,50,37,0
2,28,1,7,2,10,0,4,1,0,0,40,37,1
3,44,2,15,2,6,0,2,1,7688,0,40,37,1
4,34,2,0,4,7,1,4,1,0,0,30,37,0
5,63,4,14,2,9,0,4,1,3103,0,32,37,1
6,24,2,15,4,7,4,4,0,0,0,40,37,0
7,55,2,5,2,2,0,4,1,0,0,10,37,0
8,65,2,11,2,6,0,4,1,6418,0,40,37,1
9,36,0,9,2,0,0,4,1,0,0,40,37,0


## 4. Model Building

In [20]:
# Separate the feature matrix X (every column except 'Salary') from the
# target Y, which is kept as a one-column DataFrame.
target_column = 'Salary'
X = data3.drop(columns=[target_column])
Y = data3[[target_column]]

In [23]:
# Hold out 33% of the rows for testing. The split is stratified on Y and uses
# a fixed random_state so that re-running the cell yields the same partition.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=12,
    stratify=Y,
)

In [24]:
# Shapes of the training features and training target.
print(X_train.shape,Y_train.shape)

(30298, 12) (30298, 1)


In [25]:
# Shapes of the held-out test features and test target.
print(X_test.shape,Y_test.shape)

(14923, 12) (14923, 1)


## 5. Model Training  using Gaussian Naive bayes

### Gaussian: It is used when our predictors are continuous. It is also used as a classification algorithm.

In [27]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# Y_train is a one-column DataFrame (shape (n, 1)); scikit-learn expects a 1-D
# target and emits a DataConversionWarning for a column vector, so the target
# is flattened to shape (n,) before fitting.
gnb = GaussianNB()
gnb.fit(X_train, Y_train.values.ravel())

  return f(*args, **kwargs)


GaussianNB()

## 6. Model Testing 

In [28]:
# Predictions of the fitted Gaussian NB model on the data it was trained on.
y_pred_train = gnb.predict(X_train)

In [29]:
# Predictions of the fitted Gaussian NB model on the held-out test split.
y_pred_test = gnb.predict(X_test)

## 7. Model Evaluation 

In [31]:
# Gaussian NB accuracy on the training split.
accuracy_score(Y_train,y_pred_train)

0.7902831870090435

In [32]:
# Gaussian NB accuracy on the held-out test split.
accuracy_score(Y_test,y_pred_test)

0.7916638745560544

In [34]:
# Confusion matrix of Gaussian NB on the test split (true labels first, predictions second).
print('confusion matrix : \n',confusion_matrix(Y_test,y_pred_test))

confusion matrix : 
 [[10663   561]
 [ 2548  1151]]


In [36]:
# Per-class precision / recall / F1 of Gaussian NB on the test split.
print(classification_report(Y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.81      0.95      0.87     11224
           1       0.67      0.31      0.43      3699

    accuracy                           0.79     14923
   macro avg       0.74      0.63      0.65     14923
weighted avg       0.77      0.79      0.76     14923



##  Model Building,Testing and Evaluation using Multinomial Naive bayes

### It is used for discrete counts and is mostly used in document classification problems.

In [38]:
# Fit a Multinomial Naive Bayes classifier on the same training split.
# Y_train is a one-column DataFrame (shape (n, 1)); scikit-learn expects a 1-D
# target and emits a DataConversionWarning for a column vector, so the target
# is flattened to shape (n,) before fitting.
mnb = MultinomialNB()
mnb.fit(X_train, Y_train.values.ravel())

  return f(*args, **kwargs)


MultinomialNB()

In [39]:
# Multinomial NB predictions for the training split and the test split, in that order.
y_pred_train1, y_pred_test1 = (
    mnb.predict(features) for features in (X_train, X_test)
)

In [41]:
# Multinomial NB accuracy on the training split.
accuracy_score(Y_train,y_pred_train1)

0.773516403723018

In [42]:
# Multinomial NB accuracy on the held-out test split.
accuracy_score(Y_test,y_pred_test1)

0.7737720297527307

## CONCLUSION

As mentioned previously, when the predictors are continuous, Gaussian Naive Bayes is the best choice, which is supported by the accuracy we got for the model. The accuracy of Gaussian Naive Bayes (about 0.79 on the test split) is higher than the accuracy of Multinomial Naive Bayes (about 0.77). Hence, we will go with the Gaussian model.