<h3>Importing Libraries</h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

<h3>Data Cleaning</h3>

<h4>Importing Data</h4>

In [2]:
data=pd.read_csv('./onlinefoods.csv')

In [3]:
data.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


In [4]:
# Drop the stray 'Unnamed: 12' column (in the preview rows it mirrors 'Output').
# Rebinding instead of mutating in place, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=['Unnamed: 12'], errors='ignore')

In [5]:
data.describe()

Unnamed: 0,Age,Family size,latitude,longitude,Pin code
count,388.0,388.0,388.0,388.0,388.0
mean,24.628866,3.280928,12.972058,77.60016,560040.113402
std,2.975593,1.351025,0.044489,0.051354,31.399609
min,18.0,1.0,12.8652,77.4842,560001.0
25%,23.0,2.0,12.9369,77.565275,560010.75
50%,24.0,3.0,12.977,77.5921,560033.5
75%,26.0,4.0,12.997025,77.6309,560068.0
max,33.0,6.0,13.102,77.7582,560109.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Pin code                    388 non-null    int64  
 10  Output                      388 non-null    object 
 11  Feedback                    388 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 36.5+ KB


In [7]:
data.shape

(388, 12)

<h4>Checking unique values of the attributes</h4>

In [8]:
# Distinct values present in the Gender column.
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [9]:
data['Marital Status'].unique()

array(['Single', 'Married', 'Prefer not to say'], dtype=object)

In [10]:
data['Occupation'].unique()

array(['Student', 'Employee', 'Self Employeed', 'House wife'],
      dtype=object)

In [11]:
data['Educational Qualifications'].unique()

array(['Post Graduate', 'Graduate', 'Ph.D', 'Uneducated', 'School'],
      dtype=object)

In [12]:
data['Monthly Income'].unique()

array(['No Income', 'Below Rs.10000', 'More than 50000', '10001 to 25000',
       '25001 to 50000'], dtype=object)

In [13]:
data['Family size'].unique()

array([4, 3, 6, 2, 5, 1])

In [14]:
data['Output'].unique()

array(['Yes', 'No'], dtype=object)

In [15]:
data['Feedback'].unique()

array(['Positive', 'Negative '], dtype=object)

<h4>Checking null values</h4>

In [16]:
data.isnull().sum()    # The dataset doesn't contain any null values

Age                           0
Gender                        0
Marital Status                0
Occupation                    0
Monthly Income                0
Educational Qualifications    0
Family size                   0
latitude                      0
longitude                     0
Pin code                      0
Output                        0
Feedback                      0
dtype: int64

<h4>Checking duplicates</h4>

In [17]:
# Flag every row that has an identical copy later in the frame
# (keep='last' leaves only the final occurrence unflagged), then show them.
duplicate_mask = data.duplicated(keep='last')
data.loc[duplicate_mask]

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.9850,77.5533,560010,Yes,Positive
5,27,Female,Married,Employee,More than 50000,Post Graduate,2,12.9299,77.6848,560103,Yes,Positive
6,22,Male,Single,Student,No Income,Graduate,3,12.9770,77.5773,560009,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...
333,29,Male,Married,Employee,25001 to 50000,Graduate,4,12.9261,77.6221,560034,No,Negative
334,23,Female,Single,Student,No Income,Graduate,1,12.9770,77.5773,560009,Yes,Positive
335,25,Male,Single,Self Employeed,More than 50000,Graduate,2,12.9783,77.6408,560038,Yes,Positive
337,27,Male,Married,Self Employeed,25001 to 50000,Graduate,6,12.9217,77.5936,560041,No,Positive


<p><b>Note:</b> The given data contains many duplicated rows. They are deliberately kept: removing them would leave less training data after the split and would therefore affect the accuracy of the model.</p>

<h4>Converting the string values of Monthly Income to float values</h4>

In [18]:
# Map each income bracket label to one representative rupee amount.
# Only the 'Monthly Income' column is touched: a frame-wide regex replace would
# also rewrite other text columns (' to ' occurs in the marital status
# 'Prefer not to say'), and the '.' in a pattern such as 'Rs.' matches any
# character rather than a literal dot.
INCOME_BRACKET_TO_RUPEES = {
    'No Income': 0,
    'Below Rs.10000': 10000,     # upper bound of the bracket
    '10001 to 25000': 17500,     # approximate midpoint of the bracket
    '25001 to 50000': 37500,     # approximate midpoint of the bracket
    'More than 50000': 50000,    # lower bound of the bracket
}

# Values that are not bracket labels (e.g. already converted on a re-run) pass
# through unchanged, so the cell can be executed more than once.
data['Monthly Income'] = data['Monthly Income'].map(
    lambda label: INCOME_BRACKET_TO_RUPEES.get(label, label)
)

  data=data.replace('50000',50000,regex=True)


In [27]:
data['Monthly Income'].unique()

array([    0, 10000, 50000, 17500, 37500])

In [28]:
# Store the converted income amounts as floating-point numbers.
data = data.astype({'Monthly Income': float})

<h3>Data Preprocessing</h3>

<h4>Splitting the data into dependent and independent variables</h4>

In [29]:
# Select features and target by column name instead of by position, so the
# selection does not silently change if columns are added, dropped or
# reordered upstream.
FEATURE_COLUMNS = [
    'Age',
    'Gender',
    'Marital Status',
    'Occupation',
    'Monthly Income',
    'Educational Qualifications',
    'Family size',
]
TARGET_COLUMN = 'Feedback'

X = data[FEATURE_COLUMNS].values
# The raw labels include 'Negative ' with a trailing space; strip surrounding
# whitespace so the encoded class names are clean. The 0/1 encoding is unchanged.
y = data[TARGET_COLUMN].str.strip().values

<h4>Encoding the Categorical Data</h4>

In [30]:
# Target: turn the feedback labels into integer class ids.
e1 = LabelEncoder()
y = e1.fit_transform(y)

# Features: one encoder per text column of X, kept under its own name so the
# fitted classes stay available for each column.
# Position 1 = Gender, 2 = Marital Status, 3 = Occupation,
# 5 = Educational Qualifications (position 4, Monthly Income, is already numeric).
e2, e3, e4, e5 = LabelEncoder(), LabelEncoder(), LabelEncoder(), LabelEncoder()
for column_index, encoder in ((1, e2), (2, e3), (3, e4), (5, e5)):
    X[:, column_index] = encoder.fit_transform(X[:, column_index])

<h4>Splitting the dataset into training and testing data</h4>

In [31]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

<h3>Training the model</h3>

In [32]:
# Gaussian Naive Bayes classifier, fitted on the training split only.
clf = GaussianNB()
clf.fit(X=X_train, y=y_train)

In [33]:
y_pred=clf.predict(X_test)

In [34]:
# Side-by-side view: first column = predicted label, second column = true label.
np.column_stack((y_pred, y_test))

array([[1, 1],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [0,

<h3>Evaluating Metrics of the model</h3>

In [35]:
# Rows are the true classes and columns are the predicted classes
# (scikit-learn's convention), ordered by encoded label.
confusion_matrix(y_true=y_test,y_pred=y_pred)

array([[ 3, 11],
       [ 7, 57]])

In [36]:
# Overall fraction of correct test predictions. The test classes are
# imbalanced (the recorded report shows 64 of 78 rows in class 1, about 0.82),
# so compare this figure with the majority-class baseline rather than with 0.5.
accuracy_score(y_true=y_test,y_pred=y_pred)

0.7692307692307693

In [37]:
# Per-class precision, recall and F1. classification_report returns a plain
# string by default, so print() is used to render its line breaks.
print(classification_report(y_true=y_test,y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.30      0.21      0.25        14
           1       0.84      0.89      0.86        64

    accuracy                           0.77        78
   macro avg       0.57      0.55      0.56        78
weighted avg       0.74      0.77      0.75        78

