### Predictive modeling for Classification Project on OrderAgain

## OrderAgain Dataset 

In [1]:
# Data Mining
import pandas as pd
import numpy as np

# Data Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

# Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
raw = pd.read_csv("onlinefoods.csv")

In [3]:
raw.columns

Index(['Age', 'Gender', 'Marital Status', 'Occupation', 'Monthly Income',
       'Educational Qualifications', 'Family size', 'latitude', 'longitude',
       'Pin code', 'Output', 'Feedback'],
      dtype='object')

In [4]:
raw.head(2)

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive


**Basic Checks**

In [5]:
raw.shape

(388, 12)

In [6]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Pin code                    388 non-null    int64  
 10  Output                      388 non-null    object 
 11  Feedback                    388 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 36.5+ KB


### Handling Duplicates

* Convert categorical columns into lower case for duplicates check

### Note

* You need to check before data validation and after validation

In [7]:
raw = raw.drop_duplicates().reset_index(drop=True)

In [8]:
raw[raw.duplicated()]

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback


In [9]:
for i in raw.columns:
    if raw[i].dtype == object:
        raw[i] = raw[i].str.lower()

In [10]:
raw.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback
0,20,female,single,student,no income,post graduate,4,12.9766,77.5993,560001,yes,positive
1,24,female,single,student,below rs.10000,graduate,3,12.977,77.5773,560009,yes,positive
2,22,male,single,student,below rs.10000,post graduate,3,12.9551,77.6593,560017,yes,negative
3,22,female,single,student,no income,graduate,6,12.9473,77.5616,560019,yes,positive
4,22,male,single,student,below rs.10000,post graduate,4,12.985,77.5533,560010,yes,positive


### Checking Missing Values

In [11]:
raw.isnull().sum()

Age                           0
Gender                        0
Marital Status                0
Occupation                    0
Monthly Income                0
Educational Qualifications    0
Family size                   0
latitude                      0
longitude                     0
Pin code                      0
Output                        0
Feedback                      0
dtype: int64

### 1. Data validation & Cleaning

 #### Checking each and every column data
 
   * We are using string methods for this entire analysis
   
   * Here I am modifying all columns data

In [12]:
raw.rename(columns = {'Marital Status':'Marital_Status',
                      'Educational Qualifications':'Educational_Qualifications',
                      'Monthly Income':'Monthly_Income',
                      'Family size':'Family_size','Pin code':'Pin_code'},inplace=True)

In [13]:
raw.head()

Unnamed: 0,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_size,latitude,longitude,Pin_code,Output,Feedback
0,20,female,single,student,no income,post graduate,4,12.9766,77.5993,560001,yes,positive
1,24,female,single,student,below rs.10000,graduate,3,12.977,77.5773,560009,yes,positive
2,22,male,single,student,below rs.10000,post graduate,3,12.9551,77.6593,560017,yes,negative
3,22,female,single,student,no income,graduate,6,12.9473,77.5616,560019,yes,positive
4,22,male,single,student,below rs.10000,post graduate,4,12.985,77.5533,560010,yes,positive


* Data is Valid

* Replacing special characters

    * We will be using replace method in pandas
            * str.replace - Will be used to replace any character in string
            * replace - Will be used to replace entire string
            
    
    

### Python code

**Area**

    * We are adding Area column by using latitude and longitude columns 

In [14]:
# Build a "lat,lon" string per row; this is the query format passed to the reverse geocoder below.
lat_text = raw['latitude'].map(str)
lon_text = raw['longitude'].map(str)
raw['coordinates'] = lat_text.str.cat(lon_text, sep=",")
raw.head()

Unnamed: 0,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_size,latitude,longitude,Pin_code,Output,Feedback,coordinates
0,20,female,single,student,no income,post graduate,4,12.9766,77.5993,560001,yes,positive,"12.9766,77.5993"
1,24,female,single,student,below rs.10000,graduate,3,12.977,77.5773,560009,yes,positive,"12.977,77.5773"
2,22,male,single,student,below rs.10000,post graduate,3,12.9551,77.6593,560017,yes,negative,"12.9551,77.6593"
3,22,female,single,student,no income,graduate,6,12.9473,77.5616,560019,yes,positive,"12.9473,77.5616"
4,22,male,single,student,below rs.10000,post graduate,4,12.985,77.5533,560010,yes,positive,"12.985,77.5533"


In [15]:
# Import the necessary libraries

import geopandas as gpd
import geopy
import geocoder
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [17]:
# Reverse-geocode the "lat,lon" strings with OpenStreetMap's Nominatim service.
geolocator = Nominatim(user_agent="Nancy Amandi", timeout= 10)

# The public Nominatim instance allows at most one request per second; a shorter
# delay gets the client throttled or blocked, so keep min_delay_seconds >= 1.
rgeocode = RateLimiter(geolocator.reverse, min_delay_seconds=1)

# Many rows share the same coordinates, so look each distinct pair up only once
# and then fan the results back out to every row.
location_by_coordinates = {
    coords: rgeocode(coords) for coords in raw["coordinates"].drop_duplicates()
}
raw["location"] = raw["coordinates"].apply(location_by_coordinates.get)

KeyboardInterrupt: 

In [16]:
raw.head(3)

Unnamed: 0,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_size,latitude,longitude,Pin_code,Output,Feedback,coordinates
0,20,female,single,student,no income,post graduate,4,12.9766,77.5993,560001,yes,positive,"12.9766,77.5993"
1,24,female,single,student,below rs.10000,graduate,3,12.977,77.5773,560009,yes,positive,"12.977,77.5773"
2,22,male,single,student,below rs.10000,post graduate,3,12.9551,77.6593,560017,yes,negative,"12.9551,77.6593"


In [17]:
def _address_field(row, field):
    """Return one field of the reverse-geocoded address stored in ``row["location"]``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row) with a ``"location"`` entry.
    field : str, key to read from ``location.raw["address"]``.

    Returns
    -------
    The address value, or ``" "`` when the location is missing (has no ``raw``
    payload, e.g. ``None``) or the address lacks the requested field.
    """
    location = row["location"]
    # A failed lookup leaves None (or NaN) in the column; neither has a `.raw`
    # attribute, which previously surfaced as an uncaught AttributeError.
    payload = getattr(location, "raw", None)
    if payload is None:
        return " "
    try:
        return payload["address"][field]
    except KeyError:
        return " "


def get_suburb(row):
    """Return the suburb of the row's geocoded location, or ``" "`` if unavailable."""
    return _address_field(row, "suburb")


def get_city(row):
    """Return the city of the row's geocoded location, or ``" "`` if unavailable."""
    return _address_field(row, "city")


def get_state(row):
    """Return the state of the row's geocoded location, or ``" "`` if unavailable."""
    return _address_field(row, "state")

In [18]:
# Derive address columns from the reverse-geocoded "location" column.
# NOTE(review): the names are crossed - "city" is filled by get_suburb() and
# "suburb" by get_city(). Later cells drop "suburb" and rename "city" to "Area",
# so the Area column ends up holding the suburb. Un-crossing the names here
# requires updating those later cells at the same time.
raw["city"] = raw.apply(get_suburb, axis=1)
raw["suburb"] = raw.apply(get_city, axis=1)
raw["state"] = raw.apply(get_state, axis=1)
raw.head(3)

KeyError: 'location'

### Deleting columns

In [19]:
Delete = ['latitude','longitude','coordinates','location','suburb','state']

In [20]:
raw = raw.drop(columns = Delete)

KeyError: "['location', 'suburb', 'state'] not found in axis"

In [23]:
raw.head()

Unnamed: 0,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_size,latitude,longitude,Pin_code,Output,Feedback,coordinates
0,20,female,single,student,no income,post graduate,4,12.9766,77.5993,560001,yes,positive,"12.9766,77.5993"
1,24,female,single,student,below rs.10000,graduate,3,12.977,77.5773,560009,yes,positive,"12.977,77.5773"
2,22,male,single,student,below rs.10000,post graduate,3,12.9551,77.6593,560017,yes,negative,"12.9551,77.6593"
3,22,female,single,student,no income,graduate,6,12.9473,77.5616,560019,yes,positive,"12.9473,77.5616"
4,22,male,single,student,below rs.10000,post graduate,4,12.985,77.5533,560010,yes,positive,"12.985,77.5533"


**Age**

In [None]:
raw.Age.unique(),raw.Age.nunique(),raw.Age.dtype

**Gender**

In [None]:
raw.Gender.unique(),raw.Gender.nunique(),raw.Gender.dtype

**Marital_Status**

In [None]:
raw.Marital_Status = raw.Marital_Status.str.replace(" ","_")

In [None]:
raw.Marital_Status.unique(),raw.Marital_Status.nunique(),raw.Marital_Status.dtype

**Occupation**

In [None]:
raw.Occupation = raw.Occupation.str.replace(" ","_")

In [None]:
raw.Occupation.unique(),raw.Occupation.nunique(),raw.Occupation.dtype

#### Monthly_Income

In [None]:
# Normalise the income labels: spaces -> "_" and strip the "rs." currency prefix.
# regex=False makes both patterns literal; otherwise the "." in "_rs." is a regex
# wildcard on pandas versions where str.replace defaults to regex=True.
raw.Monthly_Income = (
    raw.Monthly_Income
    .str.replace(" ", "_", regex=False)
    .str.replace("_rs.", "_", regex=False)
)

In [None]:
raw.Monthly_Income.unique(),raw.Monthly_Income.nunique(),raw.Monthly_Income.dtype

In [None]:
# Relabel the cleaned income ranges as income bands.
# The result is assigned back to the column: replace(..., inplace=True) on the
# `raw.Monthly_Income` accessor acts on an intermediate object and does not
# update `raw` when pandas Copy-on-Write is active.
raw["Monthly_Income"] = raw["Monthly_Income"].replace({"no_income":"no_income",
                                                       "below_10000":"low_income",
                                                       "10001_to_25000":"moderate_income",
                                                       "25001_to_50000":"high_income",
                                                       "more_than_50000":"very_high_income"})

In [None]:
raw.Monthly_Income.value_counts()

* Data is valid
* Replace the special characters and relabel the values in the Monthly_Income column.

**Educational_Qualifications**

In [None]:
# Normalise the labels: spaces and dots become "_" (e.g. "ph.d" -> "ph_d").
# regex=False keeps "." a literal dot instead of relying on pandas'
# version-dependent handling of single-character regex patterns.
raw.Educational_Qualifications = (
    raw.Educational_Qualifications
    .str.replace(" ", "_", regex=False)
    .str.replace(".", "_", regex=False)
)

In [None]:
raw.Educational_Qualifications.unique(),raw.Educational_Qualifications.nunique(),raw.Educational_Qualifications.dtype

**Family_size**

In [None]:
raw.Family_size.unique(),raw.Family_size.nunique(),raw.Family_size.dtype

**Pin_code**	

In [None]:
raw.Pin_code.unique(),raw.Pin_code.nunique(),raw.Pin_code.dtype

**Output** 

In [None]:
raw.Output.unique(),raw.Output.nunique(),raw.Output.dtype

**Feedback**

In [None]:
raw.Feedback = raw.Feedback.str.replace(" ","")

In [None]:
raw.Feedback.unique(),raw.Feedback.nunique(),raw.Feedback.dtype

**city to Area**

In [None]:
raw.rename(columns = {'city':'Area'},inplace = True)                   

In [None]:
raw.Area = raw.Area.str.replace(" ","_")

In [None]:
raw.Area.unique(),raw.Area.nunique(),raw.Area.dtype

### Validated Data

In [None]:
raw.head()

### 2. Data understanding & EDA(Exploratory Data Analysis) 

* We can get insights on dataset using Exploratory Data Analysis (EDA) methods

* EDA can be of two things, 
    - Statistics
        - Descriptive
        - Inferential
    - Visual Analysis

**Understanding Column Data Types in stats point of view**

In [None]:
raw.dtypes

**Numerical and Categorical data**

**Numerical and Categorical data**

Numerical(Quantitative) | Categorical(Qualitative)
--|---------------------
Age | Gender
Family_size|Marital_status
-|Occupation 
-|Monthly_income
-|Educationl_Qualifications
-|Output
-|Feedback 
-|Pin_code
-|Area

**When we have input and ouput variables mentioned**

* Taking important input columns to analysis with reference to the output variable is suggested for analysis.

**For this data we have output variable is Output**

* All the Columns are using for analysis

Analysis Need to be Studied

**Uni-variate** | **Bi_Variate** | **Multi-Variate**
------------|------------|---------------
**One Column Study**|**Anyone column with Output Column study**|**Combination of two or more columns with Output column study**
Descriptive|Descriptive|Descriptive
Visual Analysis|Visual Analysis|Visual Analysis

In [None]:
raw.head(5)

In [None]:
raw.columns, raw.dtypes

In [None]:
raw.Output.describe()

In [None]:
raw.Output.value_counts()

### 3.Balancing Data

 **OverSampling of Y data - Duplicates**

* Minority classes data will be added again and again to balance the count of majority class

In [None]:
raw.head()

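The cells below append every 'no' row once more, plus 81 randomly sampled 'no' rows (40 + 41), so that the 'no' class count matches the 'yes' class count.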

In [None]:
# Every 'no' row plus 40 randomly re-drawn 'no' rows.
# random_state pins the draw so Restart & Run All reproduces the same oversample.
no_rows = raw[raw.Output=='no']
nclasssample = pd.concat([no_rows, no_rows.sample(40, random_state=42)], axis=0).reset_index(drop=True)

In [None]:
nclasssample.head()

In [None]:
# A further 41 randomly drawn 'no' rows (the single-element pd.concat was a no-op).
# Seeded for reproducibility; the seed differs from the other oversampling draw
# so the two samples are not drawn from the same shuffled order.
nclasssample1 = raw[raw.Output=='no'].sample(41, random_state=43).reset_index(drop=True)

In [None]:
nclasssample1.head()

In [None]:
balanced_data = pd.concat([raw, nclasssample, nclasssample1]).reset_index(drop=True)

In [None]:
balanced_data

In [None]:
balanced_data.Output.value_counts()

**Balancing duplicates**

In [None]:
balanced_data[balanced_data.duplicated()]

**Uni-Variate analysis on Categorical**

In [None]:
raw.Gender.value_counts(),raw.Gender.mode()

In [None]:
plt.figure(figsize=(3,2))
raw['Gender'].value_counts().plot(kind='pie')
plt.show()

* In this gender male has high value_counts 164. 

In [None]:
raw.Marital_Status.value_counts(),raw.Marital_Status.mode()

In [None]:
plt.figure(figsize=(3,2))
raw['Marital_Status'].value_counts().plot(kind='bar',color = 'lightblue')
plt.show()

* In marital_status singles have high value_counts with 189.  

In [None]:
raw.Occupation.value_counts(),raw.Occupation.mode()

In [None]:
plt.figure(figsize=(4,3))
raw['Occupation'].value_counts().plot(kind='pie')
plt.show()

* In Occupation Students are very high with 144. 

In [None]:
raw.Monthly_Income.value_counts(),raw.Monthly_Income.mode()

In [None]:
plt.figure(figsize=(4,3))
raw['Monthly_Income'].value_counts().plot(kind='bar', color = 'salmon')
plt.show()

* In monthly_income there are no_income has high value with 131.  

In [None]:
raw.Educational_Qualifications.value_counts(),raw.Educational_Qualifications.mode()

In [None]:
plt.figure(figsize=(5,4))
raw['Educational_Qualifications'].value_counts().plot(kind='pie')
plt.show()

* In Educational_Qualifications there are graduate are high with 126   

In [None]:
balanced_data.Output.value_counts(),balanced_data.Output.mode()

In [None]:
plt.figure(figsize=(2,3))
balanced_data['Output'].value_counts().plot(kind='bar',color = 'lightblue')
plt.show()

* In Output column we have balanced.

In [None]:
raw.Feedback.value_counts(),raw.Feedback.mode()

In [None]:
plt.figure(figsize=(2,3))
raw['Feedback'].value_counts().plot(kind='pie')
plt.show()

* In Feedback positive has high values with 231. 

In [None]:
raw.Area.value_counts(),raw.Area.mode()

In [None]:
plt.figure(figsize=(10,4))
raw['Area'].value_counts().plot(kind='bar',color = 'salmon')
plt.show()

* In Area column Gandhinagar has high value with 27.   

In [None]:
# Each Column characteristics

from termcolor import colored
from simple_colors import *

# Print basic characteristics for every column: class counts and mode for
# categorical (object) columns, describe() and mode for numerical columns.
for i in raw.columns:
    if raw[i].dtype == 'object':
        print()
        print(green("Categorical Column Characteristics:",['bold']))
        print(green("Column Name:"),i)
        print(raw[i].unique(), ":", raw[i].nunique())
        print(raw[i].value_counts())
        print("Mode:",raw[i].mode())
    # The previous test `dtype == 'int32' or 'int64' or 'float64'` was always
    # truthy (a non-empty string is True), so it never actually checked the dtype.
    elif pd.api.types.is_numeric_dtype(raw[i]):
        print()
        print(blue("Numerical Column Characteristics:",['bold']))
        print(blue("Column Name:"),i)
        print(raw[i].describe())
        print(raw[i].mode())

In [None]:
from simple_colors import *

# Uni-variate report for every column.
#  - object and integer columns are treated as categorical / discrete: classes,
#    counts, class percentages, mode, and a pie chart when there are <= 10 classes
#  - every other column is treated as continuous: describe(), skewness,
#    kurtosis and a distribution plot
for i in raw.columns:
    if raw[i].dtype == 'object' or raw[i].dtype == 'int32' or raw[i].dtype == 'int64':
        class_counts = raw[i].value_counts()  # reused for counts, percentages and the pie
        print()
        print(cyan("Categorical/Numeric Discrete Column:", ['bold']), i)
        print("=====================================================")
        print(green("Uni-Variate Descriptive Stats:", ['bold']))
        print(black("Classes:", ['bold']), raw[i].unique())
        print(black("Number of Classes:", ['bold']), raw[i].nunique())
        print(black("Value Counts of each class:", ['bold']),class_counts)
        print(black('Class Percent:', ['bold']))
        print((class_counts / class_counts.sum()) * 100)
        print("---------------------------------------------------")
        print(black("Mode Value:", ['bold']), raw[i].mode()[0])
        if raw[i].nunique() <= 10:
            print()
            print(magenta("Visual Analysis:", ['bold']))
            print("-----------------------------------------------------")
            plt.figure(figsize=(8, 3))
            class_counts.plot(kind='pie')
            plt.show()

    elif raw[i].dtype != 'object':
        print()
        print(blue("Numerical Column:", ['bold']), i)
        print("=====================================================")
        print(green("Uni-Variate Descriptive Stats:", ['bold']))
        print(round(raw[i].describe()))
        print("-----------------------------------------------------")
        print(black("Skewness & Kurtosis:", ['bold']), raw[i].skew(), ",",
              raw[i].kurt())
        print()
        print(cyan("Visual Analysis:", ['bold']))
        print("-----------------------------------------------------")
        plt.figure(figsize=(8, 6))
        # sns.distplot is deprecated; a density-scaled histogram with a KDE
        # overlay is its documented replacement.
        sns.histplot(raw[i], kde=True, stat='density')
        plt.show()

**Uni-Variate analysis**

**On Numerical columns**

In [None]:
raw['Age'].describe(),raw['Age'].skew(),raw['Age'].kurtosis()

In [None]:
# Box plot of Age (the title previously said "price", which is not a column of this dataset).
raw.Age.plot(kind='box',title='Boxplot of Age',ylabel='Age',figsize=(6,3))

In [None]:
raw['Family_size'].describe(),raw['Family_size'].skew(),raw['Family_size'].kurtosis()

In [None]:
plt.figure(figsize=(6, 3))
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is
# its documented replacement.
sns.histplot(raw['Family_size'], kde=True, stat='density')
raw['Family_size'].plot(kind = 'density')
plt.show()

In [None]:
plt.figure(figsize=(6, 3))
raw.Family_size.plot(kind='density')

**Bi-Variate analysis on N-N**

**Data Study between two columns**

**Numerical to Numerical**

In [None]:
raw.head()

In [None]:
# Correlate numeric columns only: DataFrame.corr() raises on the string
# (object) columns still present in `raw` under pandas 2.x.
raw.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the numeric-column correlations (object columns excluded, since
# DataFrame.corr() raises on them under pandas 2.x).
sns.heatmap(raw.select_dtypes(include='number').corr(),annot = True)

In [None]:
sns.pairplot(raw.iloc[0:500])

**Numerical to Categorical**

In [None]:
raw.head()

In [None]:
# Mean Age per income band, highest first. The former "/ len(raw) * 100" divided
# a mean by the row count, which is not a percentage of anything; the bar plot
# in the next cell also uses the plain group mean.
raw.groupby("Monthly_Income")['Age'].mean().sort_values(ascending  = False)

In [None]:
# Top five income bands by mean Age, drawn as horizontal bars.
top_income_age = raw.groupby('Monthly_Income')['Age'].mean().sort_values(ascending=False).head(5)
sns.barplot(y = top_income_age.index, x = top_income_age.values, orient ='h')

In [None]:
# Mean Family_size per occupation, highest first. The former "/ len(raw) * 100"
# divided a mean by the row count, which is not a percentage of anything.
raw.groupby("Occupation")['Family_size'].mean().sort_values(ascending  = False)

In [None]:
sns.catplot(y='Occupation', x ='Family_size', data = raw, orient='h')

In [None]:
# Mean Age per Area, highest first. The former "/ len(raw) * 100" divided a mean
# by the row count, which is not a percentage of anything; the bar plot in the
# next cell also uses the plain group mean.
raw.groupby('Area')['Age'].mean().sort_values(ascending  = False)

In [None]:
# Top 26 areas by mean Age, drawn as horizontal bars.
top_area_age = raw.groupby('Area')['Age'].mean().sort_values(ascending=False).head(26)
sns.barplot(y = top_area_age.index, x = top_area_age.values, orient ='h')

**Categorical to Categorical**

In [None]:
raw.head(2)

In [None]:
pd.crosstab(raw.Occupation,raw.Output,margins=True)

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(y='Output', hue='Occupation', data = raw)

In [None]:
pd.crosstab(raw.Output,raw.Monthly_Income,margins=True)

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(y='Output', hue='Monthly_Income', data = raw)

In [None]:
pd.crosstab(raw.Output,raw.Educational_Qualifications, margins=True)

In [None]:
plt.figure(figsize=(6,4))
sns.countplot(x='Output', hue='Educational_Qualifications', data = raw)

In [None]:
raw.head(5)

* We balanced the Output column with oversampling, adding 149 values to the actual data.
* Male has the highest value count in the Gender column
* Single has the highest value count in the Marital_Status column
* In Occupation, students are the largest group with 144
* Most people have no_income in terms of salary
* Graduates are the largest group, whereas only 2 people are uneducated
* The Output column is balanced with the same count
* In Family_size the density is between 2 and 4, peaking at 3
* Positive has the highest value count in the Feedback column with 90%
* Gandhinagar has the most orders with 27 and Nagarabhavi the fewest with 1
* The Age column has a minimum of 18 and a maximum of 33
* Family_size has a minimum of 1 and a maximum of 6
* There are outliers in the Age column
* In Monthly_Income the value is high for very_high_income and low for no_income
* Occupation vs Family_size: between these two, people with no_income are high
* hero_hally has the highest value for Age by Area
* Student has the highest count in Output
* In Monthly_Income, no_income has the highest count in the Output column
* post_graduate has the highest count in … (TODO: complete this observation)








###  Handling Missing Values & Outliers 

No missing values present in our data

In [None]:
raw.isnull().sum()

### Outlier detection

In [None]:
def outlier_detect(df):
    """Print the IQR-rule outliers of every column summarised by ``df.describe()``.

    For each such column the lower fence is ``Q1 - 1.5 * IQR`` and the upper
    fence is ``Q3 + 1.5 * IQR``; values outside the fences are printed in
    ascending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None. The report is written to stdout.
    """
    # describe() is loop-invariant: compute it once instead of three times per column.
    summary = df.describe()
    for i in summary.columns:
        print("Column:",i)
        print("------------------------------------------------")
        Q1 = summary.at['25%',i]
        Q3 = summary.at['75%',i]
        IQR = Q3 - Q1
        LTV = Q1 - 1.5 * IQR  # lower fence
        UTV = Q3 + 1.5 * IQR  # upper fence
        print("Lower Outliers:")
        print()
        print(sorted(df.loc[df[i] < LTV, i]))
        print()
        print("Upper Outliers:")
        print()
        print(sorted(df.loc[df[i] > UTV, i]))
        print()

In [None]:
outlier_detect(raw)

In [None]:
def outlier_replacement(df):
    """Cap IQR-rule outliers in place for every column summarised by ``df.describe()``.

    Values below ``Q1 - 1.5 * IQR`` are replaced by that lower fence and values
    above ``Q3 + 1.5 * IQR`` by that upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Its columns are overwritten in place.

    Returns
    -------
    None.
    """
    # describe() is loop-invariant: each column's quartiles depend only on that
    # column, so it is safe to compute the summary once up front.
    summary = df.describe()
    for i in summary.columns:
        Q1 = summary.at['25%',i]
        Q3 = summary.at['75%',i]
        IQR = Q3 - Q1
        LTV = Q1 - 1.5 * IQR  # lower fence
        UTV = Q3 + 1.5 * IQR  # upper fence

        # mask() replaces the values where the condition holds
        df[i] = df[i].mask(df[i]<LTV, LTV) # cap low outliers at the lower fence
        df[i] = df[i].mask(df[i]>UTV, UTV) # cap high outliers at the upper fence

In [None]:
outlier_replacement(raw)

In [None]:
sns.boxplot(raw.Age)

### Predictive Modeling
    
* Above data will be given to a machine learning model, where the model will be trained on column data output with other columns data.     
    
* predictive modeling is sending data to a algorithm as input columns(x) along with one output column data (y), training y data with x
    
    model: y~x -> y = f(x)+e

### X & y
* This is the method where to identify according to business goal,output as y and x as input column 

In [None]:
x = raw.drop('Output', axis = 1) # input column
y = raw['Output']                # output column

In [None]:
x.head(2)

In [None]:
y.head(2)

### Train-Test Split

* Dividing Data (x,y) into train and test (Data Validation)

* for this we will use sklearn module
* we can go with 70,30 or 80,20 or 75,25 ratios

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 75/25 split with a fixed seed. stratify=y keeps the yes/no proportion the same
# in both parts, which matters because the Output classes in `raw` are imbalanced.
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.25,random_state=42,stratify=y)

In [None]:
# Index Reset

xtrain = xtrain.reset_index(drop=True)
ytrain = ytrain.reset_index(drop=True)
xtest = xtest.reset_index(drop=True)
ytest = ytest.reset_index(drop=True)

In [None]:
xtrain.shape, xtest.shape

In [None]:
ytrain.shape, ytest.shape

In [None]:
xtrain.head(2)

In [None]:
xtest.head(2)

xtrain, ytrain data is used for the training predictive model

xtest is used for the predictions, ytest is used for the comparisons

###  Data Pre-Processing

**Categorical to Numerical (Encoding)**

Machine needs data in numeric format, so we need to convert categorical to numerical, while observing the number of classes , because it will increase the dimensionality if we are converting them to one hot encoding.

* Ordinal Encoding for ordinal
    - lets assume cat column data : platinum, gold, silver
    - ordinal - platinum  >  gold  >  silver
                   3      >   2    >    1
                   
* One hot encoding for nominal
    - lets assume cat column data: a, b, c

a|b|c
--|--|--
1|0|0
1|0|0
0|1|0
0|0|1

**Label encoding**
* Here we are giving manually without using label encoder function.

In [None]:
xtest.head()

**Gender**
* Gender is replaced with the order of

            female   >    male
               0     >     1 

In [None]:
xtrain.Gender.unique()

In [None]:
xtest.Gender.unique()

In [None]:
# Encode Gender with the same mapping on train and test.
# The result is assigned back to the column: replace(..., inplace=True) on the
# `xtrain.Gender` accessor acts on an intermediate object and does not update
# the frame when pandas Copy-on-Write is active. map() also guarantees a numeric
# dtype; any label missing from the mapping becomes NaN instead of staying a string.
gender_codes = {'female':0,'male':1}
xtrain['Gender'] = xtrain['Gender'].map(gender_codes)
xtest['Gender'] = xtest['Gender'].map(gender_codes)

**Marital_status**
* Marital_status is replaced with the order of

        single   >   married  >    prefer_not_to_say  
         1       >      2      >        3   

In [None]:
display(xtrain.Marital_Status.unique())
xtest.Marital_Status.unique()

In [None]:
# Encode Marital_Status with the same mapping on train and test.
# Assigned back to the column because replace(..., inplace=True) on the column
# accessor does not update the frame when pandas Copy-on-Write is active.
# map() guarantees a numeric dtype; unmapped labels become NaN.
marital_status_codes = {'single':1,'married':2,'prefer_not_to_say':3}
xtrain['Marital_Status'] = xtrain['Marital_Status'].map(marital_status_codes)
xtest['Marital_Status'] = xtest['Marital_Status'].map(marital_status_codes)

**Occupation**
* Occupation is replaced with the order of

        student > employee > self_employeed > house_wife 
         1      >     2    >      3         >     4  

In [None]:
display(xtrain.Occupation.unique())
xtest.Occupation.unique()

In [None]:
# Encode Occupation with the same mapping on train and test.
# Assigned back to the column because replace(..., inplace=True) on the column
# accessor does not update the frame when pandas Copy-on-Write is active.
# map() guarantees a numeric dtype; unmapped labels become NaN.
# ('self_employeed' is spelled as in the mapping originally used in this notebook.)
occupation_codes = {'student':1,'employee':2,'self_employeed':3,'house_wife':4}
xtrain['Occupation'] = xtrain['Occupation'].map(occupation_codes)
xtest['Occupation'] = xtest['Occupation'].map(occupation_codes)

**Monthly_income**
* Monthly_Income is replaced with the order of

        no_income > low_income > moderate_income > high_income > very_high_income  
           1      >     2      >      3          >     4       >        5 

In [None]:
display(xtrain.Monthly_Income.unique())
xtest.Monthly_Income.unique()

In [None]:
# Ordinal-encode Monthly_Income (no_income lowest, very_high_income highest),
# using the same mapping on train and test.
# Assigned back to the column because replace(..., inplace=True) on the column
# accessor does not update the frame when pandas Copy-on-Write is active.
# map() guarantees a numeric dtype; unmapped labels become NaN.
monthly_income_codes = {'no_income':1,'low_income':2,'moderate_income':3,'high_income':4,'very_high_income':5}
xtrain['Monthly_Income'] = xtrain['Monthly_Income'].map(monthly_income_codes)
xtest['Monthly_Income'] = xtest['Monthly_Income'].map(monthly_income_codes)

In [None]:
xtrain.head()

**Educational_Qualifications**
* Educational_Qualifications is replaced with the order of

        Uneducated > School > Graduate > Post-graduate > Ph.D  
           1      >    2    >    3     >     4         >   5 

In [None]:
display(xtrain.Educational_Qualifications.unique())
xtest.Educational_Qualifications.unique()

In [None]:
# Ordinal-encode Educational_Qualifications (uneducated lowest, ph_d highest),
# using the same mapping on train and test.
# Assigned back to the column because replace(..., inplace=True) on the column
# accessor does not update the frame when pandas Copy-on-Write is active.
# map() guarantees a numeric dtype; unmapped labels become NaN.
education_codes = {'uneducated':1,'school':2,'graduate':3,'post_graduate':4,'ph_d':5}
xtrain['Educational_Qualifications'] = xtrain['Educational_Qualifications'].map(education_codes)
xtest['Educational_Qualifications'] = xtest['Educational_Qualifications'].map(education_codes)

**Feedback**

In [None]:
# Encode Feedback only. The previous frame-wide replace would also have rewritten
# 'positive' / 'negative' wherever they appeared in any other column.
# map() guarantees a numeric dtype; unmapped labels become NaN.
feedback_codes = {'positive':1,'negative':0}
xtrain['Feedback'] = xtrain['Feedback'].map(feedback_codes)
xtest['Feedback'] = xtest['Feedback'].map(feedback_codes)

In [None]:
display(xtrain.shape)
xtest.shape

In [None]:
display(xtrain.head())
xtest.head()

In [None]:
display(xtrain.reset_index(drop=True))
xtest.reset_index(drop=True)

In [None]:
xtrain.dtypes

**One-Hot encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(handle_unknown='ignore',drop = 'first')

# drop is used remove one onehotencoded column for dummy_variable trap

In [None]:
xtrain.head()

**Train**

In [None]:
ohedata = ohe.fit_transform(xtrain.select_dtypes('object')).toarray()

In [None]:
ohedata.shape

In [None]:
# Converting the one hot data to a data frame with col names

ohedata = pd.DataFrame(ohedata, columns = ohe.get_feature_names_out())

In [None]:
ohedata.head()

In [None]:
xtrain.drop(['Area'], axis = 1).head(2)

In [None]:
# Adding to xtrain data

xtrain = pd.concat([xtrain.drop(['Area'], axis = 1).reset_index(drop=True), ohedata], axis = 1)

xtrain.shape

**xtest**

In [None]:
ohedata_test = ohe.transform(xtest.select_dtypes('object')).toarray()

In [None]:
# Converting the one hot data to a data frame with col names

ohedata_test = pd.DataFrame(ohedata_test, columns = ohe.get_feature_names_out())

In [None]:
# Adding to xtest data
xtest = pd.concat([xtest.drop(['Area'], axis = 1).reset_index(drop=True), ohedata_test], axis = 1)

xtest.shape

In [None]:
xtrain.head(2)

In [None]:
xtest.head(2)

In [None]:
xtrain.describe()

## Scaling
We need to apply scaling only to the numerical continuous input columns that are on different scales, not to the one-hot encoded vectors.

Whenever we feed data to linear models, we should apply scaling.

* We have two scales
    - Standard Scale: mean 0 and standard deviation 1 (most values fall between -3 and +3)
    - Min Max Scale: 0 to 1 by default

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
xtrain.iloc[:,[7]]

In [None]:
# Fit on train

# Select the columns to scale by name. Positional column 7 depended on which
# columns earlier cells happened to drop, and it is not one of the columns this
# notebook lists as numerical (Age, Family_size).
num_cols = ['Age', 'Family_size']

# Fit on train

xtrain[num_cols] = sc.fit_transform(xtrain[num_cols])

# Transform on test (reuses the mean / std learned from train)

xtest[num_cols] = sc.transform(xtest[num_cols])

In [None]:
display(xtrain.head())
xtest.head()

### 8. Modeling<a id='model'>

[Back to Top](#menu)

* y data is a categorical binary data , we will be using ML Supervised classification algorithms

    - Logistic Regression
    - Knearest Neighbors (KNN)
    - Support Vector Machine (SVM)
    - Naive Bayes (NB)
    - Decision Trees (CART)
    - Random Forest (Bagging)
    - Xgboost (Boosting)

**Importing Libraries and Define Models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
# Model Define

log = LogisticRegression()

knn = KNeighborsClassifier(n_neighbors=5, p=2)

# n_neighbors (k) is a hyperparameter
# p is another hyperparameter: the power of the Minkowski distance, 2 = euclidean distance

# random_state=42 (the seed already used for the train/test split) makes the
# tree and the forest reproducible across Restart & Run All.
dt = DecisionTreeClassifier(criterion="entropy", max_depth = 3, random_state = 42)

# criterion is the measure used to choose each split (here entropy)
# max_depth is the maximum depth of the tree - main hyperparameter

rf = RandomForestClassifier(n_estimators = 2, max_depth = 2, criterion = 'entropy', bootstrap = True,
                            random_state = 42)

# n_estimators is the number of decision trees in the forest - hyperparameter


sv = SVC(kernel = 'rbf', gamma=5) # for non-linearly separable data

# gamma is the coefficient of the rbf kernel - hyperparameter

# kernel="linear" gives a linear SVM

#sv = SVC(kernel="linear") # for linearly separable data

nb = GaussianNB()

xgb = XGBClassifier(n_estimators=10,reg_alpha=0.5)

**Model Training**

* Using xtrain, ytrain data
* Using fit command to train the defined model with xtrain, ytrain

#### Logistic Regression

It uses the Linear Regression line to convert it into a sigmoid curve with the logit function output as probability of class

    prob = 1 / (1 + e^(-y))
    
    if prob>0.5 1 
    else 0

Learning/Training Model on train data

we can use **fit** function in model for xtrain and ytrain data to train our data for getting the line co-efficients

In [None]:
log.fit(xtrain, ytrain)

Parameters

In [None]:
log.intercept_

In [None]:
log.coef_

#### KNN

Knearest Neighbors

It will take the nearest data points using euclidean distance metric with number of k given

It is a lazy algorithm , it wont train the data instead it will store the data

It will do the training when test data given

In [None]:
knn.fit(xtrain, ytrain)

In [None]:
knn.get_params()

Decision Regions Plotting

In [None]:
xtrain.head(2)

In [None]:
x_d = xtrain[['Gender','Feedback']]
y_d = ytrain

In [None]:
y_d

In [None]:
y_d = np.where(y_d =='yes',1,0)

In [None]:
knn1 = KNeighborsClassifier(n_neighbors = 5, p=2)
knn1.fit(x_d,y_d)

In [None]:
from mlxtend.plotting import plot_decision_regions

In [None]:
plot_decision_regions(np.array(x_d),np.array(y_d),clf=knn1)
plt.title("Training KNN Decision Region")
plt.xlabel("Gender")
plt.ylabel("Feedback")
plt.show()

We can tune k value using gridsearch cv to get best params

#### Decision Tree

Logic Tree based predictions based on root and interior nodes, branches

In [None]:
dt.fit(xtrain, ytrain)

**Feature Importance**

In [None]:
pd.DataFrame(index = dt.feature_names_in_,data = dt.feature_importances_, columns = ['FeatureImportance'])

**Tree**

In [None]:
from sklearn.tree import plot_tree

In [None]:
# Render the fitted decision tree; nodes are colour-filled by majority class.
fig, ax = plt.subplots(figsize=(7, 6), dpi=150)
plot_tree(dt, filled=True, feature_names=xtrain.columns.tolist(), ax=ax)
plt.show()

#### Random Forest

Bagging algorithm that combines the predictions of multiple decision trees, each trained on a bootstrap sample of the data

In [None]:
# Fit the random forest on the training split.
rf.fit(xtrain, ytrain)

In [None]:
# One row per input feature with the importance the fitted forest assigned to it.
pd.Series(
    rf.feature_importances_,
    index=rf.feature_names_in_,
    name='FeatureImportance',
).to_frame()

Trees

In [None]:
# List the individual fitted trees that make up the forest.
rf.estimators_

In [None]:
# Render one member of the forest (the tree at index 1) as an example.
fig, ax = plt.subplots(figsize=(15, 10), dpi=150)
plot_tree(rf.estimators_[1], filled=True, feature_names=xtrain.columns.tolist(), ax=ax)
plt.show()

#### SVM (Time Taking for Higher Dimensional Data)

Support vectors are the data points closest to the decision boundary; they define the maximal margin classifier (for linearly separable data)

For non-linear data the kernel trick (e.g. rbf, poly) is used to separate the classes

#### Naive Bayes

Naive Bayes works on Bayesian Probability formula

In [None]:
# Fit the Gaussian Naive Bayes model on the training split.
nb.fit(xtrain, ytrain)

#### Xgboost

Boosting algorithm that trains the selected number of models sequentially, each new model learning to correct the errors of the models before it

we need to install xgboost, using anaconda prompt - pip install xgboost

In [None]:
# Fresh, unfitted XGBoost classifier: 10 boosting rounds, L1 regularisation (reg_alpha) of 0.5.
# NOTE(review): this repeats the identical `xgb = XGBClassifier(...)` definition made with the other models.
xgb = XGBClassifier(n_estimators=10,reg_alpha=0.5)

In [None]:
# XGBoost is trained on numeric class labels: 1 for 'yes', 0 for everything else.
ytrain_xg = (np.asarray(ytrain) == 'yes').astype(int)

In [None]:
# Fit XGBoost on the training features with the 0/1-encoded labels.
xgb.fit(xtrain,ytrain_xg)

### 9. Model Performance<a id='eval'></a>

[Back to Top](#menu)

* Checking Trained Model Performances on Test Data

* Using the xtest data we get predictions, which are then compared with ytest

* To check Model Performance we can use evaluation methods

    * Error/Loss
    * Model Score 
    * Bias-Variance Trade off (Underfit or Overfit)
    * Cross-Val Score

For classification we can use these evaluation


Performance Metric | Classification
-------|-----------
**Loss or Error**|**Confusion Matrix (Number of right/wrong predictions)**
**Model Score (Evaluation)** | **Accuracy Score (Balanced Data) , F1-Score/Auc-Roc Score (For Imbalanced Data)**
**Bias-Variance Trade Off**|Higher error & Lower score (underfit)
-|Low Train error & High Test error (Overfit)
**Cross-Val Score**|Checking trained model performance on entire X and y data

**As we have imbalanced data considering auc-roc score for better understanding model**

In [None]:
# Modules for Metrics

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, roc_auc_score, auc
from sklearn.model_selection import cross_val_score

**Checking the above models' performance using test data**

In [None]:
# Display name paired with its fitted estimator; `names` and `models` stay
# index-aligned because both are derived from this one list.
# KNN predictions are slower than the other models on larger data.
model_registry = [
    ('LogisticRegression', log),
    ('KNearestNeighbors', knn),
    ('Naive Bayes', nb),
    ('Decision Tree', dt),
    ('Random Forest', rf),
    ('Xgboost', xgb),
]

names = [label for label, _ in model_registry]

models = [estimator for _, estimator in model_registry]

**Confusion_matrix , Classification_report & auc-roc curve score**

In [None]:
# Evaluate every fitted model on the held-out test split: classification
# report, confusion matrix and ROC curve with its AUC.
#
# The global `ytest` is never reassigned here. Two read-only encodings of the
# labels are prepared once instead of converting `ytest` back and forth for
# every model.
ytest_labels = np.asarray(ytest)                       # labels as stored: 'yes' / 'no'
ytest_binary = np.where(ytest_labels == 'yes', 1, 0)   # same labels encoded as 1 / 0

for name, model in zip(names, models):
    print(green("Model: {}\n".format(name),['bold']))

    # xgb was fitted on 0/1 labels, every other model on the 'yes'/'no'
    # strings, so each model is scored against labels in its own encoding.
    is_xgb = model is xgb
    y_true = ytest_binary if is_xgb else ytest_labels
    ytest_pred = model.predict(xtest)

    print("Classification Report:\n",classification_report(y_true, ytest_pred))

    print(blue("Confusion_Matrix:",['bold']))
    ConfusionMatrixDisplay.from_estimator(model, xtest, y_true)
    plt.show()

    print(red("AUC-ROC:",['bold']))
    # roc_curve needs numeric inputs; xgb predictions are already 0/1.
    # NOTE(review): the curve is built from hard class predictions, so it has a
    # single operating point; passing predict_proba scores would give the full
    # curve. Left as is to keep the reported numbers unchanged.
    pred_binary = ytest_pred if is_xgb else np.where(ytest_pred == 'yes', 1, 0)
    fpr, tpr, threshold = roc_curve(ytest_binary, pred_binary)
    auc_score = auc(fpr, tpr)
    auc_str = "AUC Score: "+ str(auc_score)
    plt.plot(fpr,tpr,label=auc_str)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()
    plt.show()
    print("-----------------------------------------------------------------------------------")

**Checking AUC-ROC scores for both train and test data**

In [None]:
# Empty, independent result containers that the scoring loop fills in.
Model, Trainedmodel, trainscore, testscore, fit = ([] for _ in range(5))

In [None]:
# Train/test AUC for every model plus a coarse fit diagnosis.
#
# The result lists are rebuilt from scratch so that re-running this cell does
# not append duplicate rows (which would break the summary DataFrame).
trainscore, testscore, fit = [], [], []

# 0/1 encodings of the true labels, computed once; the global `ytrain` and
# `ytest` are left untouched.
ytrain_binary = np.where(np.asarray(ytrain) == 'yes', 1, 0)
ytest_binary = np.where(np.asarray(ytest) == 'yes', 1, 0)

for model in models:

    ytrain_pred = model.predict(xtrain)
    ytest_pred = model.predict(xtest)

    # xgb already predicts 0/1; every other model predicts 'yes'/'no'.
    if model is not xgb:
        ytrain_pred = np.where(ytrain_pred == 'yes', 1, 0)
        ytest_pred = np.where(ytest_pred == 'yes', 1, 0)

    fpr_tr, tpr_tr, threshold_tr = roc_curve(ytrain_binary, ytrain_pred)
    auc_score_tr = auc(fpr_tr, tpr_tr)

    fpr_te, tpr_te, threshold_te = roc_curve(ytest_binary, ytest_pred)
    auc_score_te = auc(fpr_te, tpr_te)

    trainscore.append(auc_score_tr)
    testscore.append(auc_score_te)

    # Rule-of-thumb labels: both scores below 0.50 -> underfit; strong on
    # train (> 0.70) but weak on test (< 0.60) -> overfit; otherwise good fit.
    if auc_score_tr < 0.50 and auc_score_te < 0.50:
        fit.append("Underfit")
    elif auc_score_tr > 0.70 and auc_score_te < 0.60:
        fit.append("Overfit")
    else:
        fit.append("Goodfit")

In [None]:
# Side-by-side summary: one row per model with its train/test score and fit label.
pd.DataFrame(
    dict(
        Model=names,
        Trainedmodel=models,
        Trainscore=trainscore,
        Testscore=testscore,
        Fit=fit,
    )
)

#### Better Performance Model:

From the Observation of above results

**XGBoost gave a better AUC-ROC score on both train and test data than the other models**

### Real time predictions

In [None]:
# Look at a few raw rows as a reference for the input values used below.
raw.head(5)

In [None]:
def predict_orderAgain(data, *more_values):
    """Predict whether one customer will order again and print the result.

    Parameters
    ----------
    data : sequence
        The 11 field values of a single customer, in this order: Age, Gender,
        Marital_Status, Occupation, Monthly_Income,
        Educational_Qualifications, Family_size, Pin_code, Output, Feedback,
        Area. Categorical values use the lower-case, underscore-separated
        spellings listed in the encoding tables below.
    *more_values
        Convenience form: the same 11 values may be passed as separate
        positional arguments, in which case ``data`` is the first of them.

    Returns
    -------
    None
        The input row and the prediction ('yes' / 'no') are printed.

    Notes
    -----
    Uses the notebook-level fitted objects ``ohe`` (one-hot encoder),
    ``sc`` (scaler) and ``xgb`` (classifier).
    """

    from IPython.display import display

    # Accept both predict_orderAgain([v1, ..., v11]) and
    # predict_orderAgain(v1, ..., v11).
    if more_values:
        data = [data, *more_values]

    rdata = pd.DataFrame([data], columns=['Age', 'Gender', 'Marital_Status', 'Occupation', 'Monthly_Income',
                                          'Educational_Qualifications', 'Family_size', 'Pin_code',
                                          'Output', 'Feedback', 'Area'])

    print("Order Again Details:")
    display(rdata.head())
    print()

    # Label encodings, applied one table at a time in this order.
    encodings = [
        {'female': 0, 'male': 1},
        {'single': 1, 'married': 2, 'prefer_not_to_say': 3},
        {'student': 1, 'employee': 2, 'self_employeed': 3, 'house_wife': 4},
        {'no_income': 1, 'low_income': 2, 'moderate_income': 3, 'high_income': 4, 'very_high_income': 5},
        {'uneducated': 1, 'school': 2, 'graduate': 3, 'post_graduate': 4, 'ph_d': 5},
        {'positive': 1, 'negative': 0},
    ]

    data = rdata
    for mapping in encodings:
        data = data.replace(mapping)

    # One-hot encode whatever is still text after the label encoding.
    # NOTE(review): 'Output' is not covered by the tables above, so it stays in
    # the row (and in the object columns) — confirm this matches the columns
    # `ohe` and `xgb` were fitted on.
    ohedata_test = ohe.transform(data.select_dtypes('object')).toarray()

    # Converting the one hot data to a data frame with col names
    ohedata_test = pd.DataFrame(ohedata_test, columns=ohe.get_feature_names_out())

    data = pd.concat([data.drop(['Area'], axis=1).reset_index(drop=True), ohedata_test], axis=1)

    # Column 7 is 'Pin_code' once 'Area' has been dropped; scale it with the fitted scaler.
    data.iloc[:, [7]] = sc.transform(data.iloc[:, [7]])

    result = xgb.predict(data)[0]

    # The classifier predicts 0/1; map it back to the readable label.
    result = {0: 'no', 1: 'yes'}[result]

    print("OrderAgain (Prediction): ", result)
    print("============================================================")
    print()


In [None]:
# predict_orderAgain expects the whole customer row as a single sequence argument.
predict_orderAgain([20.0, 'female', 'single', 'student', 'no_income', 'post_graduate',
                    4, 560001, 'yes', 'positive', 'Shanthala_Nagar'])