## Tip Prediction Using Polynomial Regression

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [11]:
## Load the "tips" example dataset through seaborn's dataset loader

df = sns.load_dataset(name='tips')

## Quick EDA

In [12]:
## Preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
## Inspect the column dtypes to tell numeric features apart from categorical ones
df.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [14]:
## Summary statistics of the numeric columns (count, mean, spread, quartiles)
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [15]:
## Count the missing values in each column
df.isna().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

The dataset does not contain any missing values, so we can proceed directly to feature engineering.

## Feature Engineering with Polynomial Features

In [16]:
## Work on an independent (deep) copy so the raw frame `df` stays untouched
dfPoly = df.copy(deep=True)

In [17]:
## Re-check the dtypes on the working copy before selecting features

dfPoly.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
dtype: object

In [18]:
## Drop all categorical features, keeping only the numeric columns.
## The result is derived from the untouched `df` (not from `dfPoly` itself) so
## the cell is idempotent: re-assigning `dfPoly` from its own drop would raise a
## KeyError on a second run, because the columns would already be gone.
categorical_cols = ['sex', 'smoker', 'day', 'time']
dfPoly = df.drop(columns=categorical_cols)

In [19]:
## Separate the target (tip) from the predictors (every remaining column)

y = dfPoly.loc[:, 'tip']
X = dfPoly.drop(columns=['tip'])

In [20]:
## Use 85% of the rows for training and the rest for testing;
## the fixed seed keeps the split reproducible between runs

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    random_state=666,
)

In [21]:
## Degree-3 polynomial expansion: powers and cross terms, without a bias column
Poly = PolynomialFeatures(
    degree=3,
    interaction_only=False,
    include_bias=False,
)

In [22]:
## Fit the transformer on the training split only
Poly = Poly.fit(X_train)

In [23]:
## Expand both splits with the transformer that was fitted on the training data

X_train_Poly, X_test_Poly = (
    Poly.transform(split) for split in (X_train, X_test)
)

In [26]:
## This is what the training features look like after the polynomial transformation

dfXtrainPoly = pd.DataFrame(data=X_train_Poly)

dfXtrainPoly.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,22.12,2.0,489.2944,44.24,4.0,10823.192128,978.5888,88.48,8.0
1,24.08,4.0,579.8464,96.32,16.0,13962.701312,2319.3856,385.28,64.0
2,15.81,2.0,249.9561,31.62,4.0,3951.805941,499.9122,63.24,8.0
3,40.55,2.0,1644.3025,81.1,4.0,66676.466375,3288.605,162.2,8.0
4,14.48,2.0,209.6704,28.96,4.0,3036.027392,419.3408,57.92,8.0


In [27]:
## Same view for the transformed testing features
dfXtestPoly = pd.DataFrame(data=X_test_Poly)

dfXtestPoly.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,20.69,5.0,428.0761,103.45,25.0,8856.894509,2140.3805,517.25,125.0
1,12.48,2.0,155.7504,24.96,4.0,1943.764992,311.5008,49.92,8.0
2,25.0,4.0,625.0,100.0,16.0,15625.0,2500.0,400.0,64.0
3,35.26,4.0,1243.2676,141.04,16.0,43837.615576,4973.0704,564.16,64.0
4,9.94,2.0,98.8036,19.88,4.0,982.107784,197.6072,39.76,8.0


In [28]:
## Ordinary least squares on the expanded features; the model fits its own
## intercept, which pairs with the bias-free polynomial expansion
ModelPF = LinearRegression(fit_intercept=True)

In [29]:
## Train the regression on the polynomial training features
ModelPF.fit(X=dfXtrainPoly, y=y_train)

In [30]:
## Predict tips for both splits (training first, then testing)
PF_train, PF_test = (
    ModelPF.predict(features) for features in (dfXtrainPoly, dfXtestPoly)
)

## Evaluation Metrics Calculation

**Training Set**

In [31]:
## R2 score on the training set (polynomial-feature model)

r2_train_PF = r2_score(y_true=y_train, y_pred=PF_train)
r2_train_PF

0.506035348297863

In [32]:
## Mean absolute error on the training set (polynomial-feature model)

MAE_train_PF = mean_absolute_error(y_true=y_train, y_pred=PF_train)
MAE_train_PF

0.711362865389724

In [33]:
## Mean squared error on the training set (polynomial-feature model)

MSE_train_PF = mean_squared_error(y_true=y_train, y_pred=PF_train)
MSE_train_PF

0.9628467677265626

In [34]:
## Root Mean Squared Error for Training Set with Polynomial Features:
## the square root of the training MSE computed above

RMSE_train_PF = np.sqrt(MSE_train_PF)
RMSE_train_PF

0.9812475568003024

**Testing Set**

In [35]:
## R2 score on the testing set (polynomial-feature model)

r2_test_PF = r2_score(y_true=y_test, y_pred=PF_test)
r2_test_PF

0.5027046173708265

In [36]:
## Mean absolute error on the testing set (polynomial-feature model)

MAE_test_PF = mean_absolute_error(y_true=y_test, y_pred=PF_test)
MAE_test_PF

0.6604453282066798

In [37]:
## Mean squared error on the testing set (polynomial-feature model)

MSE_test_PF = mean_squared_error(y_true=y_test, y_pred=PF_test)
MSE_test_PF

0.8074932035777949

In [38]:
## Root Mean Squared Error for Testing Set with Polynomial Features:
## the square root of the testing MSE computed above

RMSE_test_PF = np.sqrt(MSE_test_PF)
RMSE_test_PF

0.8986062561421408

In [39]:
## Collect every metric into one table: one row per metric, one column per split
EvalScorePF = dict(
    Training=[r2_train_PF, MAE_train_PF, MSE_train_PF, RMSE_train_PF],
    Testing=[r2_test_PF, MAE_test_PF, MSE_test_PF, RMSE_test_PF],
)

Evaldf = pd.DataFrame(data=EvalScorePF, index=['R Squared', 'MAE', 'MSE', 'RMSE'])
Evaldf

Unnamed: 0,Training,Testing
R Squared,0.506035,0.502705
MAE,0.711363,0.660445
MSE,0.962847,0.807493
RMSE,0.981248,0.898606
