In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the tips dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="tips.csv")
print(data.head(n=5))

   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4


In [2]:
# Bubble scatter of tip against total bill: marker size follows party size,
# colour follows the day of the week, with an OLS trendline overlaid.
figure = px.scatter(
    data_frame=data,
    x="total_bill",
    y="tip",
    color="day",
    size="size",
    opacity=0.5,
    trendline="ols",
    title="Total Bill vs Tip by Day",
    width=1200,
    height=400,
)
figure.show()

In [3]:
# Same bill-vs-tip bubble scatter, coloured by the `sex` column.
# A title is set so the figure can be read on its own, matching the
# "Total Bill vs Tip by Day" chart.
figure = px.scatter(
    data_frame=data,
    x="total_bill",
    y="tip",
    size="size",
    color="sex",
    trendline="ols",
    title="Total Bill vs Tip by Sex",
)
figure.show()

In [4]:
# Same bill-vs-tip bubble scatter, coloured by the `time` column.
# A title is set so the figure can be read on its own, matching the
# "Total Bill vs Tip by Day" chart.
figure = px.scatter(
    data_frame=data,
    x="total_bill",
    y="tip",
    size="size",
    color="time",
    trendline="ols",
    title="Total Bill vs Tip by Time",
)
figure.show()

In [5]:
# Donut chart of tip amounts grouped by day of the week.
# The title states what the slices measure; without it the chart is ambiguous.
figure = px.pie(
    data,
    values="tip",
    names="day",
    hole=0.5,
    title="Share of Tips by Day",
    height=500,
    width=500,
)
figure.show()

In [6]:
# Donut chart of tip amounts grouped by the `sex` column.
# The title states what the slices measure; without it the chart is ambiguous.
figure = px.pie(
    data,
    values="tip",
    names="sex",
    hole=0.5,
    title="Share of Tips by Sex",
    height=500,
    width=500,
)
figure.show()

## Transforming categorical values into numerical values

In [7]:
# Replace each categorical label with an integer code so the columns can be
# fed to the regression model.
category_codes = {
    "sex": {"Female": 0, "Male": 1},
    "smoker": {"No": 0, "Yes": 1},
    "day": {"Thur": 0, "Fri": 1, "Sat": 2, "Sun": 3},
    "time": {"Lunch": 0, "Dinner": 1},
}
for column, codes in category_codes.items():
    # Mapping an already-encoded column through the string-keyed dict would
    # turn every value into NaN, so skip numeric columns. This keeps the cell
    # safe to re-run.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    # Labels missing from the mapping would silently become NaN; fail loudly
    # here instead of later inside model fitting.
    unknown = set(data[column].dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(
            f"Unexpected values in column {column!r}: {sorted(unknown, key=str)}"
        )
    data[column] = data[column].map(codes)
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,3,1,2
1,10.34,1.66,1,0,3,1,3
2,21.01,3.5,1,0,3,1,3
3,23.68,3.31,1,0,3,1,2
4,24.59,3.61,0,0,3,1,4


## Machine Learning Model

In [8]:
# Splitting data into training and test sets
# The column order chosen here is the order the model expects at predict time.
x = np.array(data[["total_bill", "sex", "smoker", "day", "time", "size"]])
y = np.array(data["tip"])

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
# (train_test_split is imported in the notebook's import cell.)
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [9]:
# Training the model to predict waiter tips
# Fit a linear regression on the training split only.
# (LinearRegression is imported in the notebook's import cell.)
model = LinearRegression()
model.fit(xtrain, ytrain)

In [10]:
# Predict the tip for one hand-built example.
# Values follow the training column order:
#   total_bill, sex, smoker, day, time, size
features = np.array([34.50, 1, 0, 3, 1, 5]).reshape(1, -1)
model.predict(features)

array([5.1051877])

For the following scenario, the model predicts a tip of about $5.11:

- a total bill of $34.50 (including tax)
- a male bill payer (`sex` = 1)
- a non-smoking table (`smoker` = 0)
- on a Sunday (`day` = 3)
- at dinner (`time` = 1)
- a party of 5 patrons (`size` = 5)
