In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import plotly.express as px
import plotly.graph_objects as go
import scipy.optimize

In [110]:
data = sns.load_dataset("tips")
data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [111]:
features = data[['total_bill']]
tip = data['tip']

f = LinearRegression(fit_intercept=False)
f.fit(features,tip)


0,1,2
,fit_intercept,False
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [112]:
f.predict([[100]])


X does not have valid feature names, but LinearRegression was fitted with feature names



array([14.37318953])

In [113]:
data["predictions"] = f.predict(features)
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,predictions
0,16.99,1.01,Female,No,Sun,Dinner,2,2.442005
1,10.34,1.66,Male,No,Sun,Dinner,3,1.486188
2,21.01,3.50,Male,No,Sun,Dinner,3,3.019807
3,23.68,3.31,Male,No,Sun,Dinner,2,3.403571
4,24.59,3.61,Female,No,Sun,Dinner,4,3.534367
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,4.172537
240,27.18,2.00,Female,Yes,Sat,Dinner,2,3.906633
241,22.67,2.00,Male,Yes,Sat,Dinner,2,3.258402
242,17.82,1.75,Male,No,Sat,Dinner,2,2.561302


In [114]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y=data['tip'],
                    mode='markers',
                    name='Actual Tips'))
fig.add_trace(go.Scatter(x=data['total_bill'], y=data['predictions'],
                    mode='lines', name='Predicted Tips'))
fig.update_layout(title='Total Bill vs Tip with Predictions',
                   xaxis_title='Total Bill')
fig.update_layout(width=800, height=600)
fig.show()

In [115]:
print("Co Eff (Slope): ", f.coef_)
print("intercept: ", f.intercept_)

Co Eff (Slope):  [0.1437319]
intercept:  0.0


In [116]:
f_with_intercept = LinearRegression(fit_intercept=True)
f_with_intercept.fit(features, tip) 

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [117]:
f_with_intercept.predict([[100]])


X does not have valid feature names, but LinearRegression was fitted with feature names



array([11.42272135])

In [118]:
print("Co Eff (Slope): ", f_with_intercept.coef_)
print("intercept: ", f_with_intercept.intercept_)

Co Eff (Slope):  [0.10502452]
intercept:  0.9202696135546731


In [119]:
data['predictions_with_intercept'] = f_with_intercept.predict(features)

fig = go.Figure()
fig.add_trace(go.Scatter(x=data['total_bill'], y=data['tip'],
                    mode='markers',
                    name='Actual Tips'))
fig.add_trace(go.Scatter(x=data['total_bill'], y=data['predictions'],   
                    mode='lines', name='Predicted Tips (No Intercept)'))
fig.add_trace(go.Scatter(x=data['total_bill'], y=data['predictions_with_intercept'],
                    mode='lines', name='Predicted Tips (With Intercept)'))  
fig.update_layout(title='Total Bill vs Tip with Predictions (With and Without Intercept)',
                   xaxis_title='Total Bill')  
fig.update_layout(width=800, height=600)  
fig.show() 

# Loss Function 

In [120]:
data["l2_loss"] = (data['tip'] - data['predictions_with_intercept'])**2
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,predictions,predictions_with_intercept,l2_loss
0,16.99,1.01,Female,No,Sun,Dinner,2,2.442005,2.704636,2.871792
1,10.34,1.66,Male,No,Sun,Dinner,3,1.486188,2.006223,0.119870
2,21.01,3.50,Male,No,Sun,Dinner,3,3.019807,3.126835,0.139252
3,23.68,3.31,Male,No,Sun,Dinner,2,3.403571,3.407250,0.009458
4,24.59,3.61,Female,No,Sun,Dinner,4,3.534367,3.502822,0.011487
...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,4.172537,3.969131,3.805888
240,27.18,2.00,Female,Yes,Sat,Dinner,2,3.906633,3.774836,3.150043
241,22.67,2.00,Male,Yes,Sat,Dinner,2,3.258402,3.301175,1.693057
242,17.82,1.75,Male,No,Sat,Dinner,2,2.561302,2.791807,1.085361


In [121]:
print("Mean Squared Error (no intercept): ", mean_squared_error(data['tip'], data['predictions'] ))
print("Mean Squared Error (with intercept): ", mean_squared_error(data['tip'], data['predictions_with_intercept']))

Mean Squared Error (no intercept):  1.1781161154513171
Mean Squared Error (with intercept):  1.036019442011377


# Optimizint L2 Loss

In [122]:
def mse_given_theta(theta):
    return mean_squared_error(data['total_bill']* theta, data['tip'])

In [123]:
theta = np.linspace(0.1, 0.2, 100)
mse_values = [mse_given_theta(t) for t in theta]    
mse_values

[2.0777683729508194,
 2.0366887534058913,
 1.996569059699077,
 1.9574092918303747,
 1.919209449799786,
 1.8819695336073097,
 1.8456895432529465,
 1.8103694787366964,
 1.7760093400585586,
 1.7426091272185338,
 1.7101688402166224,
 1.678688479052823,
 1.6481680437271375,
 1.6186075342395636,
 1.5900069505901033,
 1.5623662927787565,
 1.5356855608055218,
 1.5099647546704,
 1.4852038743733909,
 1.461402919914495,
 1.4385618912937121,
 1.4166807885110417,
 1.3957596115664843,
 1.37579836046004,
 1.3567970351917082,
 1.3387556357614898,
 1.3216741621693837,
 1.3055526144153906,
 1.2903909924995107,
 1.2761892964217436,
 1.262947526182089,
 1.2506656817805475,
 1.2393437632171185,
 1.2289817704918033,
 1.2195797036046003,
 1.2111375625555103,
 1.2036553473445333,
 1.197133057971669,
 1.1915706944369175,
 1.1869682567402793,
 1.1833257448817533,
 1.1806431588613406,
 1.1789204986790405,
 1.1781577643348538,
 1.1783549558287796,
 1.1795120731608182,
 1.18162911633097,
 1.1847060853392344,
 1.18

In [124]:
fig = px.line(x=theta, y=mse_values,  title='MSE vs Theta')
fig.update_layout(xaxis_title = "θ")
fig.update_layout(yaxis_title = 'Mean Squared Error')
fig.update_layout(width=800, height=600)
fig.show()  

In [125]:
scipy.optimize.minimize(mse_given_theta, x0=0.2)    

  message: Optimization terminated successfully.
  success: True
   status: 0
      fun: 1.1781161154513287
        x: [ 1.437e-01]
      nit: 1
      jac: [ 2.384e-06]
 hess_inv: [[1]]
     nfev: 6
     njev: 3

# Multiple dimentional linear regression

In [126]:
# Multiple dimensional linear regression
features = ['total_bill', 'size']
tip = data['tip']
f2 = LinearRegression(fit_intercept=False)
f2.fit(data[features], tip)

0,1,2
,fit_intercept,False
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [127]:
print ("Coefficients (no intercept) (Slope): ", f2.coef_)

f2.predict([[10, 3]])

Coefficients (no intercept) (Slope):  [0.1007119  0.36209717]



X does not have valid feature names, but LinearRegression was fitted with feature names



array([2.09341054])

In [128]:
data["prediction_2d"] = f2.predict(data[features])
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,predictions,predictions_with_intercept,l2_loss,prediction_2d
0,16.99,1.01,Female,No,Sun,Dinner,2,2.442005,2.704636,2.871792,2.435290
1,10.34,1.66,Male,No,Sun,Dinner,3,1.486188,2.006223,0.119870,2.127653
2,21.01,3.50,Male,No,Sun,Dinner,3,3.019807,3.126835,0.139252,3.202249
3,23.68,3.31,Male,No,Sun,Dinner,2,3.403571,3.407250,0.009458,3.109052
4,24.59,3.61,Female,No,Sun,Dinner,4,3.534367,3.502822,0.011487,3.924894
...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,4.172537,3.969131,3.805888,4.009958
240,27.18,2.00,Female,Yes,Sat,Dinner,2,3.906633,3.774836,3.150043,3.461544
241,22.67,2.00,Male,Yes,Sat,Dinner,2,3.258402,3.301175,1.693057,3.007333
242,17.82,1.75,Male,No,Sat,Dinner,2,2.561302,2.791807,1.085361,2.518880


In [129]:
print("Mean Squared Error (1d modal): ", mean_squared_error(data["predictions"], data["tip"]))
print("Mean Squared Error (2d modal): ", mean_squared_error(data["prediction_2d"], data["tip"]))

Mean Squared Error (1d modal):  1.1781161154513171
Mean Squared Error (2d modal):  1.06482122862577


In [130]:
px.scatter_3d(data, x="total_bill", y="size", z="tip", color="prediction_2d", title="3D Scatter plot of Tips Prediction", width=800, height=600)

## Mesh Grid for 3D surface plot

In [142]:
table_bills, table_size = np.meshgrid(range(50), range(6))
tip_predictions = (0.1007119 * table_bills + 0.3621 * table_size)

fig = go.Figure()
fig.add_trace(go.Scatter3d(x=data["total_bill"], y=data["size"], z=data["tip"], mode='markers', name='Actual Tips'))
fig.add_trace(go.Surface(x=table_bills, y=table_size, z=tip_predictions,  name='Predicted Tips (1D Model)'))
fig.update_layout(title='3D Scatter plot of Actual vs Predicted Tips', width=800, height=600)
fig.show()


In [147]:
px.scatter(data, x="total_bill", y="tip", title="Tips Prediction with 2D Linear Regression", width=800, height=600, trendline="ols")

In [146]:
px.scatter(data, x="total_bill", y="tip", color="day", title="Tips Prediction with 2D Linear Regression", width=800, height=600, trendline="ols")