In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
import preprocessing as ps
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

warnings.filterwarnings('ignore')
sns.set_theme(style="whitegrid", palette="Set2")


In [3]:
df=pd.read_csv('../datasets/clean_dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_per_person,tip_per_person,big_party
0,0,16.99,1.01,0,0,2,0,2,8.495,0.505,0
1,1,10.34,1.66,1,0,2,0,3,3.446667,0.553333,0
2,2,21.01,3.5,1,0,2,0,3,7.003333,1.166667,0
3,3,23.68,3.31,1,0,2,0,2,11.84,1.655,0
4,4,24.59,3.61,0,0,2,0,4,6.1475,0.9025,1


In [4]:
df=df.drop(columns=['Unnamed: 0'],axis=1)

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_per_person,tip_per_person,big_party
0,16.99,1.01,0,0,2,0,2,8.495,0.505,0
1,10.34,1.66,1,0,2,0,3,3.446667,0.553333,0
2,21.01,3.5,1,0,2,0,3,7.003333,1.166667,0
3,23.68,3.31,1,0,2,0,2,11.84,1.655,0
4,24.59,3.61,0,0,2,0,4,6.1475,0.9025,1


In [12]:
df.describe()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_per_person,tip_per_person,big_party
count,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0,239.0
mean,19.191799,2.917782,0.635983,0.380753,1.748954,0.284519,2.539749,7.784503,1.199364,0.179916
std,7.9722,1.230622,0.482163,0.486591,0.932461,0.452132,0.924395,2.819597,0.470868,0.384924
min,3.07,1.0,0.0,0.0,0.0,0.0,1.0,2.875,0.4,0.0
25%,13.275,2.0,0.0,0.0,1.0,0.0,2.0,5.79375,0.86,0.0
50%,17.51,2.75,1.0,0.0,2.0,0.0,2.0,7.155,1.1,0.0
75%,23.815,3.5,1.0,1.0,3.0,1.0,3.0,9.135,1.5,0.0
max,44.3,7.58,1.0,1.0,3.0,1.0,6.0,20.275,2.925,1.0


In [6]:
x = df.drop(columns=['tip'])  # Features (all columns except 'price')
y = df['tip'] 

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [8]:
# Train Model Function (updated for regression)
def train_model(model, model_name, x_train, y_train, x_test, y_test, is_classification=True):
    
    # Fit the model on the training data
    model.fit(x_train, y_train)
    
    # Predictions on the training data
    y_train_pred = model.predict(x_train)
    # Predictions on the testing data
    y_test_pred = model.predict(x_test)
    
    # Collecting results in a dictionary
    model_results = {
        'model': model_name
    }
    
    if is_classification:
        # Classification metrics
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        model_results['train_accuracy'] = train_accuracy
        model_results['test_accuracy'] = test_accuracy
        
        # Generate classification report for testing data (optional)
        report = classification_report(y_test, y_test_pred, output_dict=True)
        model_results['classification_report'] = report
    else:
        # Regression metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        test_mse = mean_squared_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        model_results['train_mse'] = train_mse
        model_results['test_mse'] = test_mse
        model_results['train_r2'] = train_r2
        model_results['test_r2'] = test_r2
    
    return model_results

# List of models (for regression or classification tasks)
model_list = dict(
    linear_regression=LinearRegression(),
    ridge=Ridge(alpha=1.0),
    lasso=Lasso(alpha=0.1),
    svr=SVR(kernel='rbf'),
    decision_tree=DecisionTreeRegressor(random_state=42),
    random_forest=RandomForestRegressor(n_estimators=50, random_state=42),
    xgboost=xgb.XGBRegressor(max_depth=1),
    gradientboost=GradientBoostingRegressor(n_estimators=100, random_state=42),
    adaboost=AdaBoostRegressor(n_estimators=100, random_state=42),  # Added AdaBoost
)

# Assume you have your x_train, x_test, y_train, and y_test already defined
results = []
for key, value in model_list.items():
    # Check if classification or regression model and train accordingly
    is_classification =False  # change as per your use-case for classification vs regression
    model_results = train_model(value, key, x_train, y_train, x_test, y_test, is_classification)
    results.append(model_results)

# Convert the results into a DataFrame
results_df = pd.DataFrame(results)

# Print or return the results DataFrame
results_df

Unnamed: 0,model,train_mse,test_mse,train_r2,test_r2
0,linear_regression,0.079441,0.077477,0.951384,0.937916
1,ridge,0.080674,0.084271,0.95063,0.932471
2,lasso,0.159204,0.185673,0.902571,0.851215
3,svr,0.789408,0.621116,0.516903,0.502282
4,decision_tree,0.0,0.121406,1.0,0.902714
5,random_forest,0.031715,0.084878,0.980591,0.931985
6,xgboost,0.058207,0.097312,0.964379,0.922021
7,gradientboost,0.002157,0.032882,0.99868,0.97365
8,adaboost,0.092198,0.191529,0.943578,0.846522


In [11]:
x.iloc[0,:]

total_bill         16.990
sex                 0.000
smoker              0.000
day                 2.000
time                0.000
size                2.000
bill_per_person     8.495
tip_per_person      0.505
big_party           0.000
Name: 0, dtype: float64

In [14]:
df.describe().loc[['min', 'max']]


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,bill_per_person,tip_per_person,big_party
min,3.07,1.0,0.0,0.0,0.0,0.0,1.0,2.875,0.4,0.0
max,44.3,7.58,1.0,1.0,3.0,1.0,6.0,20.275,2.925,1.0


In [15]:
import joblib  # or import pickle
model = LinearRegression()
model.fit(x_train, y_train)



In [18]:
# Save model to a file
joblib.dump(model, 'linear_model.pkl')

['linear_model.pkl']

In [19]:
loaded_model = joblib.load('linear_model.pkl')


In [21]:
predictions = loaded_model.predict(x_test)
print(predictions[:5])  # Display first 5 predictions

[3.22084079 2.01888439 3.27300537 3.0724809  1.49045226]


In [22]:
y_test[:5]

24     3.18
6      2.00
93     3.25
109    3.00
104    1.64
Name: tip, dtype: float64