In [6]:
# Tips dataset
import seaborn as sns
import numpy as np
import sys
import pandas as pd
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
df = sns.load_dataset('tips')

In [7]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [8]:
# Define X (features) - use the original DataFrame
X = df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y = df['total_bill'] # Target

In [9]:
# Define ALL categorical features to be handled by the CT
categorical_features = ['sex', 'smoker', 'day', 'time']

In [10]:
ct = ColumnTransformer(
    transformers=[
        # Apply OHE to all categorical features at once (drop='first' handles binary/multi-class well)
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # 'tip' and 'size' are passed through
)

In [12]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Fit and transform the training data using the new 'ct'
X_train_processed = ct.fit_transform(X_train) 

# Transform the test data using the fitted 'ct'
X_test_processed = ct.transform(X_test)

In [14]:
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.svm import SVR

param_grid = {
              'C':[0.1, 1, 10, 100, 1000],
              'gamma':[1,0.1,0.01, 0.001, 0.0001],
              'kernel':['rbf']
              }

# 'grid' is the trained model object
grid = GridSearchCV(SVR(), param_grid, refit=True, verbose=2)
grid.fit(X_train_processed, y_train)

print(f"\nBest Parameters found: {grid.best_params_}")
print(f"Best R2 Score found during training: {grid.best_score_:.2f}")

# Evaluate on test data
grid_predicted_test = grid.predict(X_test_processed)
print(f"R2 Score on Test Data: {r2_score(y_test, grid_predicted_test):.2f}")
print(f"MAE on Test Data: {mean_absolute_error(y_test, grid_predicted_test):.2f}")

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.0

In [15]:
# The new input data MUST match the column names of X
new_input_data = pd.DataFrame([
    # [tip, sex, smoker, day, time, size]
    [3.5, 'Male', 'Yes', 'Sun', 'Dinner', 3] 
], columns=['tip', 'sex', 'smoker', 'day', 'time', 'size'])

# Transform the new data using the FITTED transformer 'ct'
new_input_data_transformed = ct.transform(new_input_data)

print("\nTransformed New Data Array (All Numbers):")
print(new_input_data_transformed)

# Make the prediction using the trained 'grid' model
predicted_bill = grid.predict(new_input_data_transformed)

print(f"\nFor the new input data, the predicted total bill is: ${predicted_bill[0]:.2f}")


Transformed New Data Array (All Numbers):
[[1.  1.  0.  1.  0.  0.  3.5 3. ]]

For the new input data, the predicted total bill is: $22.52
