# Pipeline using sklearn for Big Basket Food Delivery Service

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OrdinalEncoder,
    StandardScaler,
)
set_config(display='diagram')

# Load the raw data (the file is read as latin-1) and drop the free-text
# name/location columns that are not used as features.
df = pd.read_csv('Big Basket Food Delivery.csv', encoding='latin-1')
df = df.drop(['Restaurant Name', 'City', 'Locality', 'Locality Verbose'], axis=1)

# Impute missing values in the 'Cuisines' column with the mode value.
# fit_transform returns an (n_rows, 1) array; ravel() flattens it so a plain
# 1-D column is assigned back.
cuisine_imputer = SimpleImputer(strategy='most_frequent')
df['Cuisines'] = cuisine_imputer.fit_transform(df[['Cuisines']]).ravel()

# Strip currency symbols and thousands separators, then coerce to numbers.
# Values that still cannot be parsed become NaN (errors='coerce').
# The '$' pattern is a raw string so the regex escape is not also treated as
# an (invalid) Python string escape.
df['Average Cost for two'] = df['Average Cost for two'].replace({r'\$': '', ',': ''}, regex=True)
df['Average Cost for two'] = pd.to_numeric(df['Average Cost for two'], errors='coerce')
df['Price range'] = df['Price range'].replace({'₹': ''}, regex=True)
df['Price range'] = pd.to_numeric(df['Price range'], errors='coerce')

# Impute median value for NaN
avg_cost_imputer = SimpleImputer(strategy='median')
df['Average Cost for two'] = avg_cost_imputer.fit_transform(df[['Average Cost for two']]).ravel()
price_imputer = SimpleImputer(strategy='median')
df['Price range'] = price_imputer.fit_transform(df[['Price range']]).ravel()

# Outlier treatment using the IQR method
def outlier_treatment_iqr(column):
    """Clip outliers to the IQR fences (1.5 * IQR beyond the quartiles).

    Parameters
    ----------
    column : array-like
        Either a 1-D sequence / Series, or a 2-D array / DataFrame laid out as
        (n_samples, n_features). For 2-D input the fences are computed
        separately for every feature (axis 0), so features on different
        scales do not share one global pair of bounds.

    Returns
    -------
    The clipped data, as returned by ``np.clip`` (or ``DataFrame.clip`` for a
    DataFrame). NaN entries are ignored when computing the quartiles and are
    left as NaN in the result.
    """
    # axis=0 keeps one (q1, q3) pair per feature; the nan-aware variant stops a
    # single missing value from turning both bounds into NaN.
    q1, q3 = np.nanpercentile(column, [25, 75], axis=0)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    if isinstance(column, pd.DataFrame):
        # Array-like bounds on a DataFrame need an explicit axis to be matched
        # against the columns.
        return column.clip(lower=lower_bound, upper=upper_bound, axis=1)
    return np.clip(column, lower_bound, upper_bound)

# Convert non-numeric columns to numeric using label encoding.
# A fresh encoder is fitted for every object column and stored in
# `label_encoders` (keyed by column name), so each column's integer codes can
# later be mapped back with `label_encoders[col].inverse_transform(...)`.
label_encoder = LabelEncoder()  # keeps the name bound even if no object column exists
label_encoders = {}
for col in df.columns:
    if df[col].dtype == 'object':
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])
        label_encoders[col] = label_encoder

## Define the preprocessing steps

In [2]:
# Preprocessing for categorical features: fill gaps with the most frequent
# value, then map categories to integer codes.
# OrdinalEncoder is used instead of LabelEncoder: LabelEncoder's
# fit/fit_transform accept only a single 1-D target argument, so it fails when
# a Pipeline calls fit_transform(X, y) on it. Categories not seen during fit
# are encoded as -1 instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('most_frequent_imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Preprocessing for numerical features: median imputation followed by IQR
# clipping. One imputer is enough: it is applied to every column routed to
# this pipeline, so a second identical step would find nothing left to fill.
numerical_transformer = Pipeline(steps=[
    ('median_imputer', SimpleImputer(strategy='median')),
    ('outlier_treatment', FunctionTransformer(func=outlier_treatment_iqr))
])

# Combine the preprocessing steps for categorical and numerical data.
# 'Aggregate rating' is deliberately absent from the numerical list: it is the
# prediction target and is dropped from the feature matrix, so selecting it
# here would make fitting fail on a missing column.
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, ['Address', 'Cuisines', 'Has Table booking', 'Has Online delivery', 'Is delivering now','Switch to order menu','Rating color','Rating text']),
    ('num', numerical_transformer, ['Restaurant ID', 'Country Code', 'Longitude','Latitude','Average Cost for two','Price range','Votes'])
])

# Visualizing the pipeline

In [3]:
# Display the categorical sub-pipeline; set_config(display='diagram') above
# switches estimator display to diagram mode.
categorical_transformer

In [4]:
# Display the numerical sub-pipeline (imputation steps, then IQR clipping).
numerical_transformer

In [5]:
# Display the ColumnTransformer that routes the 'cat' and 'num' column lists
# to their respective sub-pipelines.
preprocessor

# Model building through the pipeline

In [6]:
# Separate the prediction target from the feature matrix
y = df['Aggregate rating']
X = df.drop(columns=['Aggregate rating'])

In [7]:
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore every score computed from it, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Random forest used as the final estimator (100 trees, fixed seed)
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Chain the preprocessing stage and the regressor into a single estimator
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', rf)])

In [9]:
# Display the full pipeline (preprocessor followed by the regressor).
pipeline

In [10]:
# Fit the random forest on the training data.
# NOTE(review): this fits `rf` directly, not `pipeline`, so the `preprocessor`
# stage is not applied to X_train here — confirm whether
# `pipeline.fit(X_train, y_train)` was intended.
rf.fit(X_train, y_train)

In [11]:
# Make predictions on the test data.
# NOTE(review): predictions come from `rf` itself rather than from `pipeline`.
y_pred = rf.predict(X_test)

In [12]:
# Calculate the R-squared score of the test-set predictions
# (r2_score is imported with the other imports at the top of the notebook).
r2 = r2_score(y_test, y_pred)
print('R-squared Score:', r2)

R-squared Score: 0.9878875769302083


In [13]:
# Single-step pipeline wrapping the forest (this one has no preprocessing step)
pipeline = Pipeline(steps=[
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Search space; the `regressor__` prefix routes each parameter to the
# 'regressor' step of the pipeline above
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [5, 10, 15, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
}

# Exhaustive search over the grid with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [13]:
# Pull the winning configuration and its score out of the fitted search,
# then report both
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best parameters: ", best_params)
print("Best score: ", best_score)

Best parameters:  {'regressor__max_depth': 15, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 300}
Best score:  0.9881109895053124
