<a href="https://colab.research.google.com/github/chandan3324/Machine-Learning/blob/main/17_Backpack_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Backpack Prediction Challenge

---



### Step 1 - Data Ingestion

In [44]:
import pandas as pd
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


### Step 2 - Perform Basic Data Quality Checks

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [46]:
df.isna().sum()

Unnamed: 0,0
id,0
Brand,9705
Material,8347
Size,6595
Compartments,0
Laptop Compartment,7444
Waterproof,7050
Style,7970
Color,9950
Weight Capacity (kg),138


In [47]:
df.duplicated().sum()

0

### Seperating X and Y

In [48]:
df.columns

Index(['id', 'Brand', 'Material', 'Size', 'Compartments', 'Laptop Compartment',
       'Waterproof', 'Style', 'Color', 'Weight Capacity (kg)', 'Price'],
      dtype='object')

In [49]:
X = df.drop(columns=['id', 'Price'])
Y = df[['Price']]

In [50]:
X.head()

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338


In [51]:
Y.head()

Unnamed: 0,Price
0,112.15875
1,68.88056
2,39.1732
3,80.60793
4,86.02312


### Step 4 - Apply Preprocessing on X

In [52]:
X.dtypes

Unnamed: 0,0
Brand,object
Material,object
Size,object
Compartments,float64
Laptop Compartment,object
Waterproof,object
Style,object
Color,object
Weight Capacity (kg),float64


In [53]:
cat = list(X.columns[X.dtypes == 'object'])
con = list(X.columns[X.dtypes != 'object'])

In [54]:
cat

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [55]:
con

['Compartments', 'Weight Capacity (kg)']

In [56]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [57]:
num_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

In [58]:
cat_pipe = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)

In [59]:
pre = ColumnTransformer(
    [
        ('num', num_pipe, con),
        ('cat', cat_pipe, cat)
    ]
).set_output(transform='pandas')

In [60]:
X_pre = pre.fit_transform(X)

In [61]:
X_pre.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
0,0.538408,-0.921466,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.576198,1.299086,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-1.19124,-0.199023,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.884338,-0.731166,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.53717,-0.040296,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Step 5 - Train Test Split

In [62]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.2, random_state=46)

In [63]:
xtrain.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
161325,-0.84531,-1.403619,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
163496,-1.19124,-0.949336,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
261697,0.884338,1.512853,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
82951,-0.84531,1.176341,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
262857,-0.499381,-0.815491,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [64]:
ytrain.head()

Unnamed: 0,Price
161325,87.75687
163496,134.04751
261697,99.73909
82951,124.35109
262857,28.86967


In [65]:
xtest.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
90250,1.576198,1.180625,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
68766,-0.153451,1.27036,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
28522,-0.499381,0.308253,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11059,0.884338,-1.870704,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
101803,-1.53717,-0.877619,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [66]:
ytest.head()

Unnamed: 0,Price
90250,131.7932
68766,90.45775
28522,45.84078
11059,115.80788
101803,15.17464


In [67]:
xtrain.shape

(240000, 27)

In [68]:
xtest.shape

(60000, 27)

In [69]:
ytrain.shape

(240000, 1)

In [70]:
ytest.shape

(60000, 1)

In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [77]:
models = [
    LinearRegression(),
    DecisionTreeRegressor(),
    RandomForestRegressor(max_depth=3)
]

In [78]:
from sklearn.metrics import root_mean_squared_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

def evaluate_single_model(model, xtrain, ytrain, xtest, ytest):

    # Fit the model
    model.fit(xtrain,ytrain)

    # Predict the results for train and test
    ypred_train = model.predict(xtrain)
    ypred_test = model.predict(xtest)

    #Calculate the regression metrics for train and test
    rmse_train = root_mean_squared_error(ytrain, ypred_train)
    rmse_test = root_mean_squared_error(ytest, ypred_test)

    mse_train = mean_squared_error(ytrain, ypred_train)
    mse_test = mean_squared_error(ytest, ypred_test)

    r2_train = r2_score(ytrain, ypred_train)
    r2_test = r2_score(ytest, ypred_test)

    # 5-fold cross-validation on training data for R²
    scores = cross_val_score(model, xtrain, ytrain, cv=5, scoring="r2", n_jobs=-1)
    r2_cv = scores.mean()

    # Create a dictionary for the final results
    res = {
        "model_name": model.__class__.__name__,
        "model": model,
        "rmse_train": rmse_train,
        "mse_train": mse_train,
        "r2_train": r2_train,
        "rmse_test": rmse_test,
        "mse_test": mse_test,
        "r2_test": r2_test,
        "r2_cv": r2_cv
    }
    return res

In [79]:
def algo_evaluation(models : list, xtrain, ytrain, xtest, ytest):

    # Initialize blank list for results
    results = []

    # Apply for loop on models
    for model in models:
        r = evaluate_single_model(model, xtrain, ytrain, xtest, ytest)
        print(r)
        results.append(r)

    # Save the results in dataframe
    res_df = pd.DataFrame(results)

    # Sort the results
    sort_df = res_df.sort_values(by="rmse_test").reset_index(drop=True)

    # Get the best model
    best_model = sort_df.iloc[0]["model"]

    return sort_df.round(4), best_model

In [80]:
models

[LinearRegression(),
 DecisionTreeRegressor(),
 RandomForestRegressor(max_depth=3)]

In [81]:
res_df, best_model = algo_evaluation(models, xtrain.values, ytrain.values, xtest.values, ytest.values)

{'model_name': 'LinearRegression', 'model': LinearRegression(), 'rmse_train': 39.01136573962545, 'mse_train': 1521.8866568708222, 'r2_train': 0.001126261047693422, 'rmse_test': 39.0461598263179, 'mse_test': 1524.6025971823617, 'r2_test': 0.0008596000487532951, 'r2_cv': 0.0009405582096783105}
{'model_name': 'DecisionTreeRegressor', 'model': DecisionTreeRegressor(), 'rmse_train': 0.6948837247578419, 'mse_train': 0.4828633909333322, 'r2_train': 0.9996830778701375, 'rmse_test': 56.12773866800058, 'mse_test': 3150.323047983368, 'r2_test': -1.0645478605079655, 'r2_cv': -1.0572947360788234}


  return fit_method(estimator, *args, **kwargs)


{'model_name': 'RandomForestRegressor', 'model': RandomForestRegressor(max_depth=3), 'rmse_train': 39.00873135770984, 'mse_train': 1521.6811221379749, 'r2_train': 0.0012611615320078373, 'rmse_test': 39.040611055065426, 'mse_test': 1524.169311552897, 'r2_test': 0.0011435515374341731, 'r2_cv': 0.0008752214677744119}


In [82]:
res_df

Unnamed: 0,model_name,model,rmse_train,mse_train,r2_train,rmse_test,mse_test,r2_test,r2_cv
0,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=3, max_featur...",39.0087,1521.6811,0.0013,39.0406,1524.1693,0.0011,0.0009
1,LinearRegression,LinearRegression(),39.0114,1521.8867,0.0011,39.0462,1524.6026,0.0009,0.0009
2,DecisionTreeRegressor,DecisionTreeRegressor(),0.6949,0.4829,0.9997,56.1277,3150.323,-1.0645,-1.0573


In [83]:
best_model

### From above metrics we can say Random Forest is the best model here evaluated using RMSE.
### We can use it for Out of Sample Prediction

In [84]:
xnew = pd.read_csv("test.csv")
xnew.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [None]:
pre

In [85]:
xnew_pre = pre.transform(xnew)
xnew_pre.head()

Unnamed: 0,num__Compartments,num__Weight Capacity (kg),cat__Brand_Adidas,cat__Brand_Jansport,cat__Brand_Nike,cat__Brand_Puma,cat__Brand_Under Armour,cat__Material_Canvas,cat__Material_Leather,cat__Material_Nylon,...,cat__Waterproof_Yes,cat__Style_Backpack,cat__Style_Messenger,cat__Style_Tote,cat__Color_Black,cat__Color_Blue,cat__Color_Gray,cat__Color_Green,cat__Color_Pink,cat__Color_Red
0,-1.19124,0.379185,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.538408,-0.641165,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.230268,-0.893029,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.53717,0.064179,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.19124,-1.166074,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [86]:
preds = best_model.predict(xnew_pre)
preds



array([81.53269713, 82.4008103 , 82.36433828, ..., 81.52358569,
       81.50504165, 80.81079622])

In [87]:
res = xnew[["id"]]
res

Unnamed: 0,id
0,300000
1,300001
2,300002
3,300003
4,300004
...,...
199995,499995
199996,499996
199997,499997
199998,499998


In [89]:
res['Price'] = preds.round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['Price'] = preds.round(2)


In [90]:
res

Unnamed: 0,id,Price
0,300000,81.53
1,300001,82.40
2,300002,82.36
3,300003,81.51
4,300004,79.43
...,...,...
199995,499995,80.43
199996,499996,78.33
199997,499997,81.52
199998,499998,81.51


In [91]:
res.to_csv("Submission.csv", index=False)