In [1]:
#Import packages
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
from pycaret.regression import *

In [2]:
#Import data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
#Want to see correlation
corr_matrix = train_data.select_dtypes(include=['number']).corr()
corr_price = corr_matrix['Price'].sort_values(ascending=False)
print(corr_price)

Price                   1.000000
Weight Capacity (kg)    0.018018
id                      0.002027
Compartments           -0.000131
Name: Price, dtype: float64


In [5]:
train_data.drop('id',axis='columns')

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760,39.17320
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...
299995,Adidas,Leather,Small,9.0,No,No,Tote,Blue,12.730812,129.99749
299996,Jansport,Leather,Large,6.0,No,Yes,Tote,Blue,26.633182,19.85819
299997,Puma,Canvas,Large,9.0,Yes,Yes,Backpack,Pink,11.898250,111.41364
299998,Adidas,Nylon,Small,1.0,No,Yes,Tote,Pink,6.175738,115.89080


In [6]:
experiment = setup(data=train_data, target='Price', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Price
2,Target type,Regression
3,Original data shape,"(300000, 11)"
4,Transformed data shape,"(300000, 27)"
5,Transformed train set shape,"(210000, 27)"
6,Transformed test set shape,"(90000, 27)"
7,Numeric features,3
8,Categorical features,7
9,Rows with missing values,17.8%


In [7]:
#best_model = compare_models(n_select=5)

In [12]:
bayr = create_model('br')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33.7167,1515.9998,38.9358,0.0008,0.598,0.6839
1,33.8011,1523.7899,39.0358,0.0008,0.5984,0.6839
2,33.7637,1521.7777,39.01,0.0007,0.6,0.6877
3,33.9177,1530.1378,39.117,0.0009,0.6035,0.6955
4,33.5438,1506.5481,38.8143,0.0009,0.5925,0.674
5,33.8326,1527.9672,39.0892,0.0017,0.5991,0.6855
6,33.8016,1521.7052,39.009,0.0003,0.5961,0.6803
7,33.5752,1506.569,38.8145,0.0014,0.5918,0.6719
8,33.6529,1515.8496,38.9339,0.0005,0.598,0.6838
9,33.8214,1528.4136,39.0949,0.001,0.5995,0.6858


In [13]:
tuned_model = tune_model(bayr,optimize='RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33.7167,1515.9978,38.9358,0.0008,0.598,0.6839
1,33.801,1523.7941,39.0358,0.0008,0.5984,0.6839
2,33.7636,1521.7823,39.01,0.0007,0.6,0.6877
3,33.9175,1530.1298,39.1169,0.0009,0.6035,0.6955
4,33.5436,1506.54,38.8142,0.0009,0.5925,0.674
5,33.8325,1527.9565,39.0891,0.0017,0.5991,0.6855
6,33.8016,1521.7135,39.0091,0.0003,0.5961,0.6803
7,33.5751,1506.5627,38.8145,0.0014,0.5918,0.6719
8,33.6528,1515.8533,38.934,0.0005,0.598,0.6838
9,33.8213,1528.4157,39.095,0.001,0.5995,0.6858


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [14]:
final_model = finalize_model(tuned_model)

In [12]:
train_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [15]:
save_model(final_model, 'final backpack carotmodel2')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['id', 'Compartments',
                                              'Weight Capacity (kg)'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['Brand', 'Material', 'Size',
                                              'Laptop Compartment', 'Waterproof',
                                              'Style', 'Color'],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('o...
 Yes    1
 NaN   -1
 dtype: int64}]))),
                 ('onehot_encoding',
                  TransformerWrapper(include=['Brand', 'Material', 'Size',
                                              'Style', 'Color'],
                                     transformer=OneHotEncoder(cols=['Brand',
                                        

In [10]:
predictions = predict_model(final_model,data=test_data)

In [14]:
submission = pd.DataFrame({
    'id':test_data['id'],
    'Price':predictions['prediction_label']
})

In [18]:
submission

Unnamed: 0,id,Price
0,300000,82.070216
1,300001,82.358069
2,300002,81.643198
3,300003,81.827306
4,300004,78.679870
...,...,...
199995,499995,80.603221
199996,499996,81.189817
199997,499997,83.471162
199998,499998,81.888697


In [19]:
submission.to_csv('submission.csv', index=False)