In [21]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures

In [2]:
# Load the training and test sets from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first rows of the training set (features plus the Price target).
train_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [4]:
# Column dtypes and non-null counts; used to spot which columns have missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    300000 non-null  int64  
 1   Brand                 290295 non-null  object 
 2   Material              291653 non-null  object 
 3   Size                  293405 non-null  object 
 4   Compartments          300000 non-null  float64
 5   Laptop Compartment    292556 non-null  object 
 6   Waterproof            292950 non-null  object 
 7   Style                 292030 non-null  object 
 8   Color                 290050 non-null  object 
 9   Weight Capacity (kg)  299862 non-null  float64
 10  Price                 300000 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 25.2+ MB


In [5]:
# Keep the test-set ids: the 'id' column is dropped from the model features
# later on, but it is needed again to build the submission frame.
test_ids = test_data['id']

In [6]:
# Inspect the target column on its own: non-null count and dtype.
train_data['Price'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 300000 entries, 0 to 299999
Series name: Price
Non-Null Count   Dtype  
--------------   -----  
300000 non-null  float64
dtypes: float64(1)
memory usage: 2.3 MB


In [7]:
# Preview the first rows of the test set (same feature columns, no Price).
test_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [8]:
# NOTE(review): this repeats the earlier `test_ids = test_data['id']` cell;
# test_data is not modified in between in the visible cells, so this
# re-assignment looks redundant — confirm and drop one of the two.
test_ids = test_data['id']

In [9]:
# Identifier columns carry no predictive signal and are removed from both sets.
columns_to_drop = ['id']

# Target first, then the feature matrix (everything except ids and the target).
y_train = train_data['Price']
X_train = train_data.drop(columns=[*columns_to_drop, 'Price'])

# The test set has no Price column, so only the identifier columns are dropped.
X_test = test_data.drop(columns=columns_to_drop)

In [10]:
# Numeric columns (int64 / float64 dtypes) go through the numeric transformer.
numeric_features = X_train.select_dtypes(
    include=['int64', 'float64']
).columns

# Object-dtype columns are treated as categorical features.
categorical_features = X_train.select_dtypes(
    include=['object']
).columns

In [22]:
# Numeric branch: fill missing values with the column median, then expand the
# columns with degree-2 polynomial terms (no constant bias column).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

In [12]:
# Categorical branch:
#   1. imputer - missing values are replaced with the most frequent value of
#      the column, so the encoder never sees NaN.
#   2. onehot  - one-hot encode; handle_unknown='ignore' means a category that
#      was not seen during fit is encoded as all zeros instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [13]:
# Route each group of columns to its own transformer: numeric columns to the
# numeric pipeline, categorical columns to the categorical pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [14]:
# End-to-end model: preprocessing followed by a gradient-boosting regressor.
# random_state is fixed so that re-running the notebook reproduces the same
# fitted model (the same seed value is used for the tuning subsample split).
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])

In [15]:
# Baseline fit of the full pipeline on the whole training set, before any
# hyperparameter tuning.
pipeline.fit(X_train, y_train)

In [16]:
# The pipeline was already fitted in the previous cell and Pipeline.fit()
# returns the pipeline itself, so alias it here instead of paying for a
# second fit on the full training set.
final_model = pipeline

In [17]:
# Test-set predictions from the untuned pipeline.
# NOTE(review): `predictions` is not used by the submission cell visible in
# this notebook (that one predicts with best_model) — confirm whether this
# baseline output is still needed.
predictions = final_model.predict(X_test)

In [23]:
# Hyperparameter tuning for the 'regressor' step of the pipeline.
# train_test_split and GridSearchCV come from the import cell at the top of
# the notebook; they are not re-imported here.

# Tune on a 10% subsample of the training data to keep the search fast.
# The split is seeded so the same subsample is drawn on every run.
X_train_small, _, y_train_small, _ = train_test_split(
    X_train, y_train, train_size=0.1, random_state=42
)

# Candidate values; the 'regressor__' prefix addresses the pipeline step
# named 'regressor'.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.1, 0.2],
    'regressor__max_depth': [3, 5],
}

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean squared error (higher is better), using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Run the search on the 10% subsample only, not on the full training set.
grid_search.fit(X_train_small, y_train_small)

print("Best Hyperparameters:", grid_search.best_params_)

# best_estimator_ was fitted on the subsample only; refit that pipeline on
# the full training set so the final model sees all the data.
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

Best Hyperparameters: {'regressor__learning_rate': 0.1, 'regressor__max_depth': 3, 'regressor__n_estimators': 100}


In [24]:
# Predict test-set prices with the tuned model.
test_predictions = best_model.predict(X_test)

# Pair every prediction with its original test id and write the submission file.
submission = pd.DataFrame({'id': test_ids, 'Price': test_predictions})
submission.to_csv('submission7.csv', index=False)