In [1]:
# Uncomment the line below to install the required packages
# %pip install pandas scikit-learn scikit-learn-intelex xgboost

In [2]:
# Importing required packages
#
# Intel(R) Extension for Scikit-learn has to patch scikit-learn BEFORE the
# estimators are imported: patch_sklearn() replaces the classes exposed by the
# sklearn modules, so a name imported earlier stays bound to the stock
# (unaccelerated) class and the patch would silently have no effect on it.
from sklearnex import patch_sklearn
patch_sklearn()

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

KeyboardInterrupt: 

In [None]:
# Read the raw training set. The recorded output of this cell shows that the
# file ends with rows whose fields are all empty (NaN).
data = pd.read_csv(filepath_or_buffer="dataset/train.csv")
# Bare name as the last expression -> rendered as the cell's output.
data

Unnamed: 0,row_id,date,state,store,product,num_sold
0,0.0,01-01-2015,Kerala,ExcelMart,Mec Mug,329.0
1,1.0,01-01-2015,Kerala,ExcelMart,Mec Hat,520.0
2,2.0,01-01-2015,Kerala,ExcelMart,Mec Sticker,146.0
3,3.0,01-01-2015,Kerala,MecStore,Mec Mug,572.0
4,4.0,01-01-2015,Kerala,MecStore,Mec Hat,911.0
...,...,...,...,...,...,...
24585,,,,,,
24586,,,,,,
24587,,,,,,
24588,,,,,,


In [None]:
# Remove every row that contains at least one missing value (this discards the
# empty rows at the end of the CSV). Rebinding the name instead of using
# `inplace=True` avoids mutating the frame object that was loaded above.
data = data.dropna()

In [None]:
# Target: the `num_sold` column. Features: every remaining column of `data`.
Y = data['num_sold']
X = data.drop(columns='num_sold')
# Hold-out split is currently disabled; the model is fit on all rows.
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [None]:
# Columns that are one-hot encoded; any other column in the input frame is
# dropped by the ColumnTransformer (its default `remainder` is 'drop').
categorical_features = ['date', 'state', 'store', 'product']

# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros instead of raising.
# NOTE(review): 'date' is treated as an opaque category, so dates absent from
# the training data carry no signal at predict time - confirm this is intended.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
])


In [None]:
# Base learners of the stack: a random forest and an XGBoost regressor, both
# seeded with random_state=0.
base_models = [
    ('rf', RandomForestRegressor(random_state=0)),
    ('xgb', xgb.XGBRegressor(random_state=0)),
]

# A second random forest serves as the final estimator on top of the base
# learners.
stacked_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=RandomForestRegressor(random_state=0),
)

# Full model: one-hot preprocessing followed by the stacked regressor.
boosted_regressor = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacked_regressor', stacked_regressor),
])

In [None]:
# Fit the whole pipeline (preprocessing + stack) on the full feature/target
# set; no rows are held out.
boosted_regressor.fit(X, y=Y)
# Y_pred = boosted_regressor.predict(X_test)

In [None]:
import pickle

# Persist the fitted pipeline next to the notebook.
# NOTE: only unpickle this file from a trusted source - pickle.load can run
# arbitrary code.
with open('final_model.pkl', mode='wb') as file:
    pickle.dump(boosted_regressor, file)

In [None]:
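# Hold-out evaluation - disabled: it needs Y_test / Y_pred, which only exist
# when the commented-out train/test split and prediction lines are re-enabled.
# mse = mean_squared_error(Y_test, Y_pred)
# print("Mean Squared Error:", mse)

# r2 = r2_score(Y_test, Y_pred)
# print("R2 Score:", r2)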

In [None]:
# Score the test file with the fitted pipeline and write the predictions.
test = pd.read_csv('dataset/test.csv')

# The model returns floats; round to the nearest whole number and cast to int.
prediction = np.round(boosted_regressor.predict(test)).astype(int)

# Two-column output: the test file's row_id and the predicted num_sold.
prediction_df = pd.DataFrame({
    'row_id': test['row_id'],
    'num_sold': prediction,
})
prediction_df.to_csv('prediction.csv', index=False)