In [12]:
# %pip install pandas scikit-learn scikit-learn-intelex
# %pip install xgboost

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import pandas as pd
import xgboost as xgb
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [14]:
data = pd.read_csv("dataset/train.csv")
data

Unnamed: 0,row_id,date,state,store,product,num_sold
0,0.0,01-01-2015,Kerala,ExcelMart,Mec Mug,329.0
1,1.0,01-01-2015,Kerala,ExcelMart,Mec Hat,520.0
2,2.0,01-01-2015,Kerala,ExcelMart,Mec Sticker,146.0
3,3.0,01-01-2015,Kerala,MecStore,Mec Mug,572.0
4,4.0,01-01-2015,Kerala,MecStore,Mec Hat,911.0
...,...,...,...,...,...,...
24585,,,,,,
24586,,,,,,
24587,,,,,,
24588,,,,,,


In [15]:
data.dropna(inplace=True)

In [16]:
X = data.drop('num_sold', axis=1)
Y = data['num_sold']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [17]:
categorical_features = ['state', 'store', 'product']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])


In [18]:
base_models = [
    ('rf', RandomForestRegressor(random_state=0)),
    ('xgb', xgb.XGBRegressor(random_state=0))
]

# Define the stacking regressor with the base models
stacked_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=RandomForestRegressor(random_state=0)  # Meta-model
)

# Create a pipeline
boosted_regressor = Pipeline([
    ('preprocessor', preprocessor),
    ('stacked_regressor', stacked_regressor)
])

In [19]:
# rf_regressor = RandomForestRegressor(random_state=0)

# # Define the XGBoost model
# xgb_regressor = xgb.XGBRegressor(random_state=0)

# # Combine the RandomForestRegressor and XGBoost models in a VotingRegressor
# boosted_regressor = Pipeline(steps=[('preprocessor', preprocessor),
#                                     ('ensemble', VotingRegressor([('rf', rf_regressor), ('xgb', xgb_regressor)]))])


In [20]:
boosted_regressor.fit(X_train, Y_train)
Y_pred = boosted_regressor.predict(X_test)

In [21]:
import pickle

# Save the trained model to a file
with open('model3.pkl', 'wb') as file:
    pickle.dump(boosted_regressor, file)

In [22]:
mse = mean_squared_error(Y_test, Y_pred)
print("Mean Squared Error:", mse)

r2 = r2_score(Y_test, Y_pred)
print("R2 Score:", r2)

Mean Squared Error: 13559.944094361248
R2 Score: 0.8072963378962021
