In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn import set_config
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import pickle

In [15]:
set_config(display='diagram')

In [3]:
df = pd.read_csv('../data/raw/SampleSuperstore.csv')

In [4]:
df

Unnamed: 0,Ship Mode,Segment,Country,City,State,Postal Code,Region,Category,Sub-Category,Sales,Quantity,Discount,Profit
0,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Bookcases,261.9600,2,0.00,41.9136
1,Second Class,Consumer,United States,Henderson,Kentucky,42420,South,Furniture,Chairs,731.9400,3,0.00,219.5820
2,Second Class,Corporate,United States,Los Angeles,California,90036,West,Office Supplies,Labels,14.6200,2,0.00,6.8714
3,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Furniture,Tables,957.5775,5,0.45,-383.0310
4,Standard Class,Consumer,United States,Fort Lauderdale,Florida,33311,South,Office Supplies,Storage,22.3680,2,0.20,2.5164
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,Second Class,Consumer,United States,Miami,Florida,33180,South,Furniture,Furnishings,25.2480,3,0.20,4.1028
9990,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Furniture,Furnishings,91.9600,2,0.00,15.6332
9991,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Technology,Phones,258.5760,2,0.20,19.3932
9992,Standard Class,Consumer,United States,Costa Mesa,California,92627,West,Office Supplies,Paper,29.6000,4,0.00,13.3200


In [5]:
df['Postal Code'] = df['Postal Code'].astype('object')

In [6]:
numeric_data = df.loc[:,'Ship Mode':'Discount'].select_dtypes(include=['int64', 'float64']).columns
categoric_data = df.loc[:,'Ship Mode':'Discount'].select_dtypes(include=['object']).columns

In [8]:
x = df.drop(['Profit'], axis=1)
y = df['Profit']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

print('Amostras de treino:')
print(f' * X_train: {x_train.shape}')
print(f' * y_train: {y_train.shape}')

print('Amostras de teste:')
print(f' * X_test: {x_test.shape}')
print(f' * y_test: {y_test.shape}')

Amostras de treino:
 * X_train: (6995, 12)
 * y_train: (6995,)
Amostras de teste:
 * X_test: (2999, 12)
 * y_test: (2999,)


In [10]:
transformers=[('cat_scale', OneHotEncoder(handle_unknown='ignore'), categoric_data),
             ('num_scale', MinMaxScaler(), numeric_data)]

preprocessor = ColumnTransformer(transformers=transformers)

In [16]:
rf_steps = [('preprocessor', preprocessor),
        ('rf_model', RandomForestRegressor(random_state=42))]

rf_model = Pipeline(steps=rf_steps, verbose=1)
rf_model

In [17]:
rf_model.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 2) Processing preprocessor, total=   0.0s
[Pipeline] .......... (step 2 of 2) Processing rf_model, total=  22.7s


In [18]:
y_pred = rf_model.predict(x_test)

In [21]:
print('Mean Absolute Error (MAE):', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Mean Absolute Percentage Error (MAPE):', mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error (MAE): 25.893921115371793
Mean Squared Error (MSE): 41168.526416329165
Root Mean Squared Error (RMSE): 202.9002868808449
Mean Absolute Percentage Error (MAPE): 1126639767620087.8


In [23]:
file_name = "../models/model_v0.pkl"
pickle.dump(rf_model, open(file_name, "wb"))