In [106]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

# Load the datasets
# Raw strings keep the Windows backslashes in the paths literal.
train_data_path = r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\dados_completos - Copy.csv'
test_data_path = r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\teste2_FULL.csv'

# Training frame first, then the frame that will be scored later on.
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [107]:
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    factor : float, default 1.5
        Multiplier applied to the IQR; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` inside the fences. Rows whose ``column`` value is
        NaN are dropped too, because comparisons against NaN evaluate to False.

    Raises
    ------
    ValueError
        If ``factor`` is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR
    return df[(values >= lower_bound) & (values <= upper_bound)]

In [108]:
# Remove outliers
# The columns are filtered one after another, so each column's IQR fences are
# computed on the rows that survived the previous columns.
columns_to_clean = [
    'delivery_time',
    'TimeToApprove',
    'ApprovedToCarrier',
    'Days_Delivery_CarrierToCustomer',
    'distance',
]
for column in columns_to_clean:
    train_data = remove_outliers(train_data, column)

In [109]:
# Drop duplicates and missing values
# Keep only the first row seen for each order_id ...
train_data = train_data.drop_duplicates(subset='order_id', keep='first')
# ... then discard rows missing any of the cleaned columns.
train_data = train_data.dropna(subset=columns_to_clean)

In [110]:
# Feature selection
# Columns that get one-hot encoded.
categorical_features = [
    'product_category_name',
    'seller_state',
]
# Columns that get standardised.
numerical_features = [
    'TimeToApprove',
    'ApprovedToCarrier',
    'Days_Delivery_CarrierToCustomer',
    'distance',
]
# Columns removed from the test frame before the feature columns are selected.
unwanted_columns = [
    'order_id',
    'customer_id',
    'order_purchase_timestamp',
    'order_approved_at',
]

In [111]:
# Prepare the test data
# Same column order as the training matrix: numerical features, then categorical.
features_for_prediction = numerical_features + categorical_features
filtered_test_data = test_data.drop(columns=unwanted_columns)
filtered_test_data_prepared = filtered_test_data[features_for_prediction]

In [112]:
# Preprocessing steps
# Numerical columns are standardised; categorical columns are one-hot encoded,
# and categories not seen during fit are ignored at transform time.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [113]:
# Splitting the data
# Target: the delivery time column; features: numerical columns followed by categorical ones.
y = train_data['delivery_time']
X = train_data[[*numerical_features, *categorical_features]]


In [114]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [115]:
# Define the model pipeline
# The forest is seeded with the same value used for the train/test split, so the
# grid-search ranking and the final predictions are repeatable across kernel restarts.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', RandomForestRegressor(random_state=42))])

In [116]:
# Define the grid of hyperparameters to search over
# The 'model__' prefix routes each setting to the pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

In [None]:
# Perform grid search
# 5-fold cross-validation over every combination in param_grid, scored by
# negated mean squared error and run on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Get the best parameters
# Hyperparameter combination selected by the fitted search.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

In [None]:
# Evaluate the best model
# best_estimator_ is the pipeline refit on all of X_train with the winning hyperparameters.
best_model = grid_search.best_estimator_

# Score it on the hold-out split, which the grid search never saw.
holdout_predictions = best_model.predict(X_test)
holdout_rmse = ((y_test - holdout_predictions) ** 2).mean() ** 0.5
holdout_r2 = best_model.score(X_test, y_test)  # R^2 for a regressor pipeline
print(f"Hold-out RMSE: {holdout_rmse:.3f}")
print(f"Hold-out R^2: {holdout_r2:.3f}")

In [None]:
# Predict the delivery time for the test data
# Score the prepared feature columns, then attach the result to the loaded test frame.
predicted_delivery_time = best_model.predict(filtered_test_data_prepared)
test_data['predicted_delivery_time'] = predicted_delivery_time

In [None]:
# Save the predictions
# Write only the id and prediction columns. The selection goes into its own
# variable so test_data keeps all of its columns and the cells that read it
# can be re-run without reloading the CSV.
predictions_to_save = test_data[['order_id', 'predicted_delivery_time']]
predictions_to_save.to_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\predictions3.csv', index=False)