In [197]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Load the datasets
test_data_path = r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\teste2_FULL.csv'
train_data_path = r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\dados_completos - Copy.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [198]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [199]:
# Remove outliers
columns_to_clean = ['delivery_time', 'TimeToApprove', 'ApprovedToCarrier', 'Days_Delivery_CarrierToCustomer', 'distance']
for column in columns_to_clean:
    train_data = remove_outliers(train_data, column)

In [200]:
# Drop duplicates and missing values
train_data = train_data.drop_duplicates('order_id', keep='first').dropna(subset=columns_to_clean)


In [201]:
# Feature selection
categorical_features = ['product_category_name', 'seller_state']
numerical_features = ['TimeToApprove', 'ApprovedToCarrier', 'distance']
unwanted_columns = ['order_id', 'customer_id', 'order_purchase_timestamp', 'order_approved_at']

In [202]:
# Prepare the test data
filtered_test_data = test_data.drop(unwanted_columns, axis=1)
features_for_prediction = numerical_features + categorical_features
filtered_test_data_prepared = filtered_test_data[features_for_prediction]

In [203]:
# Preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

In [204]:
# Splitting the data
X = train_data[numerical_features + categorical_features]
y = train_data['delivery_time']

In [205]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [206]:
# Create the pipeline with XGBoost
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', xgb.XGBRegressor(use_label_encoder=False,
                                                     eval_metric='rmse'))])

In [207]:
# Train the model
pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [208]:
# Predict the delivery time for the test data
test_data['predicted_delivery_time'] = pipeline.predict(filtered_test_data_prepared)

In [209]:
# Save the predictions
test_data = test_data[['order_id', 'predicted_delivery_time']]
test_data.to_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 2\predictions4.csv', index=False)