### CÓDIGO SQL

In [None]:
"""
SELECT
    o.order_id,
    (date_diff('second', o.order_purchase_timestamp, o.order_approved_at) / 86400.0) AS TimeToApprove,
    (date_diff('second', o.order_approved_at, o.order_delivered_carrier_date) / 86400.0) AS ApprovedToCarrier,
    (date_diff('second', o.order_delivered_carrier_date, o.order_delivered_customer_date) / 86400.0) AS CarrierToCustomer,
    o.delivery_time
    
FROM
    orders o
"""


### MATCHES

In [None]:
"""
import pandas as pd

DATA = r"C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\dadosLimitados.csv"
df = pd.read_csv(DATA)

df_test = pd.read_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\treinamento-ciencia-de-dados-24-1\test.csv')

# Step 1: Match on 'order_id'
matches_order_id = pd.merge(df, df_test, on='order_id', how='inner')

# Step 2: Match on 'customer_id'
matches_customer_id = pd.merge(df, df_test, on='customer_id', how='inner')

# Step 3: Match 'customer_id' from df_test with 'customer_unique_id' from df
matches_customer_unique = pd.merge(df, df_test, left_on='customer_unique_id', right_on='customer_id', how='inner')

# Display the matching rows for each case
print("Matches on order_id:")
print(matches_order_id)
print("\nMatches on customer_id:")
print(matches_customer_id)
print("\nMatches on customer_id from df_test with customer_unique_id from df:")
print(matches_customer_unique)
"""

### LIMPEZA

In [1]:
import pandas as pd

# Raw training extract. Despite the name, DATA_URL holds a local Windows path,
# not a URL.
# NOTE(review): the path is absolute and user-specific, so this cell only runs
# on a machine with the same folder layout.
DATA_URL = r"C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 1\TREINO.csv"
# Load the whole CSV; the cleaning loop further down rebinds `df` to the
# filtered frame, so the raw data is not kept under a separate name.
df = pd.read_csv(DATA_URL)

def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previous
        hard-coded behaviour; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows inside the fences, with their original index labels.
        Rows where ``column`` is NaN are dropped, because a NaN never
        satisfies the range comparison.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = factor * (q3 - q1)
    # between() is inclusive on both ends by default and is False for NaN,
    # which matches the explicit (>= lower) & (<= upper) mask.
    inside = values.between(q1 - spread, q3 + spread)
    return df[inside]

# Columns trimmed with the IQR rule, applied in this order. Each pass works on
# the rows that survived the previous one, so the quantiles are recomputed on
# progressively smaller data and the order of the list can affect the result.
columns_to_clean = [
    'delivery_time',
    'TimeToApprove',
    'ApprovedToCarrier',
    'CarrierToCustomer',
]
for column in columns_to_clean:
    df = remove_outliers(df, column)

# Write the filtered rows next to the raw file, without the index column.
df.to_csv(
    r"C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 1\TREINO_limpo.csv",
    index=False,
)

### MODELO

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Column roles for the regression: three stage durations in, delivery time out.
# NOTE(review): if delivery_time is just the sum of the three stages, the fit
# reduces to an identity and carries little predictive information - confirm.
features = ['TimeToApprove', 'ApprovedToCarrier', 'CarrierToCustomer']
target = 'delivery_time'

# Outlier-filtered training set written by the cleaning step.
train_data = pd.read_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\Modelo 1\TREINO_limpo.csv')

X_train, y_train = train_data[features], train_data[target]

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Load the testing data
test_data = pd.read_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\Data\test.csv')

# Parse the two timestamps this cell reads from the test file
test_data['order_purchase_timestamp'] = pd.to_datetime(test_data['order_purchase_timestamp'])
test_data['order_approved_at'] = pd.to_datetime(test_data['order_approved_at'])

# TimeToApprove in days: purchase -> approval, seconds divided by 86400
approval_delay = test_data['order_approved_at'] - test_data['order_purchase_timestamp']
test_data['TimeToApprove'] = approval_delay.dt.total_seconds() / (60 * 60 * 24)

# An order with a missing timestamp yields NaN here, and LinearRegression.predict
# raises on NaN input. Fall back to the training mean, the same strategy used
# for the two features below, so every order still gets a prediction.
test_data['TimeToApprove'] = test_data['TimeToApprove'].fillna(train_data['TimeToApprove'].mean())

# ApprovedToCarrier and CarrierToCustomer are not computed from the test file
# here, so every test row receives the training-set average for both.
avg_approved_to_carrier = train_data['ApprovedToCarrier'].mean()
avg_carrier_to_customer = train_data['CarrierToCustomer'].mean()
test_data['ApprovedToCarrier'] = avg_approved_to_carrier
test_data['CarrierToCustomer'] = avg_carrier_to_customer

# Reuse the training feature list so the column set and order cannot drift
# from what the model was fitted on.
X_test = test_data[features]

# Predict delivery times for the test data
test_data['predicted_delivery_time'] = model.predict(X_test)

# Keep only the identifier and the prediction in the output file
output_data = test_data[['order_id', 'predicted_delivery_time']]

# Save the selected data to a CSV file, without the index column
output_data.to_csv(r'C:\Users\gabri\Documents\PROJETOS\PY\PJ_Code\DE\predictions.csv', index=False)