In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [46]:
def find_missing_percent(data):
    """
    Summarise the missing values of every column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (same order, default integer index)
        with the columns ``ColumnName``, ``TotalMissingVals`` (number of null
        entries) and ``PercentMissing`` (null entries as a percentage of the
        row count, rounded to 2 decimals). When ``data`` has no rows the
        percentage is reported as 0.0 rather than NaN.
    """
    n_rows = data.shape[0]
    # Count nulls for all columns in one vectorised pass instead of appending
    # to the result frame row by row.
    total_missing = data.isnull().sum()
    if n_rows:
        percent_missing = (total_missing / n_rows * 100).round(2)
    else:
        # Zero rows: skip the 0/0 division and report 0 % missing.
        percent_missing = total_missing.astype(float)
    return pd.DataFrame({
        'ColumnName': list(data.columns),
        'TotalMissingVals': total_missing.to_numpy(),
        'PercentMissing': percent_missing.to_numpy(),
    })

In [47]:
def calculate_outlier_percentage(series, threshold=1.5):
    """
    Percentage of values in ``series`` flagged as outliers.

    A value is flagged when its distance from the series median, divided by
    the series standard deviation, is greater than ``threshold``. Note that
    the score is centred on the median, not the mean.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to check. Null entries are never flagged but still
        count in the denominator (``len(series)``).
    threshold : float, default 1.5
        Minimum score above which a value is counted as an outlier.

    Returns
    -------
    numpy.float64
        Share of flagged values, in percent. 0.0 is returned when the series
        is empty, has no non-null values, or has a zero/undefined standard
        deviation, so no score can be computed.
    """
    total = len(series)
    if total == 0:
        # Nothing to score; also avoids a 0/0 division below.
        return np.float64(0.0)

    valid = series.dropna()
    if valid.empty:
        # All-null column: taking its median would only emit RuntimeWarnings.
        return np.float64(0.0)

    spread = valid.std()
    if not np.isfinite(spread) or spread == 0:
        # Single value or constant column: the score is undefined.
        return np.float64(0.0)

    scores = np.abs((valid - valid.median()) / spread)
    flagged = scores > threshold
    # Wrap in a NumPy float so callers can keep calling ``.round()`` on it.
    return np.float64((flagged.sum() / total) * 100)

In [48]:
def create_multiple_boxplots(data_frame, columns_for_boxplot, titles=None, num_boxplots_per_row=2):
    """
    Draw one Plotly box plot per column on a subplot grid and display it.

    Parameters:
    - data_frame: object indexed by column name to obtain each plot's values.
    - columns_for_boxplot: sequence of column names, one box plot each.
    - titles: optional subplot titles, forwarded to ``make_subplots``.
    - num_boxplots_per_row: number of grid columns (default 2).

    Returns:
    - None (the figure is shown).
    """
    # Ceiling division: enough grid rows to hold every requested plot.
    total_plots = len(columns_for_boxplot)
    grid_rows = (total_plots + num_boxplots_per_row - 1) // num_boxplots_per_row

    figure = make_subplots(rows=grid_rows, cols=num_boxplots_per_row, subplot_titles=titles)

    for position, column_name in enumerate(columns_for_boxplot):
        # Fill the grid left to right, top to bottom; grid cells are 1-based.
        row_offset, col_offset = divmod(position, num_boxplots_per_row)
        figure.add_trace(
            go.Box(y=data_frame[column_name], name=column_name),
            row=row_offset + 1,
            col=col_offset + 1,
        )

    # Fixed height per grid row; legend hidden.
    figure.update_layout(height=300 * grid_rows, showlegend=False)
    figure.show()


In [49]:
def find_correlated_columns(df, interval):
    """
    Find, print and plot the correlations between the columns of a DataFrame.

    Parameters:
    - df: pandas DataFrame.
    - interval: pair (lower, upper); a column pair is reported when the
      absolute value of its correlation lies within these inclusive bounds.

    Returns:
    - List of (column_a, column_b) tuples for the pairs that matched.

    Side effects: prints one line per matching pair and shows a heatmap of
    the full correlation matrix.
    """
    corr_table = df.corr(numeric_only=True)
    names = corr_table.columns
    matched_pairs = []

    # Walk the upper triangle only, so each unordered pair is visited once.
    for row_pos, first in enumerate(names):
        for col_pos in range(row_pos + 1, len(names)):
            value = corr_table.iloc[row_pos, col_pos]
            if not (interval[0] <= abs(value) <= interval[1]):
                continue
            second = names[col_pos]
            matched_pairs.append((first, second))
            print(f"Correlação entre {first} e {second}: {value}")

    # Heatmap of the whole matrix, annotated with the coefficients.
    plt.figure(figsize=(20, 16))
    sns.heatmap(corr_table, annot=True, cmap='cubehelix_r')
    plt.title('Matriz de Correlação')
    plt.xlabel('Variáveis')
    plt.ylabel('Variáveis')
    plt.show()

    return matched_pairs

In [50]:
def correlacao_com_variavel_alvo(df, target_variable, nivel="forte", top_n=5):
    """
    Print the features most strongly correlated with a target variable.

    Parameters:
    - df: pandas DataFrame.
    - target_variable: string, name of the target column.
    - nivel: string selecting the correlation criterion. Only "forte"
      (case-insensitive) is supported: 0.7 <= |correlation| < 1.0.
    - top_n: integer, maximum number of features to print.

    Returns:
    - None (the correlations are printed).

    Raises:
    - ValueError: if ``nivel`` is not a supported level.

    Only features that actually meet the criterion are printed (fewer than
    ``top_n`` lines when fewer qualify), ordered by absolute correlation so
    strong negative correlations rank alongside strong positive ones.
    """
    if nivel.lower() != "forte":
        raise ValueError("Nível não suportado. Atualmente, apenas 'forte' é suportado.")

    # Only the target's column of the correlation matrix is needed.
    target_correlations = df.corr(numeric_only=True)[target_variable]

    # Select by magnitude. The strict upper bound drops the target's own
    # correlation of 1.0; NaN entries fail both comparisons and are dropped.
    magnitude = target_correlations.abs()
    strong = target_correlations[(magnitude >= 0.7) & (magnitude < 1.0)]

    # Strongest first, regardless of sign.
    top_n_correlations = strong.sort_values(
        key=lambda values: values.abs(), ascending=False
    ).head(top_n)

    print(f"As {top_n} maiores correlações com '{target_variable}' ({nivel}):")
    for feature, correlation in top_n_correlations.items():
        print(f"{feature}: {correlation}")


In [51]:
# Load the orders dataset from 'orders.csv' (path relative to the working directory).
df = pd.read_csv('orders.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,order_id,store_id,channel_id,payment_order_id,delivery_order_id,order_status,order_amount,order_delivery_fee,order_delivery_cost,order_created_hour,...,order_moment_delivering,order_moment_delivered,order_moment_finished,order_metric_collected_time,order_metric_paused_time,order_metric_production_time,order_metric_walking_time,order_metric_expediton_speed_time,order_metric_transit_time,order_metric_cycle_time
0,68405119,3512,5,68405119,68405119,CANCELED,62,0,,0,...,,,,,,,,,,
1,68405123,3512,5,68405123,68405123,CANCELED,62,0,,0,...,,,,,,,,,,
2,68405206,3512,5,68405206,68405206,CANCELED,115,0,,0,...,,,,,,,,,,
3,68405465,3401,5,68405465,68405465,CANCELED,55,0,,0,...,,,,,,,,,,
4,68406064,3401,5,68406064,68406064,CANCELED,37,0,,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368994,79982448,1029,21,79982448,79982448,FINISHED,178,0,1.0,19,...,,,,,,1333.0,,478.0,0.0,1812.0
368995,79982515,95,5,79982515,79982515,FINISHED,22,4,5.0,19,...,,,,2.0,1.0,21.0,4.0,5.0,12.0,40.0
368996,79982545,3071,5,79982545,79982545,FINISHED,50,0,5.0,19,...,,,,3.0,14.0,9.0,6.0,20.0,12.0,43.0
368997,79982679,294,5,79982679,79982679,FINISHED,33,0,5.0,19,...,,,,0.0,0.0,4.0,3.0,4.0,11.0,20.0


In [54]:
# Frame dimensions as (rows, columns).
df.shape

(368999, 29)

In [55]:
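# NOTE(review): disabled cell. 'payment_status' does not appear among the columns
# listed in the outputs below for this frame; confirm the intended column before re-enabling.
#df['payment_status'].value_counts()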

In [56]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368999 entries, 0 to 368998
Data columns (total 29 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   order_id                           368999 non-null  int64  
 1   store_id                           368999 non-null  int64  
 2   channel_id                         368999 non-null  int64  
 3   payment_order_id                   368999 non-null  int64  
 4   delivery_order_id                  368999 non-null  int64  
 5   order_status                       368999 non-null  object 
 6   order_amount                       368999 non-null  int64  
 7   order_delivery_fee                 368999 non-null  int64  
 8   order_delivery_cost                361794 non-null  float64
 9   order_created_hour                 368999 non-null  int64  
 10  order_created_minute               368999 non-null  int64  
 11  order_created_day                  3689

In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,order_id,store_id,channel_id,payment_order_id,delivery_order_id,order_amount,order_delivery_fee,order_delivery_cost,order_created_hour,order_created_minute,...,order_moment_delivering,order_moment_delivered,order_moment_finished,order_metric_collected_time,order_metric_paused_time,order_metric_production_time,order_metric_walking_time,order_metric_expediton_speed_time,order_metric_transit_time,order_metric_cycle_time
count,368999.0,368999.0,368999.0,368999.0,368999.0,368999.0,368999.0,361794.0,368999.0,368999.0,...,0.0,0.0,0.0,317507.0,297594.0,343892.0,294943.0,334417.0,343142.0,353380.0
mean,82307260.0,1197.796124,7.837929,82307260.0,82307260.0,104.5811,5.645373,6.952653,16.719612,29.44121,...,,,,2.278769,8.704188,61.316233,4.283868,18.590523,46.361897,156.076368
std,7037735.0,1151.075389,8.298331,7037735.0,7037735.0,2953.113,5.918301,4.010998,6.095596,17.223663,...,,,,13.232635,65.721527,802.99429,17.580886,88.241182,752.578417,1224.987541
min,68405120.0,3.0,1.0,68405120.0,68405120.0,0.0,0.0,0.0,0.0,0.0,...,,,,-1.0,-2728.0,0.0,-1.0,0.0,-4.0,0.0
25%,76355220.0,415.0,5.0,76355220.0,76355220.0,39.0,0.0,5.0,15.0,15.0,...,,,,0.0,1.0,8.0,2.0,4.0,11.0,32.0
50%,83245990.0,707.0,5.0,83245990.0,83245990.0,71.0,6.0,7.0,17.0,29.0,...,,,,1.0,2.0,15.0,3.0,6.0,16.0,42.0
75%,88030550.0,1528.0,5.0,88030550.0,88030550.0,121.0,11.0,8.0,22.0,44.0,...,,,,3.0,5.0,22.0,5.0,10.0,24.0,55.0
max,93139820.0,4679.0,49.0,93139820.0,93139820.0,1788306.0,990.0,156.0,23.0,59.0,...,,,,2050.0,11712.0,85662.0,2735.0,11718.0,154261.0,154277.0


In [58]:
# Number of null values in each column.
df.isnull().sum()

order_id                                  0
store_id                                  0
channel_id                                0
payment_order_id                          0
delivery_order_id                         0
order_status                              0
order_amount                              0
order_delivery_fee                        0
order_delivery_cost                    7205
order_created_hour                        0
order_created_minute                      0
order_created_day                         0
order_created_month                       0
order_created_year                        0
order_moment_created                 368999
order_moment_accepted                368999
order_moment_ready                   368999
order_moment_collected               368999
order_moment_in_expedition           368999
order_moment_delivering              368999
order_moment_delivered               368999
order_moment_finished                368999
order_metric_collected_time     

In [59]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [60]:
# Build the per-column missing-value summary, then show only the columns
# that have at least one missing value.
miss_df = find_missing_percent(df)
columns_with_missing = miss_df[miss_df['PercentMissing'] > 0.0]
display(columns_with_missing)
print("\n")
print(f"Number of columns with missing values:{columns_with_missing.shape[0]}")

Unnamed: 0,ColumnName,TotalMissingVals,PercentMissing
8,order_delivery_cost,7205,1.95
14,order_moment_created,368999,100.0
15,order_moment_accepted,368999,100.0
16,order_moment_ready,368999,100.0
17,order_moment_collected,368999,100.0
18,order_moment_in_expedition,368999,100.0
19,order_moment_delivering,368999,100.0
20,order_moment_delivered,368999,100.0
21,order_moment_finished,368999,100.0
22,order_metric_collected_time,51492,13.95




Number of columns with missing values:16


In [61]:
# Restrict the outlier check to the float/int columns of the frame.
numeric_cols = df.select_dtypes(['float','int']).columns
for feature in numeric_cols:
    # .round(2) is called on the returned value, so calculate_outlier_percentage
    # must return a NumPy float (a plain Python float has no .round method).
    qtd_outliers = calculate_outlier_percentage(df[feature]).round(2)
    # The printed figure is a percentage of the column's rows.
    print(f'A quantidade de outliers em {feature} é: {qtd_outliers} %')

A quantidade de outliers em order_id é: 12.6 %
A quantidade de outliers em store_id é: 17.05 %
A quantidade de outliers em channel_id é: 7.67 %
A quantidade de outliers em payment_order_id é: 12.6 %
A quantidade de outliers em delivery_order_id é: 12.6 %
A quantidade de outliers em order_amount é: 0.02 %
A quantidade de outliers em order_delivery_fee é: 0.73 %
A quantidade de outliers em order_delivery_cost é: 7.77 %
A quantidade de outliers em order_created_hour é: 9.3 %
A quantidade de outliers em order_created_minute é: 14.63 %
A quantidade de outliers em order_created_day é: 11.85 %
A quantidade de outliers em order_created_month é: 20.33 %
A quantidade de outliers em order_created_year é: 0.0 %
A quantidade de outliers em order_moment_created é: 0.0 %
A quantidade de outliers em order_moment_accepted é: 0.0 %
A quantidade de outliers em order_moment_ready é: 0.0 %
A quantidade de outliers em order_moment_collected é: 0.0 %
A quantidade de outliers em order_moment_in_expedition é: 

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


A quantidade de outliers em order_moment_delivering é: 0.0 %
A quantidade de outliers em order_moment_delivered é: 0.0 %
A quantidade de outliers em order_moment_finished é: 0.0 %
A quantidade de outliers em order_metric_collected_time é: 0.47 %
A quantidade de outliers em order_metric_paused_time é: 0.82 %
A quantidade de outliers em order_metric_production_time é: 0.75 %
A quantidade de outliers em order_metric_walking_time é: 0.46 %
A quantidade de outliers em order_metric_expediton_speed_time é: 2.1 %
A quantidade de outliers em order_metric_transit_time é: 0.48 %
A quantidade de outliers em order_metric_cycle_time é: 1.16 %
