In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from helpermodules.memory_handling import PickleHelper

In [None]:
# Load the pickled wrapper and unwrap the stored object through its `.obj` attribute.
# NOTE(review): `filename` is not defined in any visible cell — it must be set before this runs.
# NOTE(review): presumably `pickle_load` unpickles the file; only load files from a trusted source.
merged_df = PickleHelper.pickle_load(filename).obj

In [None]:
# Write the merged frame to CSV (without the index) before any cleaning is applied,
# then print the column / dtype / non-null summary.
merged_df.to_csv('olist_merged_data.csv', index=False)
merged_df.info()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal)
merged_df.duplicated().sum()

In [None]:

# Share of missing values per column, expressed as a percentage of all rows
merged_df.isnull().sum().div(len(merged_df)).mul(100)

In [None]:

# Keep only the columns that have non-missing values in at least 50% of the rows
# (`thresh` is the minimum number of non-NA values a column needs to survive)
merged_df = merged_df.dropna(thresh=len(merged_df) * 0.5, axis=1)

# Drop every row that still contains at least one missing value
merged_df = merged_df.dropna()

# Print the column / dtype / non-null summary to confirm no missing values remain
merged_df.info()


In [None]:

# Clean and preprocess data
def preprocess_data(df):
    """Clean the merged order data and derive the modelling features.

    Steps, in order:

    1. Drop columns that have non-missing values in fewer than 50% of rows.
    2. Parse the timestamp columns to datetimes (unparseable values -> NaT).
    3. Derive delivery-time, product-volume, satisfaction, order-value and
       late-delivery features.
    4. Drop every row that still contains a missing value.
    5. Add month / day-of-week / hour of the purchase timestamp.

    The input frame is never modified; all work happens on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged order data containing the timestamp, product-dimension,
        ``review_score``, ``price`` and ``freight_value`` columns used below.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with the derived feature columns appended.

    Raises
    ------
    KeyError
        If a required column is absent (including one removed by step 1).
    """
    datetime_cols = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date',
                     'order_delivered_customer_date', 'order_estimated_delivery_date',
                     'shipping_limit_date', 'review_creation_date', 'review_answer_timestamp']

    # Drop columns with more than 50% missing values. Working on an explicit
    # copy keeps the caller's frame untouched by the assignments below.
    cleaned = df.dropna(thresh=len(df) * 0.5, axis=1).copy()

    # Convert datetime columns; values that cannot be parsed become NaT and
    # are removed by the row-wise dropna further down.
    for col in datetime_cols:
        cleaned[col] = pd.to_datetime(cleaned[col], errors='coerce')

    delivered = cleaned['order_delivered_customer_date']
    approved = cleaned['order_approved_at']
    purchased = cleaned['order_purchase_timestamp']
    estimated = cleaned['order_estimated_delivery_date']

    # Calculate new features. `.dt.days` keeps only the whole-day component of
    # each timedelta (floored, so -5d 8h becomes -6).
    cleaned['time_to_delivery'] = (delivered - approved).dt.days
    cleaned['order_processing_time'] = (approved - purchased).dt.days
    cleaned['estimated_vs_actual_shipping'] = (estimated - delivered).dt.days
    cleaned['product_volume_m3'] = (
        cleaned['product_length_cm'] * cleaned['product_width_cm'] * cleaned['product_height_cm']
    ) / 1000000
    # Binary target: 1 when the review score is 4 or higher, else 0
    cleaned['satisfaction'] = (cleaned['review_score'] >= 4).astype(int)
    cleaned['order_value'] = cleaned['price'] + cleaned['freight_value']

    # Late delivery flag: 1 when the order arrived after the estimated date
    cleaned['late_delivery'] = (delivered > estimated).astype(int)

    # Drop rows with missing values (returns a new frame)
    cleaned = cleaned.dropna()

    # Seasonal features from order_purchase_timestamp; computed after the
    # dropna so they never contain NaN and keep an integer dtype.
    purchase_ts = cleaned['order_purchase_timestamp'].dt
    cleaned = cleaned.assign(
        order_month=purchase_ts.month,
        order_day=purchase_ts.dayofweek,
        order_hour=purchase_ts.hour,
    )

    return cleaned


In [None]:

# Run the cleaning / feature-engineering pipeline and rebind the name to the returned frame
merged_df = preprocess_data(merged_df)

In [None]:

# Columns removed before saving the cleaned dataset
columns_to_remove = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'review_score',
    'seller_zip_code_prefix',
]
merged_df.drop(columns=columns_to_remove, inplace=True)


In [None]:
# Save the cleaned dataset to CSV without writing the index as a column
merged_df.to_csv('olist_merged_data_clean.csv', index=False)

In [None]:
# Print the column / dtype / non-null summary of the cleaned frame
merged_df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the cleaned frame
merged_df.describe()

In [None]:
# Percentage of rows in each satisfaction class (1 = review score of 4 or more, 0 = otherwise)
merged_df['satisfaction'].value_counts().div(len(merged_df)).mul(100)

### Correlation 

### Correlation for numerical values (Pearson)

In [None]:
# Calculate the Pearson correlation matrix.
# numeric_only=True is required on pandas >= 2.0, where non-numeric columns
# are no longer dropped silently and would raise instead.
# NOTE(review): `df` is not assigned in any earlier visible cell — bind it to
# the frame to analyse before running this cell.
correlation_matrix = df.corr(method='pearson', numeric_only=True)

# Plot the heatmap on an explicit Axes so the title is attached to the right plot
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix (Pearson)')

# Show the plot
plt.show()

### Correlation for categorical values (Cramér's V)

In [None]:
# Function to calculate Cramér's V
def cramers_v(x, y):
    """Cramér's V association between two categorical variables.

    V = sqrt(chi2 / (n * min(r - 1, k - 1))), where chi2 is the Pearson
    chi-square statistic of the r x k contingency table and n is the number
    of observations. The result lies in [0, 1].

    Parameters
    ----------
    x, y : array-like
        Category labels of equal length, as accepted by ``pd.crosstab``.

    Returns
    -------
    float
        Cramér's V, or 0.0 when either variable has fewer than two observed
        categories (association is undefined; the formula would divide by zero).
    """
    # Create a contingency table
    contingency_table = pd.crosstab(x, y)

    n = contingency_table.sum().sum()
    r, k = contingency_table.shape
    min_dim = min(r - 1, k - 1)
    if min_dim < 1:
        # A constant (or empty) variable cannot be associated with anything.
        return 0.0

    # Yates' continuity correction (the scipy default for 2x2 tables) shrinks
    # chi2, so a binary variable compared with itself would score below 1.
    # Cramér's V is defined on the uncorrected statistic.
    chi2, _, _, _ = chi2_contingency(contingency_table, correction=False)

    return np.sqrt(chi2 / (n * min_dim))

df = pd.DataFrame(data)

# Placeholder: replace with the names of the categorical columns to compare
categorical_columns = ['list of categorical column']

# Square matrix of pairwise Cramér's V values, one row and one column per variable
n = len(categorical_columns)
pairwise_values = [
    [cramers_v(df[row_name], df[col_name]) for col_name in categorical_columns]
    for row_name in categorical_columns
]
cramers_v_matrix = pd.DataFrame(
    pairwise_values,
    index=categorical_columns,
    columns=categorical_columns,
    dtype=float,
)

# Render the matrix as an annotated heatmap on a 0..1 colour scale
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cramers_v_matrix, annot=True, cmap='coolwarm', vmin=0, vmax=1, linewidths=0.5)
heatmap_ax.set_title("Cramér's V Correlation Matrix")
plt.show()

### Min-Max Data Normalization

In [None]:
def data_scaler(df, columns_to_scale):
    """Min-max normalise the selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns_to_scale : list of str
        Names of the numeric columns to scale.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``columns_to_scale``, each rescaled by
        ``MinMaxScaler`` (default range 0..1), with the same index as ``df``.
    """
    columns_to_scale = list(columns_to_scale)

    # Initialize the MinMaxScaler
    scaler = MinMaxScaler()

    # Fit on the selected columns only: fitting on the whole frame produced an
    # array whose width did not match `columns_to_scale`.
    scaled_values = scaler.fit_transform(df[columns_to_scale])

    # Keep the original index so the result aligns with `df` when joined back
    df_scaled = pd.DataFrame(scaled_values, columns=columns_to_scale, index=df.index)

    return df_scaled

In [None]:
# Placeholder: replace with the names of the columns to normalise
columns_to_scale = ['list of columns']

# Normalise the chosen columns
df_scaled = data_scaler(df, columns_to_scale)

# Show the data before and after normalisation
for heading, frame in (("Original Data:\n", df), ("\nNormalized Data:\n", df_scaled)):
    print(heading, frame)

## Initial Screening

### Variance test (numerical values)

In [None]:
# Fit a VarianceThreshold selector that removes features with variance at or below 0.062
selector = VarianceThreshold(threshold=0.062).fit(df)
selected_features = selector.transform(df)

# Boolean mask of retained features -> their column names
support_mask = selector.get_support()
selected_columns = df.columns[support_mask]

# Rebuild a DataFrame from the retained feature values
df_selected = pd.DataFrame(data=selected_features, columns=selected_columns)

# Report the retained features
print("Selected Features Based on Variance Threshold (0.062):", df_selected, sep="\n")

### Chi-square test (categorical values)

In [None]:
# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Define the target variable
target = 'class'

# Per-feature results, filled in a single pass so that no test is run twice
chi2_values = []
p_values = []
features = []

# Iterate over categorical columns (excluding the target)
for column in df.columns:
    if column == target:
        continue

    # Create a contingency table of feature categories vs target classes
    contingency_table = pd.crosstab(df[column], df[target])

    # Perform the Chi-Square test of independence
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

    # Store the statistic, p-value and feature name
    chi2_values.append(chi2)
    p_values.append(p)
    features.append(column)

    # Display the results for each categorical variable
    print(f"\nChi-Square Test Results for '{column}':")
    print(f"Contingency Table:\n{contingency_table}")
    print(f"Chi-Square Statistic: {chi2}")
    print(f"P-Value: {p}")
    print(f"Degrees of Freedom: {dof}")
    print("Expected Frequencies:\n", expected)

# Plotting the Chi-Square values
plt.figure(figsize=(8, 5))
plt.bar(features, chi2_values, color='skyblue')
plt.title('Chi-Square Values for Categorical Features')
plt.xlabel('Features')
plt.ylabel('Chi-Square Value')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

# Interpretation: reuse the p-values collected above instead of rebuilding
# every contingency table and re-running every test.
alpha = 0.05  # Significance level
for feature_name, p_value in zip(features, p_values):
    if p_value < alpha:
        print(f"\nReject the null hypothesis for '{feature_name}': There is a significant association.")
    else:
        print(f"\nFail to reject the null hypothesis for '{feature_name}': There is no significant association.")

## Secondary Screening

### One-Hot Encoder

In [None]:
# Display the original DataFrame
print("Original DataFrame:")
print(df)

# Categorical columns to encode
categorical_features = ['incident_type', 'collision_type']

# Initialize OneHotEncoder; drop='first' avoids multicollinearity.
# The `sparse=False` argument was removed in scikit-learn 1.4 (renamed to
# `sparse_output` in 1.2), so the default sparse result is densified
# explicitly below, which works on every scikit-learn version.
encoder = OneHotEncoder(drop='first')

# Fit and transform the categorical variables
encoded_features = encoder.fit_transform(df[categorical_features]).toarray()

# Create a DataFrame with the encoded features. Reusing df's index keeps the
# rows aligned in the concat below even when df does not have a 0..n-1 index
# (for example after rows were dropped).
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=df.index,
)

# Concatenate the remaining original columns with the one-hot encoded DataFrame
df_encoded = pd.concat([df.drop(categorical_features, axis=1), encoded_df], axis=1)

# Display the DataFrame after one-hot encoding
print("\nDataFrame after One-Hot Encoding:")
print(df_encoded)

### Fisher Scores

In [None]:
def fisher_score(df, feature, target):
    """Two-class Fisher score of ``feature`` with respect to ``target``.

    score = (mean_a - mean_b)**2 / (var_a + var_b)

    where the means and population variances (ddof=0) are taken over the rows
    of each of the two target classes. Rows with a missing target are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both columns.
    feature : str
        Name of the numeric feature column to score.
    target : str
        Name of the binary class column.

    Returns
    -------
    float
        The Fisher score. When both class variances are zero the result is
        0.0 if the class means are equal and ``inf`` otherwise.

    Raises
    ------
    ValueError
        If ``target`` does not have exactly two distinct non-missing values;
        this formula is only defined for a binary target.
    """
    labels = df[target].dropna().unique()
    if len(labels) != 2:
        raise ValueError(
            f"fisher_score needs exactly two classes in '{target}', found {len(labels)}"
        )

    group_a, group_b = (df.loc[df[target] == label, feature] for label in labels)

    between = (group_a.mean() - group_b.mean()) ** 2
    within = group_a.var(ddof=0) + group_b.var(ddof=0)  # Population variance

    if within == 0:
        # No spread inside either class: avoid 0/0 and x/0.
        return 0.0 if between == 0 else float('inf')
    return between / within

# Score every column except the last one, which is taken to be the target
fisher_scores = {
    feature: fisher_score(df, feature, 'class')
    for feature in df.columns[:-1]
}

# Tabulate the scores, one row per feature
fisher_scores_df = pd.DataFrame(
    {
        'Feature': list(fisher_scores.keys()),
        'Fisher Score': list(fisher_scores.values()),
    }
)

# Report the scores
print("Fisher Scores for Customer Features:")
print(fisher_scores_df)