In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import PercentFormatter

from helpermodules.memory_handling import PickleHelper

In [None]:
# Load the merged dataset that every later cell reads as `merged_df`.
# NOTE(review): `filename` is not defined in any visible cell -- it must be set
# before this cell runs, otherwise a fresh Restart & Run All fails here.
# NOTE(review): presumably PickleHelper wraps `pickle`; only load trusted files -- confirm.
merged_df = PickleHelper.pickle_load(filename).obj
            

# Univariate analysis
- Plotting sales
- Sales based on state
- Payment type
- CSAT percentages
- Average time delivery
- CSAT and delivery time

### Plotting sales

In [None]:
# Parse the purchase timestamp once so the `.dt` accessor can be used on it
purchase_ts = pd.to_datetime(merged_df['order_purchase_timestamp'])
merged_df['order_purchase_timestamp'] = purchase_ts

# Monthly bucket (pandas Period) used as the aggregation key
merged_df['year_month'] = purchase_ts.dt.to_period('M')

# Row count per month
sales_trends = merged_df.groupby('year_month').size()

# Line chart of the monthly counts
fig, ax = plt.subplots(figsize=(15, 6))
sales_trends.plot(kind='line', marker='o', ax=ax)
ax.set_title('Sales Trends Over Time (Monthly)')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Orders')
ax.grid(True)
plt.show()

### Sales based on state

In [None]:
# Row count per customer state, largest first
state_sales = (
    merged_df
    .groupby('customer_state')
    .size()
    .sort_values(ascending=False)
)

# Bar chart of the per-state counts
fig, ax = plt.subplots(figsize=(15, 6))
state_sales.plot(kind='bar', ax=ax)
ax.set_title('Geographic Distribution of Sales by State')
ax.set_xlabel('State')
ax.set_ylabel('Number of Orders')
ax.tick_params(axis='x', rotation=45)
plt.show()

### Payment type

In [None]:
# Share of rows per payment type, drawn as a pie chart
payment_type_counts = merged_df['payment_type'].value_counts()

fig, ax = plt.subplots()
payment_type_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Payment Type Distribution')
plt.show()

In [None]:
# Share of rows per `satisfaction` value (CSAT), drawn as a pie chart
satisfaction_counts = merged_df['satisfaction'].value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
satisfaction_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax)
ax.set_title('Distribution of CSAT Scores')
plt.show()

# Bivariate Analysis
- Total order value by category
- Control Chart for Daily Average Time to Delivery
- Identifying top 10 features with highest correlation with 'satisfaction'

### Total order value by category

In [None]:
# Pareto chart: total order value per product category (bars, left axis)
# with the running share of total order value (line, right axis).

# Total order value per category and overall
order_value_by_category = merged_df.groupby('product_category_name_english')['order_value'].sum()
total_order_value = order_value_by_category.sum()

# A Pareto cumulative share is only meaningful over categories ordered from
# largest to smallest, so sort first and accumulate afterwards.
sorted_order_value_by_category = order_value_by_category.sort_values(ascending=False)
cumulative_order_value_percent = sorted_order_value_by_category.cumsum() / total_order_value * 100

fig, ax = plt.subplots(figsize=(14, 9))
sorted_order_value_by_category.plot(kind='bar', ax=ax, color='green')

# pandas draws the bars at integer positions 0..n-1; plot the cumulative line
# against the same positions so it lines up with the bars by construction.
bar_positions = range(len(cumulative_order_value_percent))
ax2 = ax.twinx()
ax2.plot(bar_positions, cumulative_order_value_percent.values, color='red', marker='D', ms=7)
ax2.yaxis.set_major_formatter(PercentFormatter())

# 80% reference line, plus a vertical marker at the first category whose
# cumulative share reaches it (skipped when there is no such category,
# e.g. when the data is empty).
ax2.axhline(80, color='green', linestyle='--', linewidth=2)
reached_80 = (cumulative_order_value_percent >= 80).to_numpy()
if reached_80.any():
    category_80_position_corrected = int(reached_80.argmax())
    category_80_idx_corrected = cumulative_order_value_percent.index[category_80_position_corrected]
    ax.axvline(category_80_position_corrected, color='purple', linestyle='--', linewidth=2)

ax.tick_params(axis='y', colors='orange')
ax2.tick_params(axis='y', colors='red')
ax.set_xlabel('Product Category')
ax.set_ylabel('Total Order Value', color='orange')
ax2.set_ylabel('Cumulative Percentage', color='red')
# Title and tick rotation are set on `ax` explicitly: after twinx() the pyplot
# "current axes" is the twin, whose x-axis is hidden.
ax.set_title('Pareto Chart of Order Value by Product Category with 80/20 Threshold')
ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

Pareto Chart of Order Value by Product Category with 80/20 Threshold: A small number of categories generate most of the revenue, showing market focus areas or best-sellers.

We can use this insight in targeted marketing efforts:
- Allocate more advertising budget to the top-performing product categories, which contribute most of the total order value, to maximize ROI.
- Create bundles or promotions that pair high-value items with other products to increase the overall order value.

### Control Chart for Daily Average Time to Delivery

In [None]:
from matplotlib import dates as mdates

# Control chart of the daily mean `time_to_delivery`, keyed by approval date.

# Index the rows by the parsed approval timestamp so they can be resampled.
# `assign` + `set_index` build a new frame; `merged_df` itself is not modified.
daily_delivery_times = (
    merged_df
    .assign(order_approved_at=pd.to_datetime(merged_df['order_approved_at']))
    .set_index('order_approved_at')
)
# Daily mean; days with no rows come out as NaN and are dropped
daily_avg_delivery_time = daily_delivery_times['time_to_delivery'].resample('D').mean().dropna()

# Centre line and +/- 3 standard deviation control limits
mean_delivery_time = daily_avg_delivery_time.mean()
std_dev_delivery_time = daily_avg_delivery_time.std()
upper_control_limit = mean_delivery_time + (std_dev_delivery_time * 3)
lower_control_limit = mean_delivery_time - (std_dev_delivery_time * 3)

# Plotting the Control Chart
fig, ax = plt.subplots(figsize=(14, 8))
# x_compat=True makes pandas plot plain matplotlib dates; without it pandas may
# switch the x-axis to its own period-based units for regularly spaced data,
# which the matplotlib.dates locator/formatter set below would mislabel.
daily_avg_delivery_time.plot(
    ax=ax,
    marker='o',
    linestyle='-',
    color='blue',
    markersize=4,
    x_compat=True,
    label='Daily Avg Time to Delivery',
)
# Labels are attached to each artist so the legend cannot drift out of order
ax.axhline(mean_delivery_time, color='green', linestyle='--', label='Mean')
ax.axhline(upper_control_limit, color='red', linestyle='--', label='Upper Control Limit')
ax.axhline(lower_control_limit, color='red', linestyle='--', label='Lower Control Limit')

# Formatting the plot
ax.set_title('Control Chart for Daily Average Time to Delivery')
ax.set_xlabel('Date')
ax.set_ylabel('Average Time to Delivery (Days)')
ax.legend()
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### Identifying top 10 features with highest correlation with 'satisfaction'


In [None]:
# Identify the 10 numeric features most strongly correlated with 'satisfaction'.

# Correlation is only defined for numeric columns
numeric_cols = merged_df.select_dtypes(include=[np.number])

# Pairwise correlation matrix of the numeric columns
corr_matrix = numeric_cols.corr()

# Correlation of every other feature with the target. The target's own entry is
# dropped by label (not by position), and undefined (NaN) correlations are removed.
satisfaction_corr = corr_matrix['satisfaction'].drop('satisfaction').dropna()

# Rank by absolute strength so strong negative relationships are kept,
# then report the signed values of the 10 strongest.
strongest_first = satisfaction_corr.abs().sort_values(ascending=False).index[:10]
top_10_corr = satisfaction_corr.loc[strongest_first]

print(top_10_corr)

In [None]:
# Minimum absolute correlation with 'satisfaction' for a feature to be kept
threshold = 0.05

# Keep every feature other than 'satisfaction' itself whose absolute
# correlation with it exceeds the threshold (i.e. above 0.05 or below -0.05)
satisfaction_corr = corr_matrix['satisfaction']
exceeds_threshold = satisfaction_corr.abs() > threshold
is_not_target = corr_matrix.index != 'satisfaction'
high_corr_features = corr_matrix.index[exceeds_threshold & is_not_target].tolist()

# Show the selected feature names
print(high_corr_features)

# Data types of the selected features
merged_df[high_corr_features].dtypes



In [None]:
# The four columns used as model inputs in this cell
top_4_features = [
    'payment_value',
    'time_to_delivery',
    'estimated_vs_actual_shipping',
    'late_delivery',
]
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Alternative candidate feature list.
# NOTE(review): despite its name this list holds seven columns, and no visible
# cell reads it -- the model below is built from `top_4_features`.
top_6_features = [
    'estimated_vs_actual_shipping',
    'order_month',
    'order_hour',
    'price',
    'payment_sequential',
    'order_value',
    'payment_installments',
]

# Target and model inputs
y = merged_df['satisfaction']
X = merged_df[top_4_features]

# Hold out 20% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the selected columns
numerical_transformer = Pipeline([('scaler', StandardScaler())])
preprocessor = ColumnTransformer([('num', numerical_transformer, top_4_features)])

# Fit the scaler on the training split only, then apply it to both splits
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Classifiers to compare, keyed by display name
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(
        random_state=42,
        max_depth=10,
        min_samples_split=50,
    ),
    'Random Forest': RandomForestClassifier(
        random_state=42,
        n_estimators=100,
        max_depth=10,
        min_samples_split=10,
        min_samples_leaf=4,
    ),
    'XGBoost': xgb.XGBClassifier(random_state=42),
}

# Function to fit models, make predictions, and evaluate them
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, predict on the test split, and display its evaluation.

    Shows a confusion-matrix heatmap whose ticks carry the actual class
    labels, then prints the classification report.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on the training data.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    model_name : str
        Used in the plot title and the printed header.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Sorted union of the labels seen in the truth and the predictions. It is
    # passed explicitly so the matrix rows/columns and the heatmap tick labels
    # are guaranteed to share one order.
    class_labels = sorted(set(y_test) | set(y_pred))
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    class_report = classification_report(y_test, y_pred)

    # Plotting confusion matrix
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        cbar=False,
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(f'{model_name} Confusion Matrix')
    plt.show()

    print(f"{model_name} Classification Report:")
    print(class_report)

# Fit and report on every configured classifier, in dictionary order
for model_name, model in models.items():
    print(f"Evaluating {model_name}")
    evaluate_model(
        model=model,
        X_train=X_train_preprocessed,
        y_train=y_train,
        X_test=X_test_preprocessed,
        y_test=y_test,
        model_name=model_name,
    )

