### Step 1: Data Cleaning (Python / SQL)

In [None]:
import pandas as pd

# Load the raw dataset.
df = pd.read_csv("ecommerce_data.csv")

# Standardize column names FIRST (trim surrounding whitespace, lowercase).
# Every later step refers to the lowercase names, and doing this before the
# dropna below means the cleaning no longer depends on the CSV header using
# one exact casing such as 'ProductID'.
df.columns = df.columns.str.strip().str.lower()

# Drop exact duplicate rows, then rows missing any of the three fields the
# rest of the analysis relies on. Rebinding `df` (instead of inplace=True)
# keeps the step a single readable chain.
df = (
    df.drop_duplicates()
      .dropna(subset=['productid', 'category', 'returnflag'])
)

### Step 2: Analyze Return Rate by Category and Supplier (Python)

In [None]:
# Mean of `returnflag` within each category, shaped as a two-column table
# (category, return_rate).
category_returns = (
    df.groupby('category')['returnflag']
      .mean()
      .rename('return_rate')
      .reset_index()
)
print(category_returns)

# Same aggregation, grouped by supplier instead of category.
supplier_returns = (
    df.groupby('supplier')['returnflag']
      .mean()
      .rename('return_rate')
      .reset_index()
)
print(supplier_returns)

         category  return_rate
0         Apparel     0.200000
1          Beauty     0.224490
2     Electronics     0.348837
3  Home & Kitchen     0.216216
4          Sports     0.277778
    supplier  return_rate
0  SupplierA     0.148936
1  SupplierB     0.261905
2  SupplierC     0.311111
3  SupplierD     0.287879


In [None]:
# Recompute the per-group mean of `returnflag`, keeping the raw groupby
# result: a Series indexed by the group key rather than a two-column table.
# NOTE: this rebinds `category_returns` / `supplier_returns`, which the
# previous cell defined as DataFrames with a `return_rate` column.
category_returns = df['returnflag'].groupby(df['category']).mean()
supplier_returns = df['returnflag'].groupby(df['supplier']).mean()

### Step 3: Predict Return Probability (Logistic Regression)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report

# Feature engineering: the model inputs and the binary target.
features = df[['category', 'region', 'marketingchannel', 'price', 'quantity']]
target = df['returnflag']

# One-hot encode the categorical columns. get_dummies leaves numeric columns
# as they are (presumably price and quantity -- confirm their dtypes).
features_encoded = pd.get_dummies(features)

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# returned / not-returned proportions the same in the train and test sets,
# which matters here because returns are the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    features_encoded,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

# class_weight='balanced' weights each class inversely to its frequency.
# Without it the fitted model predicted "no return" for every held-out row
# (recall 0.00 for class 1 in the saved report), making the classifier and
# any high cut-off on its scores useless.
# NOTE(review): balanced weighting shifts predicted probabilities upward for
# the minority class, so downstream score thresholds should be re-checked.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate on the held-out rows. zero_division=0 reports 0.0 (without a
# warning) if a class still ends up with no predicted samples.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, zero_division=0))

# Attach the predicted probability of the positive class (column 1 of
# predict_proba) to every row of the original data, training rows included.
df['return_risk_score'] = model.predict_proba(features_encoded)[:, 1]

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        27
           1       0.00      0.00      0.00        13

    accuracy                           0.68        40
   macro avg       0.34      0.50      0.40        40
weighted avg       0.46      0.68      0.54        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Summarize the predicted positive-class probabilities (count, mean, quartiles,
# min/max) instead of echoing one value per row. The max in particular shows
# whether any row can clear a given risk threshold.
pd.Series(
    model.predict_proba(features_encoded)[:, 1],
    name='return_risk_score',
).describe()

array([0.17619777, 0.24501004, 0.21948847, 0.27276412, 0.26513456,
       0.14237094, 0.45148549, 0.23248181, 0.21858272, 0.19378506,
       0.28575316, 0.19394147, 0.19716559, 0.34919731, 0.1810784 ,
       0.18006362, 0.36983195, 0.16157819, 0.16933468, 0.2295862 ,
       0.18947333, 0.21796479, 0.28293448, 0.16955661, 0.1785513 ,
       0.10914687, 0.4019209 , 0.25257234, 0.11675247, 0.26160305,
       0.34782323, 0.32659467, 0.22416935, 0.21329347, 0.14390412,
       0.13382068, 0.14699963, 0.45204798, 0.29357124, 0.19003947,
       0.28974567, 0.11559172, 0.18570855, 0.11284231, 0.38437314,
       0.42322024, 0.17092412, 0.14432664, 0.12460843, 0.12794632,
       0.34311363, 0.25252993, 0.20908718, 0.1321266 , 0.4459766 ,
       0.3701017 , 0.182214  , 0.16463694, 0.13264434, 0.34166895,
       0.21948369, 0.36734622, 0.1192728 , 0.20542284, 0.2170624 ,
       0.33206628, 0.24698689, 0.30668415, 0.42783167, 0.12000036,
       0.42699788, 0.30188781, 0.25766448, 0.15084352, 0.17991

In [None]:
# Keep the rows whose predicted return probability exceeds 0.7 and export
# just the product id and its score.
# NOTE(review): the scores shown in the saved output above are all below 0.5,
# so this cut-off may select no rows -- confirm the intended threshold.
is_high_risk = df['return_risk_score'] > 0.7
high_risk = df.loc[is_high_risk]
high_risk.loc[:, ['productid', 'return_risk_score']].to_csv(
    'high_risk_products.csv',
    index=False,
)

In [None]:
# Re-score every row with the fitted model; column 1 of predict_proba is the
# probability of the positive class.
class_probabilities = model.predict_proba(features_encoded)
df['return_risk_score'] = class_probabilities[:, 1]

# Filter to scores above 0.7 and write ALL columns of the matching rows.
# NOTE: this targets the same 'high_risk_products.csv' path as the previous
# cell, so it replaces that two-column export with the full-width one.
high_risk = df.loc[df['return_risk_score'].gt(0.7)]
high_risk.to_csv('high_risk_products.csv', index=False)