In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


file_path = 'preprocessed_df.csv'  
df = pd.read_csv(file_path)

# Features and target
X = df.drop(['fraud', 'step', 'customer', 'zipcodeOri', 'zipMerchant', 'amount'], axis=1)
y = df['fraud']

# Original class distribution
print("Original class distribution:")
print(y.value_counts())

# Oversampling
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Class distribution after oversampling
print("\nClass distribution after SMOTE (oversampling):")
print(y_smote.value_counts())

# Save oversampled data
oversampled_data = pd.concat([X_smote, y_smote], axis=1)
oversampled_data.to_csv('oversampled_data.csv', index=False)

# Undersampling
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# Class distribution after undersampling
print("\nClass distribution after RandomUnderSampler (undersampling):")
print(y_rus.value_counts())

# Save undersampled data
undersampled_data = pd.concat([X_rus, y_rus], axis=1)
undersampled_data.to_csv('undersampled_data.csv', index=False)

# Display the first few rows to verify
print("Oversampled Data (SMOTE):")
print(oversampled_data.head())

print("\nUndersampled Data (RUS):")
print(undersampled_data.head())


Original class distribution:
fraud
0    587443
1      7200
Name: count, dtype: int64

Class distribution after SMOTE (oversampling):
fraud
0    587443
1    587443
Name: count, dtype: int64

Class distribution after RandomUnderSampler (undersampling):
fraud
0    7200
1    7200
Name: count, dtype: int64
Oversampled Data (SMOTE):
   Unnamed: 0  age_'0'  age_'1'  age_'2'  age_'3'  age_'4'  age_'5'  age_'6'  \
0           0    False    False    False    False     True    False    False   
1           1    False    False     True    False    False    False    False   
2           2    False    False    False    False     True    False    False   
3           3    False    False    False     True    False    False    False   
4           4    False    False    False    False    False     True    False   

   age_'U'  gender_'E'  ...  merchant_'M840466850'  merchant_'M855959430'  \
0    False       False  ...                  False                  False   
1    False       False  ...         