In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("../data/creditcard.csv")  # Adjust path if needed
print(df.shape)
df.head()


# Check for missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)


# Drop 'Time' as it's not useful
df = df.drop(columns=["Time"])


from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df["Amount"] = scaler.fit_transform(df[["Amount"]])


from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X, y = df.drop(columns=["Class"]), df["Class"]
X_resampled, y_resampled = smote.fit_resample(X, y)

print("New class distribution:\n", y_resampled.value_counts())

# Save the cleaned and balanced dataset
df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
df_resampled["Class"] = y_resampled

df_resampled.to_csv("../data/cleaned_creditcard.csv", index=False)
print("Cleaned dataset saved!")




(284807, 31)
Missing Values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
New class distribution:
 Class
0    284315
1    284315
Name: count, dtype: int64
Cleaned dataset saved!
