# Data Transformation

In [6]:
import pandas as pd

In [7]:
# Read only the first 500,000 rows of the raw transaction log into a DataFrame,
# so the sample stays small enough to inspect interactively.

file_path = "../data/raw/PS_20174392719_1491204439457_log.csv"
df_sample = pd.read_csv(filepath_or_buffer=file_path, nrows=500_000)

In [8]:
# Count missing values per column of the sample (displayed as the cell output).
df_sample.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Although the sample contains no null values, the output above is still useful as a listing of every column name.

During the EDA phase, we learned that certain columns gave no useful information for analysis (step, nameOrig, nameDest) and that others did not contribute to useful visualizations (newbalanceOrig, oldbalanceDest, newbalanceDest). The visualizations also showed that the distributions of oldbalanceOrg and newbalanceOrig, and of oldbalanceDest and newbalanceDest, are very similar, suggesting that some of these columns are redundant but may still be useful. For that reason, only the first group (step, nameOrig, nameDest) is dropped below; the balance columns are kept.

In [9]:
# Columns to remove from the sample before further processing.
selected = ["step", "nameOrig", "nameDest"]
# `drop` returns a new frame; `df_sample` itself is left unchanged.
sample_drop = df_sample.drop(labels=selected, axis="columns")

In [10]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Purpose: fit a scaling / one-hot preprocessor on the in-memory sample, apply it
# to the full CSV chunk by chunk, and write the combined result to disk.

chunk_size = 5000  # Rows per chunk; tune to the memory available on your system
processed_chunks = []  # Processed chunks, concatenated once after the loop

# The target is carried through unchanged. It must NOT be listed as a feature:
# standardising the label is wasted work, and the label would otherwise be
# handed to the feature transformer.
target_column = "isFraud"
numeric_features = ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]
categorical_features = ["type"]  # Adjust based on your actual categorical features

# Columns not named in a transformer are dropped (ColumnTransformer's default
# remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # handle_unknown="ignore": a category that appears only in later chunks
        # (i.e. not in the fit sample) is encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    # Always return a dense array so it can be wrapped in a DataFrame below,
    # rather than relying on the default density heuristic.
    sparse_threshold=0,
)

# Fit on the sample only (features, not the target), so scaling statistics and
# the category list come from the first rows of the file.
# NOTE(review): if the file is ordered (e.g. by time), these statistics may not
# be representative of the whole dataset -- confirm this is acceptable.
preprocessor.fit(df_sample.drop(columns=target_column))

# Output column names depend only on the fitted preprocessor, so build them once
# instead of once per chunk.
numeric_features_processed = [f"{feature}_scaled" for feature in numeric_features]
categorical_features_encoded = preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_features)
all_features = numeric_features_processed + list(categorical_features_encoded)

for chunk in pd.read_csv(file_path, chunksize=chunk_size):
    # Split features from the target so only features reach the preprocessor.
    X_chunk = chunk.drop(columns=target_column)
    y_chunk = chunk[target_column]
    processed_X_chunk = preprocessor.transform(X_chunk)

    processed_chunk_df = pd.DataFrame(processed_X_chunk, columns=all_features)
    # Assign by position: the chunk keeps its original file index, while the new
    # frame is indexed from 0, so index-based alignment would produce NaNs.
    processed_chunk_df[target_column] = y_chunk.to_numpy()
    processed_chunks.append(processed_chunk_df)

# Concatenate all processed chunks back into a single DataFrame.
# Note: this holds the entire processed dataset in memory at once.
processed_df = pd.concat(processed_chunks, ignore_index=True)
processed_df.to_csv("../data/processed/process_data_chunk.csv", index=False)