In [None]:
!pip install pandas pyarrow fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [None]:
import os
from google.colab import drive
# Attach Google Drive to the Colab VM so its files are reachable as paths.
mount_point = '/content/drive'
drive.mount(mount_point)

# Project root inside the mounted Drive; later cells build file paths from it.
folder_path = mount_point + "/My Drive/GenAI/"

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the transactions dataset under the project folder
dataset_path = f"{folder_path}data/unsupervised learning/dataset.parquet"

# Sanity check: report whether the file is visible before trying to read it
dataset_found = os.path.exists(dataset_path)
print(dataset_found)

# pandas reads Parquet through whichever engine is installed (pyarrow or fastparquet)
df = pd.read_parquet(dataset_path)

# Preview the first few rows
preview = df.head()
print(preview)

True
           ssn            cc_num    first   last gender      city state  \
0  367-85-9826  4361337605230458  Kristie  Davis      F  Chandler    OK   
1  367-85-9826  4361337605230458  Kristie  Davis      F  Chandler    OK   
2  367-85-9826  4361337605230458  Kristie  Davis      F  Chandler    OK   
3  367-85-9826  4361337605230458  Kristie  Davis      F  Chandler    OK   
4  367-85-9826  4361337605230458  Kristie  Davis      F  Chandler    OK   

     zip  city_pop                     job         dob      acct_num  \
0  74834      7590  Chief Strategy Officer  1987-06-12  349734538563   
1  74834      7590  Chief Strategy Officer  1987-06-12  349734538563   
2  74834      7590  Chief Strategy Officer  1987-06-12  349734538563   
3  74834      7590  Chief Strategy Officer  1987-06-12  349734538563   
4  74834      7590  Chief Strategy Officer  1987-06-12  349734538563   

                          trans_num  trans_date trans_time   unix_time  \
0  c036244703adb9d5392f4027d9d4b38d  

# ISOLATION FOREST
Isolation Forest is an unsupervised anomaly-detection algorithm, which makes it useful for fraud detection.

In [None]:
from sklearn.ensemble import IsolationForest

# Columns fed to the anomaly detectors (also reused by the autoencoder section)
features = ["gender", "amt", "unix_time", "category", "merchant", "city_pop"]

# One-hot encode the categorical columns; numeric columns pass through unchanged
df_selected = pd.get_dummies(df[features])

# Fraction of training rows the forest is told to treat as anomalies.
# NOTE(review): 0.4 flags roughly 40% of the rows. An earlier comment here
# claimed "2% expected fraud", which would be contamination=0.02. The value is
# left unchanged to preserve existing results - confirm the intended rate.
CONTAMINATION = 0.4

# Fixed random_state so the fitted forest is reproducible across runs
model = IsolationForest(contamination=CONTAMINATION, random_state=42)
model.fit(df_selected)

# predict() returns class labels (-1 = anomaly, 1 = normal), not a continuous
# score; the "fraud_score" column name is kept for compatibility.
df["fraud_score"] = model.predict(df_selected)

# Convert the -1/1 labels into a 1/0 fraud flag
df["fraud_detected_isoforest"] = (df["fraud_score"] == -1).astype(int)

In [None]:
from datetime import datetime
# Timestamp of the hand-made transaction, converted to epoch seconds.
# The parsed datetime is naive, so .timestamp() interprets it in the local
# timezone of the machine running the notebook.
date_str = "2024-03-23 00:30:00"
parsed_date = datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
unix = parsed_date.timestamp()

# A single synthetic transaction to score with the trained model
transaction_record = {
    "gender": "M",
    "amt": 500,
    "unix_time": unix,
    "category": "Entertainment ej eoijeo ijoeij oiejoij ",
    "merchant": "Amazon",
    "city_pop": 5000000000,
}
new_transaction = pd.DataFrame([transaction_record])

# One-hot encode it the same way as the training frame
new_transaction_encoded = pd.get_dummies(new_transaction)

# Align with the training layout in one step: dummy columns the model was
# trained on but this row lacks are added as 0, columns unknown to the model
# are dropped, and the column order follows the training data.
new_transaction_encoded = new_transaction_encoded.reindex(
    columns=df_selected.columns, fill_value=0
)

In [None]:
# Label for the single new transaction: -1 means anomaly (fraud), 1 means normal
fraud_score = model.predict(new_transaction_encoded)[0]

# Same information as a 1/0 flag
fraud_detected = int(fraud_score == -1)

print("Fraud Detected:", fraud_score, fraud_detected)

Fraud Detected: -1 1


In [None]:
import joblib

# Destination for the serialized Isolation Forest, next to the dataset
model_path = f"{folder_path}data/unsupervised learning/isolation_forest_model.joblib"

# Report whether a previous save already exists (it is overwritten either way)
already_saved = os.path.exists(model_path)
print(already_saved)

# Persist the fitted model to Drive
joblib.dump(model, model_path)
print("Model saved successfully!")

False
Model saved successfully!


# AUTO ENCODER
An unsupervised deep-learning model: transactions that the network reconstructs poorly are flagged as potential fraud.

In [None]:
!pip install tensorflow keras



In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Select the autoencoder inputs and one-hot encode the categorical columns
df_selected = pd.get_dummies(df[features])

# Standardize every column to zero mean / unit variance.
# The data is processed in batches, but the scaler must learn ONE global
# mean/variance. Calling fit_transform on each batch would re-fit the scaler
# every time, so each batch would be normalized with its own statistics and
# the final `scaler` would only describe the last batch.
scaler = StandardScaler()
batch_size = 10000
batch_starts = range(0, len(df_selected), batch_size)

# Pass 1: accumulate the global statistics incrementally
for i in batch_starts:
    scaler.partial_fit(df_selected.iloc[i : i + batch_size])

# Pass 2: transform each batch with the same fitted scaler
df_scaled_list = [
    scaler.transform(df_selected.iloc[i : i + batch_size]) for i in batch_starts
]

# Stack all batches in a single call instead of re-copying the growing array
# once per batch
df_scaled = np.concatenate(df_scaled_list, axis=0)

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Dense

# Number of input features (columns of the scaled matrix)
input_dim = df_scaled.shape[1]

# Symmetric autoencoder: input -> 8 -> 4 (bottleneck) -> 8 -> input
input_layer = Input(shape=(input_dim,))
encoded = Dense(8, activation="relu")(input_layer)
encoded = Dense(4, activation="relu")(encoded)
decoded = Dense(8, activation="relu")(encoded)
# Linear output layer: the targets are standardized values, which are
# unbounded and often negative. A sigmoid output is confined to (0, 1) and
# could never reproduce them, inflating the reconstruction error everywhere.
decoded = Dense(input_dim, activation="linear")(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer="adam", loss="mse")

# Train the network to reproduce its own input
autoencoder.fit(df_scaled, df_scaled, epochs=50, batch_size=32, shuffle=True)


In [None]:
# Reconstruct every transaction with the trained autoencoder
reconstructed = autoencoder.predict(df_scaled)

# Per-row reconstruction error. This is the mean ABSOLUTE error across
# features (not a squared error, despite the historical `mse` name).
reconstruction_error = np.mean(np.abs(df_scaled - reconstructed), axis=1)
mse = reconstruction_error  # legacy alias, kept in case later cells read `mse`

# Rows whose error lies above this percentile are flagged, i.e. the top 0.4%
# of reconstruction errors.
FRAUD_PERCENTILE = 99.6
threshold = np.percentile(reconstruction_error, FRAUD_PERCENTILE)

# Detect fraud (1 = fraud, 0 = normal)
df["fraud_detected_autoencoder"] = (reconstruction_error > threshold).astype(int)


In [None]:
# Row-wise agreement between the two detectors' 0/1 fraud flags
flags_agree = df["fraud_detected_isoforest"] == df["fraud_detected_autoencoder"]

# Share of rows on which both methods give the same verdict, as a percentage
similarity_percentage = flags_agree.mean() * 100
print(f"Similarity between Isolation Forest and Autoencoder fraud detection: {similarity_percentage:.2f}%")