In [30]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn.cluster
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, LabelEncoder, StandardScaler

In [None]:
# Let's see if we can make UIDs by clustering.

# Directory holding the Kaggle CSV files. Override it with the KAGGLE_DATA_DIR
# environment variable so the notebook can run on another machine; the default keeps
# the location this notebook was originally written against.
DATA_DIR = Path(os.environ.get('KAGGLE_DATA_DIR', '/Users/sara/Desktop'))

# Read the transaction table and the identity table.
kaggle_set = pd.read_csv(DATA_DIR / 'train_transaction.csv')
kaggle_id_set = pd.read_csv(DATA_DIR / 'train_identity.csv')

In [None]:
# Collect the distinct combinations of the card/address identifier columns.
identifier_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
id_df = kaggle_set.loc[:, identifier_cols].drop_duplicates()

# Report how many distinct identifier combinations remain.
print("Number of unique identifiers:", len(id_df))


Number of unique identifiers: 43071


In [89]:
# Preprocess the data for use with an Isolation Forest.

# Work on a copy so the raw transaction frame is left untouched.
# NOTE: isFraud is still present in `df`; it is only removed from the feature
# frame (`df2`) further down so it cannot be used as a factor.
df = kaggle_set.copy()

# Merge in the identity dataset (left join keeps every transaction).
df = df.merge(kaggle_id_set, on=['TransactionID'], how='left')

# Sort by transaction date from most recent to oldest.
df = df.sort_values('TransactionDT', ascending=False)

# Add a per-row count of NaNs. Not using one-hot encoding because it will cluster fraud cases together.
df['nan_count'] = df.isnull().sum(axis=1)

# Drop columns that we don't want to transform so we can use them later
# (they are re-attached from `df` after PCA).
df2 = df.drop(columns=['TransactionID', 'TransactionDT', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'isFraud']).copy()

# Separate into categorical and numerical factors.
categorical_df = df2.select_dtypes(include=['object']).copy()
numerical_df = df2.select_dtypes(exclude=['object']).copy()

# Fill missing values in categorical columns with the term 'missing',
# then label-encode each categorical column independently.
categorical_df = categorical_df.fillna('missing')
categorical_df = categorical_df.apply(lambda x: LabelEncoder().fit_transform(x.astype(str)))

# Z-normalize every numerical column and fill its missing values.
# The loop stays column-by-column to avoid materialising several full-size
# temporary copies of a very wide frame.
for column in numerical_df.columns:
    values = numerical_df[column].astype(float)

    # A constant column has std == 0 and an all-NaN (or single-row) column has
    # std == NaN; dividing by either would turn the whole column into NaN and
    # break PCA downstream. Fall back to 1.0 so such columns become all zeros.
    column_std = values.std()
    if not column_std > 0:
        column_std = 1.0

    normalized = (values - values.mean()) / column_std

    # Missing values take the column mean, which is 0 after normalization.
    numerical_df[column] = normalized.fillna(0.0)


In [90]:
# Reduce the dimensionality of the numerical features with PCA.
# n_components=0.90 keeps enough components to explain 90% of the variance.
# NOTE(review): an earlier comment here said 79% (attributed to Adrian) -- confirm
# which threshold is actually intended.
pca = PCA(n_components=0.90)
pca_input = numerical_df.drop(columns=['nan_count'])
our_pca = pca.fit_transform(pca_input)

# Name the components PC1..PCk.
pc_names = [f'PC{k}' for k in range(1, our_pca.shape[1] + 1)]
df_pca = pd.DataFrame(our_pca, columns=pc_names)

# Re-attach, in order: the encoded categorical columns, the identifier columns
# plus isFraud, and the (normalized) nan_count. All indexes are reset so rows
# line up by position with the PCA output.
df_pca = pd.concat([df_pca, categorical_df.reset_index(drop=True)], axis=1)
carried_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'isFraud']
df_pca[carried_cols] = df[carried_cols].reset_index(drop=True)
df_pca['nan_count'] = numerical_df['nan_count'].reset_index(drop=True)

In [91]:
# Show every column name present in the PCA frame.
print("Columns in PCA set:", list(df_pca.columns))

Columns in PCA set: ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'ProductCD', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'isFraud', 'nan_count']


In [95]:
# Collapse rows that share the same card/address identifiers.
# NOTE(review): pandas groupby drops groups whose key contains NaN by default, so
# identifier combinations with a missing card/addr value are excluded here --
# confirm that this is intended.
group_keys = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
by_identifier = df_pca.groupby(group_keys)

# `first` takes the first non-null value per column in each group;
# `max` takes the per-column maximum in each group.
grouped_df_first = by_identifier.first().reset_index()
grouped_df_max = by_identifier.max().reset_index()

# Remove the columns that only identify a group (and the label) before clustering.
grouped_df_final = grouped_df_first.drop(columns=group_keys + ['isFraud'])


In [129]:
#define a helper that fits an Isolation Forest model and scores the data

def detect_anomalies(data, contamination='auto', random_state=42, n_estimators=100,
                     max_samples='auto', bootstrap=False, max_features=1.0):
    """Fit an Isolation Forest on ``data`` and return a scored copy of it.

    All hyperparameters are forwarded unchanged to ``IsolationForest``. Their
    defaults mirror scikit-learn's own defaults, except ``random_state``, which
    is fixed so repeated runs of the notebook give the same result.

    Parameters
    ----------
    data : DataFrame
        Feature table; the model is fitted on it and then scores the same rows.
    contamination, random_state, n_estimators, max_samples, bootstrap, max_features
        Passed straight through to ``IsolationForest``.

    Returns
    -------
    DataFrame
        A copy of ``data`` with three extra columns:
        ``anomaly_score`` (the raw ``decision_function`` output),
        ``anomaly_likelihood`` (1 minus the min-max scaled score, so the row
        with the lowest score gets 1 and the highest gets 0), and
        ``is_anomaly`` (1 where the model predicts -1, otherwise 0).
    """
    model = IsolationForest(
        contamination=contamination,
        random_state=random_state,
        n_estimators=n_estimators,
        max_samples=max_samples,
        bootstrap=bootstrap,
        max_features=max_features,
    )
    model.fit(data)

    scores = model.decision_function(data)
    preds = model.predict(data)
    # MinMaxScaler expects a 2-D array, hence the reshape to a single column.
    scaled_scores = 1 - MinMaxScaler().fit_transform(scores.reshape(-1, 1))

    # Package the results on a copy so the caller's frame is not modified.
    results = data.copy()
    results['anomaly_score'] = scores
    results['anomaly_likelihood'] = scaled_scores.flatten()
    results['is_anomaly'] = (preds == -1).astype(int)

    return results

In [120]:
# Run the Isolation Forest on the grouped data.
# Every hyperparameter is passed explicitly because detect_anomalies declares
# them all as parameters. The values are scikit-learn's IsolationForest defaults
# plus a fixed seed for reproducibility.
# NOTE(review): the settings that produced the recorded metrics are not shown in
# this notebook -- adjust these values if a different configuration was intended.
results = detect_anomalies(
    grouped_df_final,
    contamination='auto',
    random_state=42,
    n_estimators=100,
    max_samples='auto',
    bootstrap=False,
    max_features=1.0,
)

# Add the identifiers and isFraud back to the results (taken from the per-group max,
# so a group's isFraud is its largest isFraud value).
label_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'isFraud']
results[label_cols] = grouped_df_max[label_cols]

# Keep only the identifiers, the anomaly outputs and the label, so the fraud
# determination can be applied to each group in the original dataset.
herewego = results[['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'anomaly_score', 'anomaly_likelihood', 'is_anomaly', 'isFraud']]

In [121]:
# Encode each (isFraud, is_anomaly) pair as a single integer:
#   4 * isFraud - is_anomaly  ->  3 = TP, -1 = FP, 4 = FN, 0 = TN
results['final_ans'] = 4 * results['isFraud'] - results['is_anomaly']

# Tally how often each encoded outcome occurs; a missing outcome counts as 0.
outcome_counts = results['final_ans'].value_counts()
TP_count = outcome_counts.get(3, 0)   # fraud and flagged
FP_count = outcome_counts.get(-1, 0)  # not fraud but flagged
FN_count = outcome_counts.get(4, 0)   # fraud but not flagged
TN_count = outcome_counts.get(0, 0)   # not fraud and not flagged

# Guard each ratio against an empty denominator.
flagged_total = TP_count + FP_count
fraud_total = TP_count + FN_count
Precision = TP_count / flagged_total if flagged_total > 0 else 0
Recall = TP_count / fraud_total if fraud_total > 0 else 0
if (Precision + Recall) > 0:
    F1_score = 2 * (Precision * Recall) / (Precision + Recall)
else:
    F1_score = 0

print(f"Precision: {Precision:.4f}, Recall: {Recall:.4f}, F1 Score: {F1_score:.4f}")

Precision: 0.1019, Recall: 0.5161, F1 Score: 0.1701


In [None]:
def find_optimal_contamination(data, fraudCompareData, samplerange=np.arange(250, 2000, 50),
                               param_name='n_estimators'):
    """Sweep one Isolation Forest hyperparameter and return the value with the best F1.

    NOTE: despite the function name, the default sweep is over ``n_estimators``
    (the default ``samplerange`` of 250..1950 is not a valid contamination
    range). Pass ``param_name='contamination'`` together with a suitable
    ``samplerange`` to sweep the contamination instead.

    Parameters
    ----------
    data : DataFrame
        Feature table handed to ``detect_anomalies``.
    fraudCompareData : DataFrame
        Frame with an ``isFraud`` column; it is aligned to the results by index.
    samplerange : iterable
        Candidate values for the swept hyperparameter.
    param_name : str
        Name of the ``detect_anomalies`` hyperparameter to vary.

    Returns
    -------
    tuple
        ``(best_value, best_f1)``. Both are 0 if no candidate reaches F1 > 0.

    Raises
    ------
    ValueError
        If ``param_name`` is not one of the supported hyperparameters.
    """
    # Full baseline so the call is valid whether or not detect_anomalies
    # provides defaults of its own; only `param_name` changes per iteration.
    base_params = {
        'contamination': 'auto',
        'random_state': 42,
        'n_estimators': 100,
        'max_samples': 'auto',
        'bootstrap': False,
        'max_features': 1.0,
    }
    if param_name not in base_params:
        raise ValueError(f"Unknown hyperparameter: {param_name!r}")

    best_f1 = 0
    best_value = 0

    for candidate in samplerange:
        # Convert numpy scalars (e.g. from np.arange) to plain Python numbers.
        value = candidate.item() if hasattr(candidate, 'item') else candidate

        results = detect_anomalies(data, **{**base_params, param_name: value})
        results['isFraud'] = fraudCompareData['isFraud']

        is_fraud = results['isFraud'] == 1
        is_flagged = results['is_anomaly'] == 1

        TP_count = (is_fraud & is_flagged).sum()
        FP_count = (~is_fraud & is_flagged).sum()
        FN_count = (is_fraud & ~is_flagged).sum()

        Precision = TP_count / (TP_count + FP_count) if (TP_count + FP_count) > 0 else 0
        Recall = TP_count / (TP_count + FN_count) if (TP_count + FN_count) > 0 else 0
        F1_score = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0
        # Label the line with the hyperparameter that is actually being swept.
        print(f"{param_name}: {value}, F1 Score: {F1_score:.4f}, Precision: {Precision:.4f}, Recall: {Recall:.4f}")

        if F1_score > best_f1:
            best_f1 = F1_score
            best_value = value

    return best_value, best_f1

In [128]:
# Sweep the default candidate range on the grouped features, using the per-group
# max frame as the source of the isFraud labels.
find_optimal_contamination(data=grouped_df_final, fraudCompareData=grouped_df_max)

Contamination: 250.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 300.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 350.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 400.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 450.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 500.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 550.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 600.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 650.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 700.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 750.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 800.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 850.00, F1 Score: 0.1701, Precision: 0.1019, Recall: 0.5161
Contamination: 900.00, F1

KeyboardInterrupt: 