In [26]:
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint,uniform
from sklearn.metrics import make_scorer, roc_auc_score, f1_score

In [36]:
# Creamos nuestro propio estimador para que la salida del algoritmo este comprendida entre 0 y 1
from sklearn.base import BaseEstimator

class IsolationForestCustom(BaseEstimator):
    def __init__(self, max_samples=100, contamination=0.1):
        self.contamination = contamination
        self.max_samples = max_samples
        self._ift_clf = None
        
    def fit(self, X, y=None):
        self._ift_clf = IsolationForest(max_samples=self.max_samples, 
                                        contamination=self.contamination)
        self._ift_clf.fit(X) 
        return self
    
    def predict(self, X, y=None):
        preds = self._ift_clf.predict(X)
        preds[preds==1] = 0
        preds[preds==-1] = 1
        return preds
    
    def get_params(self, deep=True):
        return {"contamination": self.contamination,
               "max_samples": self.max_samples}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

In [37]:
# Load your dataset
df = pd.read_csv("fraud_reduced.csv", index_col=0)

# Split your data into features (X) and the target variable (y)
X = df.drop("Class", axis=1)
y = df["Class"].copy()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.33, stratify=y)

param_grid = {
    'max_samples': [100, 200, 250, 300],
    'contamination': [0.001, 0.01, 0.1],
    'max_features': [0.9, 0.8, 1]}
# Initialize the Isolation Forest model
ift_clf = IsolationForestCustom()

# Fit the Isolation Forest model to the entire dataset (X)
ift_clf.fit(X)

# Create a scoring function for RandomizedSearchCV
scorer = 'f1'

# Initialize and run RandomizedSearchCV
grid_dt_estimator = GridSearchCV(ift_clf,
                                       param_grid=param_grid,
                                       scoring= scorer,
                                       refit=True,
                                       cv=10,
                                       return_train_score=True)
grid_dt_estimator.fit(X_train, y_train)



In [38]:
grid_dt_estimator.best_params_

{'contamination': 0.001, 'max_features': 0.9, 'max_samples': 250}

In [44]:
cvres = grid_dt_estimator.cv_results_
results = [(mean_score, params) for mean_score, params in zip(cvres["mean_test_score"], cvres["params"])]
results.sort(reverse=True, key=lambda x: x[0])
for mean_score, params in results:
    print("f1", mean_score, "-", "Parameters:", params)

f1 0.6029967085929941 - Parameters: {'contamination': 0.001, 'max_features': 0.9, 'max_samples': 250}
f1 0.598427765352184 - Parameters: {'contamination': 0.001, 'max_features': 1, 'max_samples': 300}
f1 0.5983991505646505 - Parameters: {'contamination': 0.001, 'max_features': 0.8, 'max_samples': 300}
f1 0.5961348439114312 - Parameters: {'contamination': 0.001, 'max_features': 0.8, 'max_samples': 100}
f1 0.5932505287702021 - Parameters: {'contamination': 0.001, 'max_features': 0.9, 'max_samples': 300}
f1 0.592699637038155 - Parameters: {'contamination': 0.001, 'max_features': 0.9, 'max_samples': 100}
f1 0.5891818803721719 - Parameters: {'contamination': 0.001, 'max_features': 0.8, 'max_samples': 250}
f1 0.587582235145599 - Parameters: {'contamination': 0.001, 'max_features': 1, 'max_samples': 100}
f1 0.5863915452025539 - Parameters: {'contamination': 0.001, 'max_features': 0.9, 'max_samples': 200}
f1 0.5848160133421925 - Parameters: {'contamination': 0.001, 'max_features': 0.8, 'max_sa

In [40]:
isoaltion_forest = grid_dt_estimator.best_estimator_

In [41]:
anomalies = isoaltion_forest.predict(X)

In [50]:
print("Total de anomalías identificadas:", len(y[anomalies==1]))

Total de anomalías identificadas: 285


In [51]:
y[anomalies==1].value_counts()

Class
1    239
0     46
Name: count, dtype: int64