# Fraud Detection in Electricity and Gas Consumption Challenge

## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm
from lightgbm import LGBMClassifier

import warnings
warnings.simplefilter('ignore')

## Read the Data

In [2]:
client_train = pd.read_csv('client_train.csv', low_memory=False)
invoice_train = pd.read_csv('invoice_train.csv', low_memory=False)

client_test = pd.read_csv('client_test.csv', low_memory=False)
invoice_test = pd.read_csv('invoice_test.csv', low_memory=False)

## Data Preprocessing

In [3]:
#convert the column invoice_date to date time format on both the invoice train and invoice test
for df in [invoice_train,invoice_test]:
    df['invoice_date'] = pd.to_datetime(df['invoice_date'])

In [4]:
#encode labels in categorical column
d={"ELEC":0,"GAZ":1}
invoice_train['counter_type']=invoice_train['counter_type'].map(d)
invoice_test['counter_type']=invoice_test['counter_type'].map(d)

In [5]:
#convert categorical columns to int for model
client_train['client_catg'] = client_train['client_catg'].astype(int)
client_train['disrict'] = client_train['disrict'].astype(int)

client_test['client_catg'] = client_test['client_catg'].astype(int)
client_test['disrict'] = client_test['disrict'].astype(int)

## Feature Engineering

In [6]:
def aggregate_by_client_id(invoice_data):
    aggs = {}
    aggs['consommation_level_1'] = ['mean']
    aggs['consommation_level_2'] = ['mean']
    aggs['consommation_level_3'] = ['mean']
    aggs['consommation_level_4'] = ['mean']

    agg_trans = invoice_data.groupby(['client_id']).agg(aggs)
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)

    df = (invoice_data.groupby('client_id')
            .size()
            .reset_index(name='{}transactions_count'.format('1')))
    return pd.merge(df, agg_trans, on='client_id', how='left')

In [7]:
#group invoice data by client_id
agg_train = aggregate_by_client_id(invoice_train)

In [8]:
#merge aggregate data with client dataset
train = pd.merge(client_train,agg_train, on='client_id', how='left')

In [9]:
#aggregate test set
agg_test = aggregate_by_client_id(invoice_test)
test = pd.merge(client_test,agg_test, on='client_id', how='left')

In [10]:
#drop redundant columns
sub_client_id = test['client_id']
drop_columns = ['client_id', 'creation_date']

for col in drop_columns:
    if col in train.columns:
        train.drop([col], axis=1, inplace=True)
    if col in test.columns:
        test.drop([col], axis=1, inplace=True)

In [23]:
test.shape

(58069, 8)

# Modelling

In [11]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC

In [12]:
# Define models
models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": XGBClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

In [13]:
# Dictionary to store metrics for each model
model_metrics = {}

# dictionary to store trained models
trained_models = {}

In [14]:
# Prepare data for modeling
X = train.drop(columns=['target'])
y = train['target']

In [15]:
# perform train test split first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Define k-fold cross-validation
k = 5
stkfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [None]:
# check for imbalance
target_distribution = train['target'].value_counts(normalize=True)
print("Target Variable Distribution:")
print(target_distribution)

In [17]:
import time
from imblearn.over_sampling import SMOTE

In [18]:
# maximum time for training in seconds
max_training_time = 3600

In [None]:
# handle imbalance using SMOTE
#smote = SMOTE(random_state=42)
#X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [19]:
import threading

class TimeoutError(Exception):
    pass

def timeout(seconds=1, error_message="Timeout occurred"):
    def decorator(func):
        def wrapper(*args, **kwargs):
            result = [TimeoutError(error_message)]
            def target():
                result[0] = func(*args, **kwargs)
            thread = threading.Thread(target=target)
            thread.start()
            thread.join(seconds)
            if thread.is_alive():
                thread.join()  # Ensures that the thread is terminated
                raise TimeoutError(error_message)
            return result[0]
        return wrapper
    return decorator

In [21]:
for model_name, model in models.items():
    print(f"Training {model_name}...")
    start_time = time.time()

    try:
        # Perform cross-validation with a timeout
        @timeout(max_training_time)
        def cross_validation_with_timeout():
            return cross_val_score(model, X_train, y_train, cv=stkfold, scoring='accuracy')
        
        scores = cross_validation_with_timeout()

        # calculate the elapsed time for training
        elapsed_time = time.time() - start_time
        print(f"    Elapsed Time: {elapsed_time:.2f} seconds")

    except TimeoutError:
        print(f"    Training {model_name} exceeded the maximum training time of {max_training_time} seconds. Skipping...")
        continue
    
    # Store metrics in dictionary
    model_metrics[model_name] = {
        "Accuracy": scores.mean(),
        "Precision": 0,  # Placeholder for precision
        "Recall": 0,     # Placeholder for recall
        "F1 Score": 0,   # Placeholder for F1 score
        "ROC AUC": 0     # Placeholder for ROC AUC
    }

    print(f"    Average Accuracy: {scores.mean():.4f}")
    print()

    # Fit the model on the entire training data
    model.fit(X_train, y_train)

    # store trained model
    trained_models[model_name] = model

    # Calculate precision, recall, F1 score, and ROC AUC using the entire training data
    y_pred = model.predict(X_test)
    model_metrics[model_name]["Precision"] = precision_score(y_test, y_pred)
    model_metrics[model_name]["Recall"] = recall_score(y_test, y_pred)
    model_metrics[model_name]["F1 Score"] = f1_score(y_test, y_pred)
    model_metrics[model_name]["ROC AUC"] = roc_auc_score(y_test, y_pred)

    print(f"    Precision: {model_metrics[model_name]['Precision']:.4f}")
    print(f"    Recall: {model_metrics[model_name]['Recall']:.4f}")
    print(f"    F1 Score: {model_metrics[model_name]['F1 Score']:.4f}")
    print(f"    ROC AUC: {model_metrics[model_name]['ROC AUC']:.4f}")
    print()


Training Random Forest...
    Elapsed Time: 196.16 seconds
    Average Accuracy: 0.9422

    Precision: 0.2222
    Recall: 0.0141
    F1 Score: 0.0265
    ROC AUC: 0.5055

Training Logistic Regression...
    Elapsed Time: 6.40 seconds
    Average Accuracy: 0.9441

    Precision: 0.2000
    Recall: 0.0019
    F1 Score: 0.0038
    ROC AUC: 0.5007

Training Decision Tree...
    Elapsed Time: 6.39 seconds
    Average Accuracy: 0.8955

    Precision: 0.1180
    Recall: 0.1304
    F1 Score: 0.1239
    ROC AUC: 0.5354

Training Gradient Boosting...
    Elapsed Time: 182.12 seconds
    Average Accuracy: 0.9445

    Precision: 0.1667
    Recall: 0.0006
    F1 Score: 0.0013
    ROC AUC: 0.5002

Training KNN...
    Elapsed Time: 26.28 seconds
    Average Accuracy: 0.9412

    Precision: 0.1268
    Recall: 0.0115
    F1 Score: 0.0211
    ROC AUC: 0.5033

Training Naive Bayes...
    Elapsed Time: 0.52 seconds
    Average Accuracy: 0.9340

    Precision: 0.1842
    Recall: 0.0537
    F1 Score: 0.083

In [None]:
# Categorize models based on performance
categorized_models = {}
for model_name, metrics in model_metrics.items():
    if metrics["Accuracy"] >= 0.8 and metrics["ROC AUC"] >= 0.8:
        categorized_models[model_name] = "Great"
    elif metrics["Accuracy"] >= 0.7 and metrics["ROC AUC"] >= 0.7:
        categorized_models[model_name] = "Good"
    elif metrics["Accuracy"] >= 0.6 and metrics["ROC AUC"] >= 0.6:
        categorized_models[model_name] = "Okay"
    else:
        categorized_models[model_name] = "Nuh"

# Print categorized models
for model_name, category in categorized_models.items():
    print(f"{model_name}: {category}")

In [None]:
# Split the data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Evaluate the models on the test data
test_metrics = {}
for model_name, model in trained_models.items():
    print(f"Evaluating {model_name} on the test data...")
    
    # Make predictions on the test data
    y_pred = model.predict(test)

    # initialize test_metrics[model_name]
    if model_name not in test_metrics:
        test_metrics[model_name] = {}

    # assign predictions to test_metrics[model_name]
    test_metrics[model_name]["predictions"] = y_pred

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    # Store evaluation metrics in test_metrics[model_name]
    test_metrics[model_name]["Accuracy"] = accuracy
    test_metrics[model_name]["Precision"] = precision
    test_metrics[model_name]["Recall"] = recall
    test_metrics[model_name]["F1 Score"] = f1
    test_metrics[model_name]["ROC AUC"] = roc_auc

In [None]:
# Print test metrics
for model_name, metrics in test_metrics.items():
    print(f"\n{model_name} Test Metrics:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value}")

#### Submissions

In [None]:
# Function to generate submission CSV file for a model
def generate_submission_csv(model_name, predictions):
    submission_df = pd.DataFrame({
        'client_id': sub_client_id,
        'target': predictions
    })
    submission_df.to_csv(f'{model_name}_submission.csv', index=False)
    print(f'Submission CSV file generated for {model_name}')


In [None]:
# Generate submission CSV files for models that predicted successfully
for model_name, metrics in test_metrics.items():
    print(f"{model_name}...")
    if 'predictions' in metrics:  # Check if predictions are available
        generate_submission_csv(model_name, metrics['predictions'])
    else:
        print(f"No predictions available for {model_name}. Skipping...")


In [None]:
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_resampled, y_train_resampled)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    roc_auc = roc_auc_score(y_val, y_pred)
    
    # Store metrics in dictionary
    model_metrics[model_name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC AUC": roc_auc
    }

    print(f"    Accuracy: {accuracy:.4f}")
    print(f"    Precision: {precision:.4f}")
    print(f"    Recall: {recall:.4f}")
    print(f"    F1 Score: {f1:.4f}")
    print(f"    ROC AUC: {roc_auc:.4f}")
    print()

In [None]:
# Categorize models based on performance
categorized_models = {}
for model_name, metrics in model_metrics.items():
    if metrics["Accuracy"] >= 0.8 and metrics["ROC AUC"] >= 0.8:
        categorized_models[model_name] = "Great"
    elif metrics["Accuracy"] >= 0.7 and metrics["ROC AUC"] >= 0.7:
        categorized_models[model_name] = "Good"
    elif metrics["Accuracy"] >= 0.6 and metrics["ROC AUC"] >= 0.6:
        categorized_models[model_name] = "Okay"
    else:
        categorized_models[model_name] = "Nuh"

In [None]:
# Print categorized models
for model_name, category in categorized_models.items():
    print(f"{model_name}: {category}")

In [None]:
# Print metrics of categorized models
for model_name, category in categorized_models.items():
    print(f"{model_name}: {category}")
    if category == "Nuh":
        metrics = model_metrics[model_name]
        print(f"    Accuracy: {metrics['Accuracy']:.4f}")
        print(f"    Precision: {metrics['Precision']:.4f}")
        print(f"    Recall: {metrics['Recall']:.4f}")
        print(f"    F1 Score: {metrics['F1 Score']:.4f}")
        print(f"    ROC AUC: {metrics['ROC AUC']:.4f}")
    print()


In [None]:
best_model

In [None]:
test_predict = best_model.predict(test)

In [None]:
submission = pd.DataFrame(
    {
        'client_id': sub_client_id,
        'target': test_predict['target']
    }
)

submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)

## Make Predictions on test set

In [None]:
preds = model.predict(test)
preds = pd.DataFrame(preds, columns=['target'])
preds.head()

In [None]:
submission = pd.DataFrame(
    {
        'client_id': sub_client_id,
        'target': preds['target']
    }
)

submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)