# Fraud Detection in Electricity and Gas Consumption Challenge

## Import Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm
from lightgbm import LGBMClassifier

import warnings
warnings.simplefilter('ignore')

## Read the Data

In [3]:
client_train = pd.read_csv('client_train.csv')
invoice_train = pd.read_csv('invoice_train.csv')

client_test = pd.read_csv('client_test.csv')
invoice_test = pd.read_csv('invoice_test.csv')

In [None]:
#compare size of the various datasets
print(client_train.shape, invoice_train.shape, client_test.shape, invoice_train.shape)

In [None]:
#print top rows of dataset
invoice_train.head()

In [None]:
#print top rows of dataset
client_train.head()

In [None]:
#Get a summary for all numerical columns
invoice_train.describe()

In [None]:
#Get a summary for all numerical columns
client_train.describe()

In [None]:
#Get concise information of each column in dataset
invoice_train.info()

In [None]:
#Get concise information of each column in dataset
client_train.info()

In [None]:
#Getting unique values on the invoice train data
for col in invoice_train.columns:
    print(f"{col} - {invoice_train[col].nunique()}")

In [None]:
#Getting unique values on the invoice train data
for col in client_train.columns:
    print(f"{col} - {client_train[col].nunique()}")

In [None]:
#check for missing values
invoice_train.isnull().sum()

In [None]:
#check for missing values
client_train.isnull().sum()

No missing values in train set

In [None]:
#Visualize fraudulent activities
fraudactivities = client_train.groupby(['target'])['client_id'].count()
plt.bar(x=fraudactivities.index, height=fraudactivities.values, tick_label = [0,1])
plt.title('Fraud - Target Distribution')
plt.show()

Target is highly imbalanced with fewer cases of fraudulent activities

In [None]:
#Visualize client distribution across districts and regions
for col in ['disrict','region']:
    region = client_train.groupby([col])['client_id'].count()
    plt.bar(x=region.index, height=region.values)
    plt.title(col+' distribution')
    plt.show()

## Feature Engineering

In [4]:
#convert the column invoice_date to date time format on both the invoice train and invoice test
for df in [invoice_train,invoice_test]:
    df['invoice_date'] = pd.to_datetime(df['invoice_date'])

In [5]:
#encode labels in categorical column
d={"ELEC":0,"GAZ":1}
invoice_train['counter_type']=invoice_train['counter_type'].map(d)
invoice_test['counter_type']=invoice_test['counter_type'].map(d)

In [6]:
#convert categorical columns to int for model
client_train['client_catg'] = client_train['client_catg'].astype(int)
client_train['disrict'] = client_train['disrict'].astype(int)

client_test['client_catg'] = client_test['client_catg'].astype(int)
client_test['disrict'] = client_test['disrict'].astype(int)

In [7]:
def aggregate_by_client_id(invoice_data):
    aggs = {}
    aggs['consommation_level_1'] = ['mean']
    aggs['consommation_level_2'] = ['mean']
    aggs['consommation_level_3'] = ['mean']
    aggs['consommation_level_4'] = ['mean']

    agg_trans = invoice_data.groupby(['client_id']).agg(aggs)
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)

    df = (invoice_data.groupby('client_id')
            .size()
            .reset_index(name='{}transactions_count'.format('1')))
    return pd.merge(df, agg_trans, on='client_id', how='left')

In [8]:
#group invoice data by client_id
agg_train = aggregate_by_client_id(invoice_train)

In [9]:
#merge aggregate data with client dataset
train = pd.merge(client_train,agg_train, on='client_id', how='left')

In [10]:
#aggregate test set
agg_test = aggregate_by_client_id(invoice_test)
test = pd.merge(client_test,agg_test, on='client_id', how='left')

In [12]:
#drop redundant columns
sub_client_id = test['client_id']
drop_columns = ['client_id', 'creation_date']

for col in drop_columns:
    if col in train.columns:
        train.drop([col], axis=1, inplace=True)
    if col in test.columns:
        test.drop([col], axis=1, inplace=True)

# Modelling

## Train LGBM *Classifier*

In [17]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [18]:
# Prepare data for modeling
X = train.drop(columns=['target'])
y = train['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Train LightGBM model
#model = lgb.LGBMClassifier(boosting_type='gbdt', num_iterations=500)
#model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, verbose=False)

In [None]:
# Evaluate the model
#y_pred = model.predict(X_val)
#accuracy = accuracy_score(y_val, y_pred)
#print("Validation Accuracy:", accuracy)

In [20]:
# Create LightGBM datasets for training and validation
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [23]:
# Define LightGBM parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',  # Use binary_error for classification tasks
    'num_iterations': 500,
    'early_stopping_rounds': 50,  # Early stopping rounds
    'verbose': 10  # Print evaluation results every 10 rounds
}

In [24]:
# Train LightGBM model
model = lgb.train(params,
                  train_data,
                  valid_sets=[train_data, val_data])

[LightGBM] [Info] Number of positive: 6002, number of negative: 102392
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848559
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.375586
[LightGBM] [Debug] init for col-wise cost 0.005684 seconds, init for row-wise cost 0.015652 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008963 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1255
[LightGBM] [Info] Number of data points in the train set: 108394, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055372 -> initscore=-2.836716
[LightGBM] [Info] Start training from score -2.836716
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
Training until validation scores don't improve for 50 rounds
[LightGBM] [

In [25]:
# Predict on validation set
y_pred = model.predict(X_val, num_iteration=model.best_iteration)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]  # Convert probabilities to binary predictions

In [27]:
# Evaluate the model
accuracy = accuracy_score(y_val, y_pred_binary)
print("Validation Accuracy:", accuracy*100)

Validation Accuracy: 94.2285693199011


## Make Predictions on test set

In [28]:
preds = model.predict(test)
preds = pd.DataFrame(preds, columns=['target'])
preds.head()

Unnamed: 0,target
0,0.055098
1,0.067589
2,0.055671
3,0.054133
4,0.055802


In [29]:
submission = pd.DataFrame(
    {
        'client_id': sub_client_id,
        'target': preds['target']
    }
)

submission.head()

Unnamed: 0,client_id,target
0,test_Client_0,0.055098
1,test_Client_1,0.067589
2,test_Client_10,0.055671
3,test_Client_100,0.054133
4,test_Client_1000,0.055802


In [30]:
submission.to_csv('submission.csv', index=False)