In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm
from lightgbm import LGBMClassifier

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

import warnings
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
print('Finished ...')

Finished ...


In [2]:
client_train=pd.read_csv('client_train.csv')
client_test=pd.read_csv('client_test.csv')
invoice_train=pd.read_csv('invoice_train.csv')
invoice_test=pd.read_csv('invoice_test.csv')
print('Finished data read...')

Finished data read...


In [3]:
def extract_date_components_client(df, date_column):
    # Convert the date string column to pandas datetime objects with specified formats
    df[date_column] = pd.to_datetime(df[date_column], format="%d/%m/%Y" )
   
    # Modify the 'date_column' to contain the extracted year, month, and day components
    df['creation_year'] = df[date_column].dt.year
    df['creation_month'] = df[date_column].dt.month
    df['creation_day'] = df[date_column].dt.day

    # Drop the original date string column if desired
    df.drop(date_column, axis=1, inplace=True)

    return df

In [4]:
client_train = extract_date_components_client(client_train, 'creation_date')
#train = extract_date_components_invoice(train, 'invoice_date')


client_test = extract_date_components_client(client_test, 'creation_date')
#test = extract_date_components_invoice(test, 'invoice_date')

In [5]:
#convert the column invoice_date to date time format on both the invoice train and invoice test
for df in [invoice_train,invoice_test]:
    df['invoice_date'] = pd.to_datetime(df['invoice_date'])

In [6]:
#encode labels in categorical column
d={"ELEC":0,"GAZ":1}
invoice_train['counter_type']=invoice_train['counter_type'].map(d)
invoice_test['counter_type']=invoice_test['counter_type'].map(d)

In [7]:
#convert categorical columns to int for model
client_train['client_catg'] = client_train['client_catg'].astype(int)
client_train['disrict'] = client_train['disrict'].astype(int)

client_test['client_catg'] = client_test['client_catg'].astype(int)
client_test['disrict'] = client_test['disrict'].astype(int)


In [8]:
def aggregate_by_client_id(invoice_data):
    aggs = {}
    aggs['consommation_level_1'] = ['mean', 'sum', 'min', 'max']
    aggs['consommation_level_2'] = ['mean', 'sum', 'min', 'max']
    aggs['consommation_level_3'] = ['mean', 'sum', 'min', 'max']
    aggs['consommation_level_4'] = ['mean', 'sum', 'min', 'max']
    aggs['reading_remarque'] = ['mean', 'min', 'max']
    aggs['counter_code'] = ['mean', 'min', 'max']
    aggs['old_index'] = ['min','max']
    aggs['new_index'] = ['min','max']



    agg_trans = invoice_data.groupby(['client_id']).agg(aggs)
    agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)

    df = (invoice_data.groupby('client_id')
            .size()
            .reset_index(name='{}transactions_count'.format('1')))
    return pd.merge(df, agg_trans, on='client_id', how='left')


In [9]:
#group invoice data by client_id
agg_train = aggregate_by_client_id(invoice_train)

In [10]:
#merge aggregate data with client dataset
train = pd.merge(client_train,agg_train, on='client_id', how='left')

In [11]:
#aggregate test set
agg_test = aggregate_by_client_id(invoice_test)
test = pd.merge(client_test,agg_test, on='client_id', how='left')

In [12]:
def calculate_diff_months(df, date_column='invoice_date', id_column='client_id'):
    # Convert the date string column to pandas datetime objects
    df[date_column] = pd.to_datetime(df[date_column])

    # Group by 'id_column' and calculate the difference between the maximum and minimum dates for each group
    date_diff_per_id = df.groupby(id_column)[date_column].agg(lambda x: x.max() - x.min())

    # Convert the difference to the number of months
    date_diff_per_id = date_diff_per_id.apply(lambda x: x.days)  # Convert to numeric days difference
    date_diff_per_id = date_diff_per_id / 30  # Divide by 30 to get the number of months

    # Add the new column "diff_months" to the DataFrame
    df["diff_months"] = df[id_column].map(date_diff_per_id)

    return df
invoice_train = calculate_diff_months(invoice_train, date_column='invoice_date', id_column='client_id')

columns_to_drop=['invoice_date','counter_statue','reading_remarque','counter_code','counter_coefficient','old_index','new_index']
invoice_train.drop(columns=columns_to_drop, inplace=True)
#train['index_difference'] = train['new_index_mean'] - train['old_index_mean']
invoice_train = invoice_train.drop_duplicates(subset='client_id', keep='first')
train = pd.merge(train,invoice_train, on='client_id', how='left')

In [13]:
# Assuming you have the functions and DataFrame 'invoice_train' defined previously

# Step 1: Calculate the diff_months and drop unnecessary columns in 'invoice_train'
invoice_test = calculate_diff_months(invoice_test, date_column='invoice_date', id_column='client_id')

columns_to_drop = ['invoice_date', 'counter_statue', 'reading_remarque', 'counter_code', 'counter_coefficient',
                   'old_index', 'new_index',]
invoice_test.drop(columns=columns_to_drop, inplace=True)

#test['index_difference'] = test['new_index_mean'] - test['old_index_mean']

# Step 3: Drop duplicates in 'invoice_train' based on 'client_id'
invoice_test = invoice_test.drop_duplicates(subset='client_id', keep='first')



In [14]:
# Step 4: Merge 'invoice_train' with 'train' on 'client_id'
test = pd.merge(test, invoice_test, on='client_id', how='left')

In [15]:
train['totale_consommation']=train['consommation_level_1']+train['consommation_level_2']+train['consommation_level_3']+train['consommation_level_4']
test['totale_consommation']=test['consommation_level_1']+test['consommation_level_2']+test['consommation_level_3']+test['consommation_level_4']
train = pd.get_dummies(train, columns=['disrict'])
test = pd.get_dummies(test, columns=['disrict'])

In [16]:
#drop redundant columns
sub_client_id = test['client_id']
drop_columns = ['client_id', 'creation_date']

for col in drop_columns:
    if col in train.columns:
        train.drop([col], axis=1, inplace=True)
    if col in test.columns:
        test.drop([col], axis=1, inplace=True)

MODELLING

In [17]:
X = train.drop(columns=['target'])
y = train['target']

In [18]:
# split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Create LightGBM dataset for training
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

In [20]:
# Count the number of samples for each target value (0 and 1)
counts = y_train.value_counts()

# Calculate the weight for each target value
class_weights = {0: 1.0, 1: counts[0] / counts[1]}

# Create LightGBM dataset
#train_data = lgb.Dataset(x_train, label=y_train, weight=y_train.map(class_weights))

# Define LightGBM parameters
lgb_params = {
    'objective': 'binary',  # Regression task
    'boosting_type': 'gbdt',
    'metric': 'binary_error',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 7,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 1000,
    'random_state': 40,
    'early_stopping_rounds': 50,
    'verbose_eval': 10
}

In [21]:
# Train LightGBM model with weighted samples
model = lgb.train(lgb_params, 
                  train_data, 
                  valid_sets=[train_data, val_data]
                 )

[LightGBM] [Info] Number of positive: 6002, number of negative: 102392
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053348 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7561
[LightGBM] [Info] Number of data points in the train set: 108394, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055372 -> initscore=-2.836716
[LightGBM] [Info] Start training from score -2.836716
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's binary_error: 0.0553721	valid_1's binary_error: 0.0577143


In [22]:
# Predict on the validation set
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
y_pred_val_binary = [1 if pred > 0.5 else 0 for pred in y_pred_val]

In [23]:
# Calculate accuracy on the validation set
accuracy_val = accuracy_score(y_val, y_pred_val_binary)
print("Validation Accuracy:", accuracy_val*100)

Validation Accuracy: 94.2285693199011


In [24]:
preds = model.predict(test)
preds = pd.DataFrame(preds, columns=['target'])
preds.head()

Unnamed: 0,target
0,0.055135
1,0.055832
2,0.055928
3,0.054969
4,0.055928


In [25]:
submission = pd.DataFrame(
    {
        'client_id': sub_client_id,
        'target': preds['target']
    }
)

submission.head()

Unnamed: 0,client_id,target
0,test_Client_0,0.055135
1,test_Client_1,0.055832
2,test_Client_10,0.055928
3,test_Client_100,0.054969
4,test_Client_1000,0.055928


In [26]:
submission.to_csv('submission_fraud-3.csv',index=False)