In [1]:
# Importing necessary libraries and modules

import importlib
import subprocess
import sys

# pip distribution names for every third-party package imported below.
# All of them are listed so that a missing seaborn/missingno/joblib/... can be
# recovered, not only numpy, pandas and scikit-learn.
REQUIRED_PACKAGES = [
    "numpy",
    "pandas",
    "scikit-learn",
    "missingno",
    "seaborn",
    "matplotlib",
    "memory_profiler",
    "joblib",
]

# At most two passes: the first tries the imports as they are; if one fails,
# the dependencies are installed and the imports are attempted once more so
# that every name is actually bound before the following cells run.
for import_attempt in (1, 2):
    try:

        import os
        import pandas as pd
        import numpy as np
        import missingno as msno
        import seaborn as sns
        import matplotlib.pyplot as plt
        import re

        import time
        from memory_profiler import profile

        # object serialization
        import joblib

        # split data - avoid data leakage
        from sklearn.model_selection import train_test_split, cross_val_score

        # cross validation, hyperparameter tuning
        from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

        # preprocessing: scaling, encoding
        from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
        from sklearn.compose import ColumnTransformer

        # Everything imported cleanly; no (further) installation needed.
        break

    except ImportError as error:
        if import_attempt == 2:
            # Installing did not resolve the failure: surface the real error
            # instead of continuing with half-defined names.
            raise

        print(f"Installation of the required dependencies necessary! {error}")

        # Run pip through the kernel's own interpreter so the packages land in
        # the environment this notebook executes in, not whichever `pip`
        # happens to be first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", *REQUIRED_PACKAGES])

        # Make the import system notice the freshly installed packages.
        importlib.invalidate_caches()

        print("Successful installation of the required dependencies necessary")


import warnings
# Blanket filter: every warning category raised after this point is silenced
# for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported above — confirm that is intended.
warnings.filterwarnings('ignore')


# custom imports
from dataloader import (
                load_data,
                )



from feature_engineering import (
                rename_columns,
                cast_schema_types,
                column_mapper,
                counter_statue_mapper,
                create_new_feautures,
                drop_column,
                #tariff_type_mapper,
                )

from dataprocessor import (
                read_combine_dataframe,
                data_wrangling,
                )


# custom import: AWS data-lake session configuration


from storage import (
    configure_aws_session,
    )




In [2]:
# 
# Resolve the raw-data directory relative to the kernel's working directory.

# A notebook kernel does not define `__file__`, so the path is anchored on the
# current working directory, stated explicitly here.
# NOTE(review): assumes the kernel is started from the notebook's own folder so
# that '../data/raw/' points at the project's raw data — confirm.
current_dir = os.getcwd()

# normpath collapses the '..' segment and drops the trailing separator.
root_path = os.path.normpath(os.path.join(current_dir, '../data/raw/'))

In [3]:
# Build the working frame from the raw-data directory using the project's
# read_combine_dataframe helper.
data = read_combine_dataframe(root_path)

# Preview the first five rows.
data.head(n=5)


Unnamed: 0,disrict,client_id,client_catg,region,creation_date,target,invoice_date,tarif_type,counter_number,counter_statue,...,reading_remarque,counter_coefficient,consommation_level_1,consommation_level_2,consommation_level_3,consommation_level_4,old_index,new_index,months_number,counter_type
0,60,train_Client_0,11,101,31/12/1994,0.0,2014-03-24,11,1335667,0,...,8,1,82,0,0,0,14302,14384,4,ELEC
1,60,train_Client_0,11,101,31/12/1994,0.0,2013-03-29,11,1335667,0,...,6,1,1200,184,0,0,12294,13678,4,ELEC
2,60,train_Client_0,11,101,31/12/1994,0.0,2015-03-23,11,1335667,0,...,8,1,123,0,0,0,14624,14747,4,ELEC
3,60,train_Client_0,11,101,31/12/1994,0.0,2015-07-13,11,1335667,0,...,8,1,102,0,0,0,14747,14849,4,ELEC
4,60,train_Client_0,11,101,31/12/1994,0.0,2016-11-17,11,1335667,0,...,9,1,572,0,0,0,15066,15638,12,ELEC


In [4]:
# Data wrangling: pass the combined frame through the project's data_wrangling
# pipeline and keep the result under the same name.
# NOTE(review): because `data` is overwritten, re-running this cell feeds the
# already-wrangled frame back into data_wrangling — presumably unintended;
# re-run the loading cell first.
data = data_wrangling(data)

# Preview the first five rows of the wrangled frame.
data.head(n=5)


done
numbers of rows : 4476749
done
done
done
done
done
done
Assumed loss per fraudulent client: 26434.047052603753
done
done


Unnamed: 0,disrict,client_id,client_category,region,creation_date,target,invoice_date,tarif_type,counter_number,counter_statue,...,old_index,new_index,months_number,counter_type,member_years,index_change,index_change_per_month,fraudulent_transactions,total_client_transactions,amount_lost_per_client
0,60,train_Client_0,11,101,1994,0,2014,11,1335667,0.0,...,14302,14384,4,ELEC,20,82,20.5,0,35,0.0
1,60,train_Client_0,11,101,1994,0,2013,11,1335667,0.0,...,12294,13678,4,ELEC,19,1384,346.0,0,35,0.0
2,60,train_Client_0,11,101,1994,0,2015,11,1335667,0.0,...,14624,14747,4,ELEC,21,123,30.75,0,35,0.0
3,60,train_Client_0,11,101,1994,0,2015,11,1335667,0.0,...,14747,14849,4,ELEC,21,102,25.5,0,35,0.0
4,60,train_Client_0,11,101,1994,0,2016,11,1335667,0.0,...,15066,15638,12,ELEC,22,572,47.666667,0,35,0.0


In [5]:
#data.amount_lost_per_client.unique()

In [20]:
# Ten clients with the largest summed amount_lost_per_client.

# Add up the loss column over all rows belonging to each client.
# NOTE(review): the column name suggests a per-client figure that may be
# repeated on every transaction row; if so, this sum scales with the number
# of rows per client — confirm against data_wrangling.
fraud_clients = data.groupby(by='client_id')['amount_lost_per_client'].agg('sum')

# Rank clients from largest to smallest total, then keep the first ten.
ranked_fraud_clients = fraud_clients.sort_values(ascending=False)
top_fraudulent_clients = ranked_fraud_clients.iloc[:10]
top_fraudulent_clients


client_id
train_Client_2726      662728.580515
train_Client_16431     653018.402892
train_Client_30458     653018.402892
train_Client_43015     638587.502383
train_Client_19164     614894.310673
train_Client_115340    596262.235529
train_Client_90935     559858.027170
train_Client_111177    546502.054074
train_Client_16309     520273.825217
train_Client_89259     494690.552808
Name: amount_lost_per_client, dtype: float64

In [21]:
# Combined loss of the ten clients selected above; Python's built-in sum
# iterates over the Series values.
sum(top_fraudulent_clients)



5939833.894154316

In [25]:
# Overall loss figure, hard-coded (its source is not recorded in this notebook).
total_money_lost = 200_000_000


0.9703008305292284

In [26]:


# Overall loss figure, hard-coded (repeated here so the cell runs on its own).
total_money_lost = 200_000_000

# Loss attributed to the ten highest-loss clients.
top_10_loss = sum(top_fraudulent_clients)

# NOTE(review): despite its name, this is the percentage of total_money_lost
# that is NOT accounted for by the top-10 clients; the top-10 share itself
# would be top_10_loss / total_money_lost * 100 — confirm which was intended.
top_10_fraud_clients = ((total_money_lost - top_10_loss) / total_money_lost) * 100
top_10_fraud_clients

97.03008305292285