In [1]:
import pandas as pd
import pathlib
from typing import List
from pathlib import Path
from pandas import DataFrame

In [2]:
data_source = Path('data').joinpath('data.csv')
data_source

PosixPath('data/data.csv')

In [3]:
df = pd.read_csv(data_source)
df.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,客户类型,国际,是否私行客户,date
0,306.0,414.0,388.0,机构客户,中国,1.0,2022-10-14 22:07:22
1,309.0,145.0,505.0,机构客户,俄罗斯,0.0,2021-12-07 15:18:45
2,840.0,929.0,797.0,机构客户,法国,0.0,2021-06-03 11:22:03
3,908.0,502.0,194.0,私人客户,德国,1.0,2023-10-12 15:46:56
4,319.0,654.0,,私人客户,巴西,0.0,2021-03-24 06:43:11


In [4]:
num_cols = ['numeric_1','numeric_2','numeric_3']
cate_cols = ['客户类型','国际']
class_col = ['是否私行客户']

## Data Process

### Dummy variables

In [5]:
def process_dummies(df:DataFrame,dummy_cols:List) -> DataFrame:
    """Convert categorical variable into dummies"""
    return pd.get_dummies(df,columns=dummy_cols)

In [6]:
df_dummies = process_dummies(df,cate_cols)
df_dummies.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,是否私行客户,date,客户类型_机构客户,客户类型_私人客户,国际_中国,国际_俄罗斯,国际_加拿大,国际_印度,国际_巴西,国际_德国,国际_日本,国际_法国,国际_澳大利亚,国际_美国
0,306.0,414.0,388.0,1.0,2022-10-14 22:07:22,1,0,1,0,0,0,0,0,0,0,0,0
1,309.0,145.0,505.0,0.0,2021-12-07 15:18:45,1,0,0,1,0,0,0,0,0,0,0,0
2,840.0,929.0,797.0,0.0,2021-06-03 11:22:03,1,0,0,0,0,0,0,0,0,1,0,0
3,908.0,502.0,194.0,1.0,2023-10-12 15:46:56,0,1,0,0,0,0,0,1,0,0,0,0
4,319.0,654.0,,0.0,2021-03-24 06:43:11,0,1,0,0,0,0,1,0,0,0,0,0


### NaN values

In [7]:
def null_filler(series: pd.Series, option: str, quantile_val:float = None) -> pd.Series :
    if option == 'mean':
        return series.mean()
    
    if option == 'median':
        return series.median()
    
    if option == 'mode':
        return series.mode()
    
    if option == 'quantile':
        return series.quantile(quantile_val)
    
    
    return 0
    

def fill_null(df:DataFrame, cols:List, options:str | List[str], quantile_val:float=None) -> DataFrame:
    """Fill dataset with differnet options
    options can be median,mean,mode, quantile_1, quantile_2, quantile_3, quntile_4
    """
    _df = df.copy()
    if type(options) == str:
        options = [options] * len(cols)
        
    for col,option in zip(cols,options):
        _val = null_filler(df[col],option, quantile_val)
        _df[col] = _df[col].fillna(_val)
        
    return _df

In [8]:
df_fillby_qcut_25 = fill_null(df.copy(),['numeric_1','numeric_2'],'quantile',0.25)
df_fillby_qcut_25.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,客户类型,国际,是否私行客户,date
0,306.0,414.0,388.0,机构客户,中国,1.0,2022-10-14 22:07:22
1,309.0,145.0,505.0,机构客户,俄罗斯,0.0,2021-12-07 15:18:45
2,840.0,929.0,797.0,机构客户,法国,0.0,2021-06-03 11:22:03
3,908.0,502.0,194.0,私人客户,德国,1.0,2023-10-12 15:46:56
4,319.0,654.0,,私人客户,巴西,0.0,2021-03-24 06:43:11


In [9]:
df_fillby_mean = fill_null(df.copy(),['numeric_1','numeric_2'],'mean')
df_fillby_mean.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,客户类型,国际,是否私行客户,date
0,306.0,414.0,388.0,机构客户,中国,1.0,2022-10-14 22:07:22
1,309.0,145.0,505.0,机构客户,俄罗斯,0.0,2021-12-07 15:18:45
2,840.0,929.0,797.0,机构客户,法国,0.0,2021-06-03 11:22:03
3,908.0,502.0,194.0,私人客户,德国,1.0,2023-10-12 15:46:56
4,319.0,654.0,,私人客户,巴西,0.0,2021-03-24 06:43:11


### Add quntile info

In [10]:
def add_quantiles(df:DataFrame, numeric_cols:List, num_qutiles:int) -> DataFrame:
    """Add quntile info for selecte numeric columns"""
    _df = df.copy()
    for col in numeric_cols:
        _df[f'q_{col}'] = pd.qcut(_df[col],num_qutiles,labels=False)
    return _df

In [11]:
add_quantiles(df,['numeric_1','numeric_2'],10).head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,客户类型,国际,是否私行客户,date,q_numeric_1,q_numeric_2
0,306.0,414.0,388.0,机构客户,中国,1.0,2022-10-14 22:07:22,2.0,3.0
1,309.0,145.0,505.0,机构客户,俄罗斯,0.0,2021-12-07 15:18:45,2.0,0.0
2,840.0,929.0,797.0,机构客户,法国,0.0,2021-06-03 11:22:03,7.0,8.0
3,908.0,502.0,194.0,私人客户,德国,1.0,2023-10-12 15:46:56,8.0,4.0
4,319.0,654.0,,私人客户,巴西,0.0,2021-03-24 06:43:11,2.0,5.0


## Data Analysis

In [12]:
df.describe()

Unnamed: 0,numeric_1,numeric_2,numeric_3,是否私行客户
count,953.0,953.0,946.0,948.0
mean,763.131165,804.160546,783.359408,0.482068
std,1167.484226,1284.926776,1213.948481,0.499942
min,101.0,100.0,101.0,0.0
25%,333.0,343.0,344.25,0.0
50%,572.0,568.0,591.5,0.0
75%,795.0,824.0,817.5,1.0
max,9860.0,9950.0,9940.0,1.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   numeric_1  953 non-null    float64
 1   numeric_2  953 non-null    float64
 2   numeric_3  946 non-null    float64
 3   客户类型       959 non-null    object 
 4   国际         956 non-null    object 
 5   是否私行客户     948 non-null    float64
 6   date       963 non-null    object 
dtypes: float64(4), object(3)
memory usage: 54.8+ KB


## Compose Dataset

In [13]:
import numpy as np
from sklearn.model_selection import train_test_split

In [14]:
data_df = process_dummies(df,cate_cols)
data_df = fill_null(data_df,num_cols,'quantile',0.25)
data_df = add_quantiles(data_df, num_cols, 5)
data_df.head()

Unnamed: 0,numeric_1,numeric_2,numeric_3,是否私行客户,date,客户类型_机构客户,客户类型_私人客户,国际_中国,国际_俄罗斯,国际_加拿大,国际_印度,国际_巴西,国际_德国,国际_日本,国际_法国,国际_澳大利亚,国际_美国,q_numeric_1,q_numeric_2,q_numeric_3
0,306.0,414.0,388.0,1.0,2022-10-14 22:07:22,1,0,1,0,0,0,0,0,0,0,0,0,1,1,1
1,309.0,145.0,505.0,0.0,2021-12-07 15:18:45,1,0,0,1,0,0,0,0,0,0,0,0,1,0,2
2,840.0,929.0,797.0,0.0,2021-06-03 11:22:03,1,0,0,0,0,0,0,0,0,1,0,0,4,4,3
3,908.0,502.0,194.0,1.0,2023-10-12 15:46:56,0,1,0,0,0,0,0,1,0,0,0,0,4,2,0
4,319.0,654.0,344.25,0.0,2021-03-24 06:43:11,0,1,0,0,0,0,1,0,0,0,0,0,1,3,1


In [15]:
target_col = '是否私行客户' # Class column
feature_cols = data_df.columns.to_list() # Features
feature_cols.remove(target_col)
feature_cols.remove('date')

In [16]:
data_df.corrwith(df[target_col]) # Correlation coefficient for eacth feature column

  data_df.corrwith(df[target_col]) # Correlation coefficient for eacth feature column


numeric_1     -0.024873
numeric_2      0.019487
numeric_3      0.052319
是否私行客户         1.000000
客户类型_机构客户      0.040671
客户类型_私人客户     -0.045570
国际_中国          0.006608
国际_俄罗斯        -0.010549
国际_加拿大        -0.032766
国际_印度         -0.040126
国际_巴西         -0.048240
国际_德国          0.043183
国际_日本          0.040160
国际_法国         -0.027720
国际_澳大利亚        0.040579
国际_美国          0.022347
q_numeric_1    0.025893
q_numeric_2    0.023188
q_numeric_3   -0.009190
dtype: float64

## Build models

In [17]:
data_df = data_df.dropna()
X_train, X_test, y_train, y_test = train_test_split(data_df[feature_cols], data_df[class_col], test_size=0.33, random_state=42)

### Normalization

In [18]:
from sklearn.preprocessing import StandardScaler

ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)

ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

### Train models

In [19]:
from sklearn.linear_model import LogisticRegression
reg_log = LogisticRegression()
reg_log.fit(X_train, y_train)
predictions = reg_log.predict(X_test)

  y = column_or_1d(y, warn=True)


In [20]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Support Vector Machines
from sklearn.svm import LinearSVC
models['Support Vector Machines'] = LinearSVC()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy, precision, recall, f1, TN, FP, FN, TP = {}, {}, {}, {}, {}, {}, {}, {}

for key in models.keys():
    
    # Fit the classifier
    models[key].fit(X_train, y_train)
    
    # Make predictions
    predictions = models[key].predict(X_test)
    
    # Calculate metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)
    f1[key] = f1_score(predictions,y_test)
    TN[key], FP[key], FN[key], TP[key] = confusion_matrix(y_test, predictions).ravel()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  models[key].fit(X_train, y_train)
  y = column_or_1d(y, warn=True)


In [22]:
import pandas as pd

df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall','F1'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()
df_model['F1'] = f1.values()
df_model['TN'] = TN.values()
df_model['FP'] = FP.values()
df_model['FN'] = FN.values()
df_model['TP'] = TP.values()


df_model

Unnamed: 0,Accuracy,Precision,Recall,F1,TN,FP,FN,TP
Logistic Regression,0.546358,0.510791,0.507143,0.508961,94,69,68,71
Support Vector Machines,0.549669,0.52518,0.51049,0.51773,93,70,66,73
Decision Trees,0.509934,0.489209,0.468966,0.478873,86,77,71,68
Random Forest,0.536424,0.503597,0.496454,0.5,92,71,69,70
Naive Bayes,0.549669,0.453237,0.512195,0.480916,103,60,76,63
