In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Functions

In [2]:
def scoring(y_true, y_pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        Each score is printed on its own line with five decimals.
    """
    # zero_division=0 keeps the reported value at 0.0 when a metric's denominator
    # is empty (e.g. the model never predicts the positive class) but stops
    # scikit-learn from emitting UndefinedMetricWarning into the cell output.
    results = (
        ('accuracy', accuracy_score(y_true, y_pred)),
        ('precision', precision_score(y_true, y_pred, zero_division=0)),
        ('recall', recall_score(y_true, y_pred, zero_division=0)),
        ('f1', f1_score(y_true, y_pred, zero_division=0)),
    )
    for metric_name, value in results:
        print(f'{metric_name}: {value:.5f}')

# Data

In [3]:
# Location of the car-loan training data (a zipped CSV; pandas infers the
# compression from the file extension).
TRAIN_PATH = '/home/dmdp/workspace/datasets/car_loan_default/v1/train.zip'
df = pd.read_csv(TRAIN_PATH, low_memory=False)

In [4]:
# Column dtypes and non-null counts, to see which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121856 entries, 0 to 121855
Data columns (total 40 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   id                          121856 non-null  int64  
 1   client_income               118234 non-null  float64
 2   car_owned                   118275 non-null  float64
 3   bike_owned                  118232 non-null  float64
 4   active_loan                 118221 non-null  float64
 5   house_own                   118195 non-null  float64
 6   child_count                 118218 non-null  float64
 7   credit_amount               118219 non-null  float64
 8   loan_annuity                117030 non-null  float64
 9   accompany_client            120110 non-null  object 
 10  client_income_type          118155 non-null  object 
 11  client_education            118211 non-null  object 
 12  client_marital_status       118383 non-null  object 
 13  client_gender 

In [5]:
# Credit-to-income ratio per row; NaN wherever either input is missing.
df['credit_amount_to_income'] = df['credit_amount'].div(df['client_income'])

In [6]:
# Compare the distribution of the ratio between the two values of `default`.
df.groupby('default')['credit_amount_to_income'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,105408.0,3.972049,2.708251,0.0375,2.018667,3.273971,5.200337,49.2272
1,9300.0,3.912652,2.602159,0.2264,2.128121,3.309714,5.0,28.392857


# Models

## M1: All Rows, Naive Selection of Features

In [7]:
# Logistic regression with default settings; class_weight='balanced' is a
# possible alternative that is not enabled here.
clf_1 = LogisticRegression(random_state=0)

In [8]:
# Number of `default == 0` rows drawn when building the modelling sample.
SAMPLE_SIZE_0 = 30_000

In [9]:
feature_columns = ['client_income', 'age_days', 'employed_days', 'credit_amount_to_income', 'credit_amount']
target_column = 'default'

# Rows with every model feature present; computed once and reused for both classes.
complete_rows = df.dropna(subset=feature_columns)

# Down-sample the `default == 0` rows to SAMPLE_SIZE_0 and keep every
# `default == 1` row. The fixed random_state makes the sample, and therefore
# every score derived from it, reproducible across kernel restarts.
dft = pd.concat([
    complete_rows.query('default == 0').sample(SAMPLE_SIZE_0, random_state=42),
    complete_rows.query('default == 1'),
])[feature_columns + [target_column]]

X = dft[feature_columns]
y = dft[target_column]

In [10]:
# Sanity-check the sampled frame: per-class row counts and income distribution.
dft.groupby('default')['client_income'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
default,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,30000.0,16937.785321,11099.608875,2565.0,11250.0,14850.0,20250.0,675000.0
1,8770.0,16055.474726,8284.006595,2790.0,11250.0,13500.0,20250.0,135000.0


In [11]:
# The model is fitted and evaluated on the same rows, so the figures produced
# here are training-set metrics, not an estimate of out-of-sample performance.
clf_1.fit(X, y)  # fit() returns the estimator itself; no re-binding needed
print(clf_1.classes_)

y_pred = clf_1.predict(X)

# Mean accuracy on the training rows (shown as the cell output).
clf_1.score(X, y)

[0 1]


0.7737941707505803

In [12]:
# Full metric report for M1; precision/recall of 0 would mean the model never
# predicts the positive class correctly.
scoring(y.to_numpy(), y_pred)

accuracy: 0.77379
precision: 0.00000
recall: 0.00000
f1: 0.00000


  _warn_prf(average, modifier, msg_start, len(result))


## M2: All Rows, Strong Features

In [13]:
# Logistic regression with default settings; class_weight='balanced' is a
# possible alternative that is not enabled here.
clf_2 = LogisticRegression(random_state=0)

In [14]:
# Number of `default == 0` rows drawn when building the modelling sample.
SAMPLE_SIZE_0 = 30_000

In [15]:
feature_columns = ['loan_annuity', 'age_days', 'employed_days', 'phone_change', 'registration_days', 'credit_amount']
target_column = 'default'

# Rows with every model feature present; computed once and reused for both classes.
complete_rows = df.dropna(subset=feature_columns)

# Down-sample the `default == 0` rows to SAMPLE_SIZE_0 and keep every
# `default == 1` row. The fixed random_state makes the sample, and therefore
# every score derived from it, reproducible across kernel restarts.
dft = pd.concat([
    complete_rows.query('default == 0').sample(SAMPLE_SIZE_0, random_state=42),
    complete_rows.query('default == 1'),
])[feature_columns + [target_column]]

X = dft[feature_columns]
y = dft[target_column]

In [16]:
# (rows, columns) of the feature matrix.
X.shape

(38141, 6)

In [17]:
# The model is fitted and evaluated on the same rows, so the figures produced
# here are training-set metrics, not an estimate of out-of-sample performance.
clf_2.fit(X, y)  # fit() returns the estimator itself; no re-binding needed
print(clf_2.classes_)

y_pred = clf_2.predict(X)

# Mean accuracy on the training rows (shown as the cell output).
clf_2.score(X, y)

[0 1]


0.7865551506253113

In [18]:
# Full metric report for M2.
scoring(y.to_numpy(), y_pred)

accuracy: 0.78656
precision: 0.00000
recall: 0.00000
f1: 0.00000


  _warn_prf(average, modifier, msg_start, len(result))


## M3: Selected Rows

In [44]:
# Keep clients whose income lies in [20000, 80000] (both ends inclusive) and
# order them by credit-to-income ratio.
income_in_range = df['client_income'].between(20000, 80000)
dft = (
    df[income_in_range]
    .sort_values(by='credit_amount_to_income')
    .copy()
)

In [45]:
# Rows to draw per class: index 0 -> `default == 0`, index 1 -> `default == 1`.
SAMPLE_SIZE_0, SAMPLE_SIZE_1 = 1_000, 1_000

In [46]:
feature_columns = ['loan_annuity', 'age_days', 'employed_days', 'phone_change', 'registration_days', 'credit_amount']
target_column = 'default'

# NOTE(review): the two classes are drawn from disjoint credit_amount_to_income
# ranges (<= 2 for default == 0, > 2 for default == 1), so the label is tied to
# the row-selection rule; read any score on this sample with that in mind.
# NOTE(review): this cell rebinds `dft` from itself and drops the
# credit_amount_to_income column, so it cannot be re-run without first
# re-running the cell that builds `dft` from `df`.
candidates = dft.dropna(subset=feature_columns)
negatives = (
    candidates
    .query('credit_amount_to_income <= 2 and default == 0')
    .sample(SAMPLE_SIZE_0, random_state=42)
)
positives = (
    candidates
    .query('credit_amount_to_income > 2 and default == 1')
    .sample(SAMPLE_SIZE_1, random_state=42)
)
dft = pd.concat([negatives, positives])[feature_columns + [target_column]]

X = dft[feature_columns]
y = dft[target_column]

In [47]:
# Display the sampled frame (features plus target).
dft

Unnamed: 0,loan_annuity,age_days,employed_days,phone_change,registration_days,credit_amount,default
29951,3163.05,17684.0,1609.0,957.0,1665.0,52128.00,0
73472,2074.50,22222.0,365243.0,0.0,8986.0,46800.00,0
93970,5332.95,14545.0,637.0,3274.0,8689.0,67500.00,0
12545,1125.00,14280.0,4459.0,482.0,8204.0,22500.00,0
12327,2446.20,19540.0,2001.0,1991.0,6016.0,50849.55,0
...,...,...,...,...,...,...,...
106506,2534.85,14763.0,713.0,654.0,7355.0,51846.30,1
65802,3317.40,14258.0,820.0,1594.0,8366.0,102429.00,1
115810,4500.00,15825.0,1686.0,551.0,6434.0,90000.00,1
111978,5909.40,15187.0,2537.0,1884.0,297.0,74628.00,1


In [48]:
# Logistic regression with default settings; class_weight='balanced' is a
# possible alternative that is not enabled here.
clf_3 = LogisticRegression(random_state=0)

In [49]:
# The model is fitted and evaluated on the same rows, so the figures produced
# here are training-set metrics, not an estimate of out-of-sample performance.
clf_3.fit(X, y)  # fit() returns the estimator itself; no re-binding needed
print(clf_3.classes_)

y_pred = clf_3.predict(X)

# Mean accuracy on the training rows (shown as the cell output).
clf_3.score(X, y)

[0 1]


0.831

In [50]:
# Full metric report for M3.
scoring(y.to_numpy(), y_pred)

accuracy: 0.83100
precision: 0.83034
recall: 0.83200
f1: 0.83117


In [51]:
# Number of rows in the modelling sample.
len(y)

2000

# Leftovers - can be ignored

In [None]:
# Display the frame currently bound to `df`.
df

Unnamed: 0,client_income,credit_amount,employed_days,default
0,31500,111384,561,0
1,30600,67500,1660,0
2,40500,49752,6065,0
3,38250,67500,675,0
4,36000,71509,3273,0
5,31500,122301,810,0
6,40950,176211,1996,0
7,36000,13500,735,1
8,31500,197107,1624,1
9,36000,27846,3318,0


In [167]:
target_column = 'default'
feature_columns = ['client_income', 'credit_amount', 'employed_days']

# Feature matrix and label vector taken straight from `df` (no sampling).
X, y = df[feature_columns], df[target_column]

In [169]:
# Logistic regression with default settings; class_weight='balanced' is a
# possible alternative that is not enabled here.
clf_4 = LogisticRegression(random_state=0)
clf_4.fit(X, y)

# Report classes and predictions from clf_4 itself. Previously these two steps
# used a different object named `clf`, so the predictions did not come from the
# model that is fitted and scored in this cell.
print(clf_4.classes_)

y_pred = clf_4.predict(X)

# Mean accuracy on the training rows (shown as the cell output).
clf_4.score(X, y)

[0 1]


0.8

In [170]:
# Display the true labels.
y

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    1
8    1
9    0
Name: default, dtype: int64

In [48]:
# Display the predicted labels.
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [49]:
# Probability of the second class (clf_4.classes_[1]) for every row of X.
# Uses clf_4, the model fitted on this X; no visible cell constructs the
# bare `clf` that was referenced here before.
y_proba_1 = clf_4.predict_proba(X)[:, 1]

In [50]:
# Decision threshold: probabilities strictly below it map to class 0,
# everything else to class 1.
threshold = 0.5
y_pred = []
for proba in y_proba_1:
    y_pred.append(0 if proba < threshold else 1)

In [51]:
# accuracy_score is already imported in the notebook's top import cell, so the
# redundant mid-notebook re-import has been dropped.
# Share of rows where the thresholded prediction matches the true label.
acc = accuracy_score(y, y_pred)
print(acc)

0.8


In [52]:
# accuracy_score is already imported in the notebook's top import cell, so the
# redundant mid-notebook re-import has been dropped.
# Repeat the thresholding with a slightly stricter cut-off and re-score.
threshold = 0.55
y_pred = [0 if p < threshold else 1 for p in y_proba_1]
acc = accuracy_score(y, y_pred)
print(acc)

0.8


In [53]:
# Display the true labels.
y

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    1
8    1
9    0
Name: default, dtype: int64

In [54]:
# Display the thresholded predictions.
y_pred

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
from sklearn.metrics import accuracy_score

# Ground-truth labels.
y_true = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
# Predicted labels.
y_pred = [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]

# Fraction of positions where prediction and truth agree.
acc = accuracy_score(y_true, y_pred)

print(acc)

In [409]:
import numpy as np

In [417]:
# Single-feature toy data set: one numeric value per sample and a binary label.
X = [[176], [165], [170], [168], [180], [174], [179], [176], [171]]
y = [1, 0, 1, 0, 1, 0, 1, 0, 0]

# Construct the estimator here: no visible cell creates `clf` before it is
# fitted, so fitting it would fail on a fresh kernel.
clf = LogisticRegression(random_state=0)
clf.fit(X, y)
print(clf.classes_)

y_pred = clf.predict(X)
print(y_pred)

# Mean accuracy on the training rows (shown as the cell output).
clf.score(X, y)

[0 1]
[1 0 0 0 1 0 1 1 0]


0.7777777777777778

In [418]:
# Per-class probabilities, one row per sample, columns ordered as clf.classes_.
y_proba = clf.predict_proba(X)

In [421]:
# Second column only: probability of clf.classes_[1] for each sample.
print(y_proba[:, 1])

[0.61826144 0.05425135 0.20752177 0.12484841 0.84513093 0.46874266
 0.801107   0.61826144 0.26187523]


In [402]:
# Display the full probability matrix (one column per class).
clf.predict_proba(X)

array([[0.35737282, 0.64262718],
       [0.51049244, 0.48950756],
       [0.79129363, 0.20870637],
       [0.46255438, 0.53744562],
       [0.60792128, 0.39207872],
       [0.35054259, 0.64945741],
       [0.3287257 , 0.6712743 ],
       [0.5743567 , 0.4256433 ],
       [0.26116408, 0.73883592],
       [0.69276404, 0.30723596]])

In [None]:
from sklearn.metrics import accuracy_score

# Ground-truth labels.
y_true = [0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]
# Predicted labels.
y_pred = [0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1]

# Fraction of positions where prediction and truth agree.
acc = accuracy_score(y_true, y_pred)

print(acc)

In [455]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data: three numeric features plus the binary `default` label.
# NOTE: this rebinds `df`, replacing whatever frame the name held before.
df = pd.DataFrame([
    [ 31500, 111384,   561, 0],
    [ 30600,  67500,  1660, 0],
    [ 40500,  49752,  6065, 0],
    [ 38250,  67500,   675, 0],
    [ 36000,  71509,  3273, 0],
    [ 31500, 122301,   810, 1],
    [ 40950, 176211,  1996, 1],
    [ 36000,  13500,   735, 1],
    [ 31500, 197107,  1624, 1],
    [ 36000,  27846,  3318, 1]],
    columns=['client_income', 'credit_amount', 'employed_days', 'default'])

# Split the data into features and target.
X = df[['client_income', 'credit_amount', 'employed_days']]
y = df['default']

# Train the model. The estimator is constructed here because no visible cell
# creates `clf` before it is fitted, which would fail on a fresh kernel.
clf = LogisticRegression(random_state=0)
clf.fit(X, y)

# Get the predicted class labels.
y_pred = clf.predict(X)

# Print the predicted labels for inspection.
print(y_pred)

# Evaluate accuracy (on the same rows the model was trained on).
acc = accuracy_score(y, y_pred)
print(acc)

[1 0 0 1 0 1 1 0 1 0]
0.6


# Banknote Dataset

In [52]:
# Banknote authentication data: the file has no header row, so column names
# are supplied explicitly.
BANKNOTE_PATH = '/home/dmdp/workspace/datasets/banknote/data_banknote_authentication.txt'
BANKNOTE_COLUMNS = ['f1', 'f2', 'f3', 'f4', 'is_fake']
df = pd.read_csv(
    BANKNOTE_PATH,
    low_memory=False,
    header=None,
    names=BANKNOTE_COLUMNS,
)

In [53]:
# Display the banknote frame.
df

Unnamed: 0,f1,f2,f3,f4,is_fake
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1367,0.40614,1.34920,-1.4501,-0.55949,1
1368,-1.38870,-4.87730,6.4774,0.34179,1
1369,-3.75030,-13.45860,17.5932,-2.77710,1
1370,-3.56370,-8.38270,12.3930,-1.28230,1


In [58]:
# Logistic regression with default settings; class_weight='balanced' is a
# possible alternative that is not enabled here.
clf_50 = LogisticRegression(random_state=0)

In [59]:
target_column = 'is_fake'
feature_columns = ['f1', 'f2', 'f3', 'f4']

# Narrow the frame to the modelling columns, then split features from label.
dft = df[feature_columns + [target_column]]

X = dft[feature_columns]
y = dft[target_column]

In [60]:
# Mean of the 0/1 label, i.e. the share of rows with is_fake == 1.
y.mean()

0.4446064139941691

In [61]:
# The model is fitted and then used to predict the same rows, so any score
# computed from y_pred is a training-set metric.
clf_50.fit(X, y)  # fit() returns the estimator itself; no re-binding needed
print(clf_50.classes_)

y_pred = clf_50.predict(X)

[0 1]


In [62]:
# Full metric report for the banknote model.
scoring(y, y_pred)

accuracy: 0.98980
precision: 0.98377
recall: 0.99344
f1: 0.98858
