Импорт библиотек

In [91]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score
from sklearn.impute import SimpleImputer

Считывание датасета

In [92]:
# Load the semicolon-separated cardio dataset, using the `id` column as the index,
# and preview the first rows (head() defaults to 5).
df = pd.read_csv('cardio.csv', sep=';', index_col='id')
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [93]:
# Print column dtypes, non-null counts and memory usage to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


In [94]:
# Summary statistics per column; the min/max rows are a quick way to spot
# implausible values (e.g. negative or extremely large ap_hi / ap_lo readings).
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


Разделение на обучающую и тестовую выборки

In [95]:
# Hold out 40% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['cardio']),
    df['cardio'],
    test_size=0.4,
    random_state=42,
)

# Persist each split together with its index.
# `index` in to_csv is a boolean flag (write the index or not); the header for the
# index column is set with `index_label`. Passing index='id' only worked because a
# non-empty string is truthy.
X_test.to_csv("X_test.csv", index_label='id')
y_test.to_csv("y_test.csv", index_label='id')
X_train.to_csv("X_train.csv", index_label='id')
y_train.to_csv("y_train.csv", index_label='id')

Делаем пайплайн

In [96]:
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects a single column from its input.

    Parameters
    ----------
    key : hashable
        Label of the column to select; it is used as ``X[[key]]`` in ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the configured column.

        Indexing with a one-element list (rather than a bare label) keeps the
        result two-dimensional when ``X`` is a pandas DataFrame.
        """
        selected_columns = [self.key]
        return X[selected_columns]

In [97]:
# Name of the label column.
target = 'cardio'

# Features that are standardized (StandardScaler) before reaching the classifier.
columns_to_scale = [
    'age',
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
]

# Features that are passed to the classifier as-is, without scaling.
columns_a = [
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
]

In [98]:
# One sub-pipeline per feature: scaled columns get selector + StandardScaler,
# the remaining columns get a selector only. Order matches the original lists:
# all of columns_to_scale first, then columns_a.
scaled_branches = [
    (
        name,
        Pipeline([
            ('selector', NumberSelector(key=name)),
            ('scaler', StandardScaler()),
        ]),
    )
    for name in columns_to_scale
]
passthrough_branches = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in columns_a
]

feature_union_list = scaled_branches + passthrough_branches
# FeatureUnion concatenates the outputs of all per-column branches side by side.
feats = FeatureUnion(feature_union_list)

In [99]:
%%time
pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state=42))
])
pipeline.fit(X_train, y_train)

CPU times: user 3.05 s, sys: 0 ns, total: 3.05 s
Wall time: 3.06 s


Сохраняем модель

In [100]:
# Serialize the fitted pipeline to disk with dill (binary mode is required).
# NOTE: only load .dill files from trusted sources; unpickling can execute code.
with open('gb_pipeline.dill', 'wb') as model_file:
    dill.dump(pipeline, model_file)

Проверяем модель на тестовой выборке

In [101]:
# predict_proba returns one column per class; column index 1 is kept as the score.
class_probabilities = pipeline.predict_proba(X_test)
preds = class_probabilities[:, 1]

# NOTE(review): pred_df gets a fresh 0..n-1 index, not the `id` index of X_test.
pred_df = pd.DataFrame({'preds': preds})
# Saving is currently disabled: pred_df.to_csv("test_predictions.csv", index=None)

In [102]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 for every point of the curve. Where precision + recall == 0 the plain formula
# gives 0/0 = NaN, and np.argmax would then report the NaN position as the "best"
# one, so those points are assigned an F-score of 0 instead.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall contain one more element than thresholds: the final point
# (precision=1, recall=0) has no associated threshold. Search only among points
# that do, so thresholds[ix] can never go out of range.
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.39124626988116257, F-Score=0.747, Precision=0.701, Recall=0.799
