In [151]:
from __future__ import division, print_function
import warnings
warnings.filterwarnings('ignore')
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LassoCV, RidgeCV, Lasso, Ridge,  LogisticRegression, LogisticRegressionCV
from sklearn.metrics import mean_absolute_error, mean_squared_error,  f1_score, accuracy_score
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV 
import seaborn as sns
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
import xgboost as xgb

In [152]:
# dataset from https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients#
data = pd.read_csv("data/default of credit card clients.csv",  sep=';')
data.columns = list(data.iloc[0])
data = data.drop(0)

Джентельменский набор для кросс-валидации

In [153]:
X = data.drop('default payment next month',  axis=1)
y = data['default payment next month']
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

датасет для результатов

In [154]:
results = pd.DataFrame(columns = ["model", "accuracy", "f1_score"])

Поехали!

In [155]:
def pipeline(model, model_name):
    model.fit(train_X, train_y)
    predictions = pd.Series(model.predict(test_X))
    accuracy = accuracy_score(test_y, predictions)
    f1score = f1_score(test_y,predictions, average="weighted")
    results.loc[len(results)] = [model_name,  accuracy, f1score]

### Логистическая регрессия

In [156]:
logistic_regression = LogisticRegression(C=0.1, n_jobs=-1, random_state=17)
pipeline(logistic_regression, "linear_regression")

### SVM

In [157]:
svc_classifier = LinearSVC()
pipeline(svc_classifier, "linear svc")

### Knn

In [158]:
knn_classifier = KNeighborsClassifier(n_neighbors=5)
pipeline(knn_classifier, "knn_classifier")

### Decision Tree

In [159]:
decision_classifier = DecisionTreeClassifier()
pipeline(decision_classifier, "decision_tree")

### Random Forest

In [160]:
random_forest = RandomForestClassifier(max_depth=2, random_state=0)
pipeline(random_forest, "random_forest")

### xgboost (бустинг)

In [161]:
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)
gbm.fit(train_X.as_matrix(), train_y.as_matrix())
predictions = gbm.predict(test_X.as_matrix())
acc = accuracy_score(test_y.as_matrix(), predictions)

f1s = f1_score([float(x) for x in predictions], [float(x) for x in test_y.as_matrix()]) 
# очень странный хак, fscore никак не хотел считаться, пришлось захардкодить так
results.loc[len(results)] = ["xgboost",  acc, f1s]

### catboost (бустинг)

In [162]:
catboost_classifier = CatBoostClassifier(learning_rate=1, depth=2, loss_function='MultiClass', iterations=50)
catboost_classifier.fit(train_X.as_matrix(), train_y.as_matrix())

predictions = catboost_classifier.predict(test_X)

predictions = np.concatenate(predictions)
test_y = [float(x) for x in list(test_y)]
acc = accuracy_score(preds, test_y)
f1s = f1_score(preds, test_y)
results.loc[len(results)] = ["catboost",  acc, f1s]

0:	learn: -0.4616568	total: 42ms	remaining: 2.06s
1:	learn: -0.4478509	total: 86ms	remaining: 2.06s
2:	learn: -0.4436145	total: 127ms	remaining: 2s
3:	learn: -0.4414908	total: 170ms	remaining: 1.95s
4:	learn: -0.4404160	total: 210ms	remaining: 1.89s
5:	learn: -0.4389239	total: 262ms	remaining: 1.92s
6:	learn: -0.4369893	total: 307ms	remaining: 1.89s
7:	learn: -0.4361928	total: 351ms	remaining: 1.84s
8:	learn: -0.4350004	total: 395ms	remaining: 1.8s
9:	learn: -0.4343219	total: 439ms	remaining: 1.75s
10:	learn: -0.4335078	total: 486ms	remaining: 1.72s
11:	learn: -0.4328730	total: 533ms	remaining: 1.69s
12:	learn: -0.4318132	total: 576ms	remaining: 1.64s
13:	learn: -0.4316347	total: 619ms	remaining: 1.59s
14:	learn: -0.4313837	total: 661ms	remaining: 1.54s
15:	learn: -0.4308777	total: 712ms	remaining: 1.51s
16:	learn: -0.4305176	total: 757ms	remaining: 1.47s
17:	learn: -0.4302589	total: 801ms	remaining: 1.42s
18:	learn: -0.4301209	total: 844ms	remaining: 1.38s
19:	learn: -0.4299416	total:

In [163]:
results

Unnamed: 0,model,accuracy,f1_score
0,linear_regression,0.784556,0.689951
1,linear svc,0.779333,0.689788
2,knn_classifier,0.751889,0.71992
3,decision_tree,0.724889,0.729523
4,random_forest,0.801444,0.74847
5,xgboost,0.824667,0.469401
6,catboost,0.822889,0.473232
