In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
test_df = pd.read_csv('credit_scoring_test.csv', index_col='client_id')
train_df = pd.read_csv('credit_scoring_train.csv', index_col='client_id')

In [3]:
y = train_df['Delinquent90']

In [4]:
train_df.drop('Delinquent90', axis=1, inplace=True)

In [5]:
train_df.head()

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028
1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679
2,2206.731199,55.5,21,1,,1,0,,0.348227
3,886.132793,55.3,3,0,0.0,0,0,,0.97193
4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75000 entries, 0 to 74999
Data columns (total 9 columns):
DIR                      75000 non-null float64
Age                      75000 non-null float64
NumLoans                 75000 non-null int64
NumRealEstateLoans       75000 non-null int64
NumDependents            73084 non-null float64
Num30-59Delinquencies    75000 non-null int64
Num60-89Delinquencies    75000 non-null int64
Income                   60153 non-null float64
BalanceToCreditLimit     75000 non-null float64
dtypes: float64(5), int64(4)
memory usage: 5.7 MB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75000 entries, 75000 to 149999
Data columns (total 9 columns):
DIR                      75000 non-null float64
Age                      75000 non-null float64
NumLoans                 75000 non-null int64
NumRealEstateLoans       75000 non-null int64
NumDependents            72992 non-null float64
Num30-59Delinquencies    75000 non-null int64
Num60-89Delinquencies    75000 non-null int64
Income                   60116 non-null float64
BalanceToCreditLimit     75000 non-null float64
dtypes: float64(5), int64(4)
memory usage: 5.7 MB


In [8]:
train_df['NumDependents'].fillna(train_df['NumDependents'].median(), inplace=True)
train_df['Income'].fillna(train_df['Income'].median(), inplace=True)

In [9]:
test_df['NumDependents'].fillna(test_df['NumDependents'].median(), inplace=True)
test_df['Income'].fillna(test_df['Income'].median(), inplace=True)

In [10]:
train_df.head()

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,0.496289,49.1,13,0,0.0,2,0,5298.360639,0.387028
1,0.433567,48.0,9,2,2.0,1,0,6008.056256,0.234679
2,2206.731199,55.5,21,1,0.0,1,0,5424.552473,0.348227
3,886.132793,55.3,3,0,0.0,0,0,5424.552473,0.97193
4,0.0,52.3,1,0,0.0,0,0,2504.613105,1.00435


In [11]:
test_df.head()

Unnamed: 0_level_0,DIR,Age,NumLoans,NumRealEstateLoans,NumDependents,Num30-59Delinquencies,Num60-89Delinquencies,Income,BalanceToCreditLimit
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
75000,0.488558,39.2,7,2,2.0,0,0,2866.926559,0.369443
75001,0.13281,42.3,8,0,1.0,4,0,4303.412944,1.028329
75002,1784.812905,51.5,5,1,0.0,0,0,5421.111494,0.081461
75003,0.538571,57.1,30,2,0.0,0,0,7672.29493,0.48585
75004,0.098539,70.1,3,0,0.0,0,0,4507.01036,0.004258


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [13]:
first_tree = DecisionTreeClassifier(max_depth=3, random_state=17)
first_tree.fit(train_df, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [14]:
first_tree_pred = first_tree.predict(test_df)

In [15]:
def write_to_submission_file(predicted_labels, out_file,
                             target='Delinquent90', index_label="client_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(75000, 
                                                  predicted_labels.shape[0] + 75000),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [16]:
write_to_submission_file(first_tree_pred, 'credit_scoring_first_tree.csv')

In [17]:
first_tree_pred_probs = first_tree.predict_proba(test_df)[:, 1]

In [18]:
write_to_submission_file(first_tree_pred_probs, 'credit_scoring_first_tree.csv')

In [19]:
tree_params = {'max_depth': list(range(3, 8)), 
               'min_samples_leaf': list(range(5, 13))}

locally_best_tree = GridSearchCV(first_tree, tree_params, cv = 5, n_jobs=-1)
locally_best_tree.fit(train_df, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [3, 4, 5, 6, 7], 'min_samples_leaf': [5, 6, 7, 8, 9, 10, 11, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [20]:
locally_best_tree.best_params_, round(locally_best_tree.best_score_, 3)

({'max_depth': 5, 'min_samples_leaf': 11}, 0.934)

In [21]:
tuned_tree_pred_probs = locally_best_tree.predict(test_df)
tuned_tree_pred_probs

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
write_to_submission_file(tuned_tree_pred_probs, 'grid_search_tree.csv')

In [25]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
first_forest = RandomForestClassifier(random_state=17)
first_forest.fit(train_df, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=17, verbose=0, warm_start=False)

In [27]:
first_forest_pred = first_forest.predict(test_df)
first_forest_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [28]:
write_to_submission_file(first_forest_pred, 'forest_w_maxdepth.csv')

In [29]:
forest_params = {'max_features': np.linspace(.3, 1, 7)}
locally_best_forest = GridSearchCV(first_forest,forest_params, n_jobs=-1, cv = 5)
locally_best_forest.fit(train_df, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=17, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_features': array([0.3    , 0.41667, 0.53333, 0.65   , 0.76667, 0.88333, 1.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
locally_best_forest.best_params_, round(locally_best_forest.best_score_, 3)

({'max_features': 0.3}, 0.932)

In [31]:
tuned_forest_pred = locally_best_forest.predict(test_df)

In [32]:
write_to_submission_file(tuned_forest_pred, 'tuned_forest_pred.csv')

In [33]:
pd.DataFrame(locally_best_forest.best_estimator_.feature_importances_)

Unnamed: 0,0
0,0.166343
1,0.161029
2,0.093172
3,0.033461
4,0.041417
5,0.058877
6,0.062481
7,0.150709
8,0.232511


In [34]:
final_forest = RandomForestClassifier(n_estimators=300, max_features = 3, random_state=42,n_jobs=-1, oob_score=True) # Ваш код здесь
final_forest.fit(train_df, y)
final_forest_pred = final_forest.predict_proba(test_df)[:, 1]
write_to_submission_file(final_forest_pred, 'credit_scoring_final_forest.csv')