In [1]:
import os

import pandas as pd

In [2]:
# The CSV file is large: download it first and read it from a local path.
# If you insist on reading it online, the raw file lives at
# https://raw.githubusercontent.com/Jeff-HOU/bpi.2018/master/data/data_q2.csv
# (the access token that used to be appended to this URL was removed: never
# commit credentials -- append your own `?token=...` if the repository asks for one).

# Location of the data file. Set the BPI_DATA_Q2 environment variable to point
# at your own copy instead of editing this cell; the original path stays the default.
DATA_Q2_PATH = os.environ.get('BPI_DATA_Q2', 'F:\\Competitions\\BPI\\data\\data_q2.csv')
df_q2 = pd.read_csv(DATA_Q2_PATH)

In [3]:
# Re-encode boolean columns as 0/1 integers, drop the 'Unnamed: 0' and 'case'
# columns, and replace whitespace in column names with underscores so that the
# names can be used directly inside the statsmodels formulas further down.
bools = df_q2.select_dtypes(include='bool').columns.tolist()
bool_to_int = df_q2.loc[:, bools].astype(int)
df_q2 = df_q2.drop(columns=bools+['Unnamed: 0', 'case'])
df_q2 = pd.concat([df_q2, bool_to_int], axis=1)
# Raw string + explicit regex=True: since pandas 2.0 `str.replace` treats the
# pattern as a literal string by default, which would leave the names unchanged
# (and '\s' in a non-raw literal is an invalid escape sequence).
df_q2.columns = df_q2.columns.str.replace(r'\s+', '_', regex=True)

In [4]:
# Penalty columns; each one is used in turn as the response variable.
penalties = [
    'penalty_B3',
    'penalty_B4',
    'penalty_B5',
    'penalty_B5F',
    'penalty_B6',
    'penalty_B16',
    'penalty_BGK',
    'penalty_BGKV',
    'penalty_C16',
    'penalty_JLP3',
    'penalty_V5',
]

# Explanatory columns used as predictors in every model.
attribs = [
    'number_parcels',
    'area',
    'young_farmer',
    'small_farmer',
    'redistribution',
]

# Distinct values of the 'department' column.
depts = df_q2.loc[:, 'department'].unique()

In [5]:
# One [department, sub-frame] pair per department; the sub-frame holds that
# department's rows with the (now constant) 'department' column removed.
df_q2_d_list = [
    [dept, df_q2[df_q2['department'] == dept].drop(columns='department')]
    for dept in depts
]

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score,\
                            confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss

import numpy as np
np.seterr(all='raise')  # turn numpy floating-point warnings (overflow, ...) into FloatingPointError
import warnings

# For every (department, penalty) pair: fit a logistic regression on a random
# training split and print accuracy, log-loss and AUC measured on the held-out part.
# NOTE: the printed label says "cross-validation", but the evaluation is a single
# hold-out split from train_test_split; the label is kept so the output format
# stays the same as in earlier runs.

SPLIT_SEED = 42           # fixed seed so a fresh "Restart & Run All" draws the same splits
MAX_SPLIT_ATTEMPTS = 100  # cap on re-draws when the training part contains a single class

# A shared RandomState (rather than an int passed to every call) so that each
# re-draw produces a different split while the whole sequence stays reproducible.
split_rng = np.random.RandomState(SPLIT_SEED)

for dept, df_q2_d in df_q2_d_list:
    print("\nFor department", dept)
    X = df_q2_d[attribs]
    for penalty in penalties:
        y = df_q2_d[penalty]
        # Nothing to learn when the response takes a single value.
        if len(y.unique()) == 1: continue
        # Re-draw until the training part contains more than one class. The number
        # of attempts is bounded: for a very small department no such split may
        # exist, and an unbounded loop would then never terminate.
        for _ in range(MAX_SPLIT_ATTEMPTS):
            X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=split_rng)
            if len(y_train.unique()) > 1: break
        else:
            print(penalty, 'Warning! no training split with more than one class was found')
            continue
        try:
            # record=True captures warnings raised inside the block instead of printing them.
            with warnings.catch_warnings(record=True):
                log_reg = LogisticRegression()
                log_reg.fit(X_train, y_train)
                y_pred = log_reg.predict(X_test)
                y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
                [fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
                print("\n{} cross-validation results:".format(penalty))
                if len(y_test.unique()) == 1:
                    print("y_test only contains 0.")
                    continue
                print("{} accuracy is {:.3f}".format(log_reg.__class__.__name__, accuracy_score(y_test, y_pred)))
                print("{} log_loss is {:.3f}".format(log_reg.__class__.__name__, log_loss(y_test, y_pred_proba)))
                print("{} auc is {:.3f}".format(log_reg.__class__.__name__, auc(fpr, tpr)))
        except FloatingPointError as e:
            # Raised because of np.seterr(all='raise') above, e.g. overflow in exp.
            print(penalty, e)
        except Exception as e:
            # Keep going with the next penalty, but say what went wrong. `Exception`
            # (not a bare `except:`) lets KeyboardInterrupt stop a long run.
            print(penalty, 'Warning!', repr(e))


For department e7

penalty_B3 cross-validation results:
LogisticRegression accuracy is 0.983
LogisticRegression log_loss is 0.082
LogisticRegression auc is 0.675

penalty_B4 cross-validation results:
LogisticRegression accuracy is 0.997
LogisticRegression log_loss is 0.019
LogisticRegression auc is 0.930

penalty_B5 cross-validation results:
LogisticRegression accuracy is 0.997
LogisticRegression log_loss is 0.019
LogisticRegression auc is 0.740
penalty_B5F overflow encountered in exp

penalty_B6 cross-validation results:
LogisticRegression accuracy is 0.997
LogisticRegression log_loss is 0.020
LogisticRegression auc is 0.693

penalty_B16 cross-validation results:
LogisticRegression accuracy is 0.998
LogisticRegression log_loss is 0.016
LogisticRegression auc is 0.731

penalty_BGK cross-validation results:
LogisticRegression accuracy is 0.996
LogisticRegression log_loss is 0.026
LogisticRegression auc is 0.459

penalty_BGKV cross-validation results:
y_test only contains 0.

penalty_C1

In [7]:
import statsmodels.formula.api as smf

# Fit a statsmodels logit for one (department, penalty) pair, selected by the
# indices k (into df_q2_d_list) and i (into penalties), using all attributes.
k = 0
i = 0
df_q2_d = df_q2_d_list[k][1]
# Recompute the response for THIS department/penalty. Without this line `y` is
# whatever the loop in the previous cell left behind (its last department and
# last fitted penalty), so the check below would test the wrong column.
y = df_q2_d[penalties[i]]
if len(y.unique()) == 1: print("y can only be 0.")
logit_model = smf.logit(formula= penalties[i] + ' ~ ' + ' + '.join(attribs), data=df_q2_d).fit()
logit_model.summary2()

Optimization terminated successfully.
         Current function value: 0.077382
         Iterations 10


0,1,2,3
Model:,Logit,No. Iterations:,10.0
Dependent Variable:,penalty_B3,Pseudo R-squared:,0.058
Date:,2018-05-08 23:17,AIC:,2050.2297
No. Observations:,13170,BIC:,2095.1438
Df Model:,5,Log-Likelihood:,-1019.1
Df Residuals:,13164,LL-Null:,-1081.5
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-2.2339,0.7507,-2.9756,0.0029,-3.7052,-0.7625
number_parcels,0.0682,0.0130,5.2269,0.0000,0.0426,0.0937
area,-0.0368,0.0051,-7.2219,0.0000,-0.0468,-0.0268
young_farmer,0.4028,0.2353,1.7120,0.0869,-0.0583,0.8639
small_farmer,0.1031,0.2705,0.3813,0.7030,-0.4270,0.6333
redistribution,-1.4371,0.7538,-1.9066,0.0566,-2.9145,0.0402


In [8]:
# Reduced model: same department and penalty as the previous cell (uses its `i`
# and `df_q2_d`), fitted without 'young_farmer' and 'small_farmer'.
X_cols_1 = ['number_parcels', 'area', 'redistribution']
# Check the column that is actually modelled here rather than a leftover `y`
# from an earlier cell, which may belong to a different department/penalty.
if len(df_q2_d[penalties[i]].unique()) == 1: print("y can only be 0.")
logit_model_1 = smf.logit(formula= penalties[i] + ' ~ ' + ' + '.join(X_cols_1), data=df_q2_d).fit()
logit_model_1.summary2()

Optimization terminated successfully.
         Current function value: 0.077487
         Iterations 10


0,1,2,3
Model:,Logit,No. Iterations:,10.0
Dependent Variable:,penalty_B3,Pseudo R-squared:,0.056
Date:,2018-05-08 23:17,AIC:,2049.0085
No. Observations:,13170,BIC:,2078.9512
Df Model:,3,Log-Likelihood:,-1020.5
Df Residuals:,13166,LL-Null:,-1081.5
Converged:,1.0000,Scale:,1.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,-2.1944,0.7488,-2.9304,0.0034,-3.6620,-0.7267
number_parcels,0.0685,0.0131,5.2396,0.0000,0.0429,0.0941
area,-0.0368,0.0051,-7.2272,0.0000,-0.0467,-0.0268
redistribution,-1.4415,0.7534,-1.9134,0.0557,-2.9181,0.0351


In [None]:
# None of the 5 attributes can be left out, according to RFECV

# from sklearn.feature_selection import RFE, RFECV

# for k in range(0, len(depts)):
#     df_q2_d = df_q2_d_list[k][1]
#     print("\nFor department", df_q2_d_list[k][0])
#     X = df_q2_d[attribs]
#     for i in range(0, len(penalties)):
#         y = df_q2_d[penalties[i]]
#         try:
#             rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=10, scoring='accuracy')
#             rfecv.fit(X, y)
#         except ValueError as err:
#             print('Training set contains only 0.')
#             continue

#         print("Optimal number of features: {}".format(rfecv.n_features_))
#         print('Selected features: {}'.format(list(X.columns[rfecv.support_])))

# # Plot number of features VS. cross-validation scores
# plt.figure(figsize=(10,6))
# plt.xlabel("Number of features selected")
# plt.ylabel("Cross validation score (nb of correct classifications)")
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
# plt.show()

In [None]:
# linear version

# for k in range(0, len(depts)):
#     df_q2_d = df_q2_d_list[k][1]
#     print("\nFor department", df_q2_d_list[k][0])
# #     X = df_q2_d[attribs]
#     for i in range(0, len(penalties)):
# #         y = df_q2_d[penalties[i]]

# #         X_train, X_test, y_train, y_test = train_test_split(X, y)
# #         train = pd.concat([y_train, X_train], axis=1)
#         lm = smf.ols(formula= penalties[i] + ' ~ ' + ' + '.join(attribs),
#                        data=df_q2_d).fit()

#         if not lm.centered_tss:
#             print(penalties[i], ': TSS = 0, same value for all obs.')
#         elif lm.rsquared_adj > 0.5:
#             print(penalties[i])