In [52]:
cd D:\Users\Niels\Documents\Machine learning\python\ING

D:\Users\Niels\Documents\Machine learning\python\ING


In [53]:
import pandas as pd

print("reading the data...")
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print("Columns:", list(train_df.columns))
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

reading the data...
Columns: ['num_employees', 'country', 'industry_desc', 'A_score', 'B_score', 'C_score', 'D_score', 'revenue2014', 'revenue2015', 'revenue2016', 'bankrupt']
Train shape: (293122, 11)
Test shape: (206878, 11)


In [54]:
print("processing the industry_desc...")

# Get the list of industries with number of occurrences
industries = pd.value_counts(train_df["industry_desc"])

train_df["industry_score"]=""
for idx, total in industries.iteritems():
    subdata = train_df.loc[train_df["industry_desc"] == idx][["bankrupt"]]
    # Determine the number of bankruptcies and calculate default probability
    default_prob = 0
    if total > 0:
        counts = pd.value_counts(subdata["bankrupt"]).multiply(1./total)
        default_prob = counts.values[-1]
    if default_prob < 0.05:
        train_df.loc[train_df["industry_desc"] == idx, ["industry_score"]] = "A"
    elif default_prob < 0.15:
        train_df.loc[train_df["industry_desc"] == idx, ["industry_score"]] = "B"
    elif default_prob < 0.3:
        train_df.loc[train_df["industry_desc"] == idx, ["industry_score"]] = "C"
    else:
        train_df.loc[train_df["industry_desc"] == idx, ["industry_score"]] = "D"
        
print("Columns:", list(train_df.columns))

processing the industry_desc...
Columns: ['num_employees', 'country', 'industry_desc', 'A_score', 'B_score', 'C_score', 'D_score', 'revenue2014', 'revenue2015', 'revenue2016', 'bankrupt', 'industry_score']


In [55]:
print("preprocessing...")

# Create dummy variables for the country category
train_df = pd.get_dummies(train_df.fillna(0), columns=["country"])
test_df = pd.get_dummies(test_df.fillna(0), columns=["country"])
#print("Columns:", list(train_df.columns))

# Create dummy variables for the industry category
train_df = pd.get_dummies(train_df.fillna(0), columns=["industry_score"])
#test_df = pd.get_dummies(test_df.fillna(0), columns=["industry_score"])
print("Columns:", list(train_df.columns))

preprocessing...
Columns: ['num_employees', 'industry_desc', 'A_score', 'B_score', 'C_score', 'D_score', 'revenue2014', 'revenue2015', 'revenue2016', 'bankrupt', 'country_CN', 'country_EN', 'country_NL', 'country_TR', 'country_US', 'industry_score_A', 'industry_score_B', 'industry_score_C', 'industry_score_D']


In [56]:
import numpy as np
from sklearn import preprocessing


# Revenues slopes
slopes=[]
for index, d in train_df.iterrows():
    #m = np.mean(d.values[6:9])
    #b =np.polyfit(np.array([1, 2, 3,]) , d.values[6:9], 1)
    m = np.mean(np.array([d.values[6],d.values[8]]))
    b =np.polyfit(np.array([1, 2]), np.array([d.values[6],d.values[8]]), 1)
    slopes.append(b[0]/m)

train_df = train_df.join(pd.DataFrame(slopes, columns=["rev_slope"]))
train_df["rev_slope"]= preprocessing.scale(train_df["rev_slope"])

print("Columns:", list(train_df.columns))

In [67]:
import numpy as np
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

# Scale data. Employees
min_employees = np.min(train_df["num_employees"].values)
train_df["num_employees"] = train_df["num_employees"].apply(lambda x: np.log(x - min_employees + 1))
train_df["num_employees"] = preprocessing.scale(train_df["num_employees"])

# Scale data. Revenues
for revenue in ['revenue2014', 'revenue2015', 'revenue2016']:
    min_revenues = np.min(train_df[revenue].values)
    train_df[revenue] = train_df[revenue].apply(lambda x: np.log(x - min_revenues + 1))
    train_df[revenue] = preprocessing.scale(train_df[revenue])

In [70]:
# Fill missing score
score_labels = ["A_score","B_score","C_score","D_score"]
for score in score_labels:
    train_df.loc[:,[score]] = train_df.fillna({score:3})
    train_df[score] = preprocessing.scale(train_df[score])

In [71]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
target = "bankrupt"
#training_df, validation_df = train_test_split(train_df, test_size=0.2, stratify=train_df[target], shuffle=True, random_state=0)
training_df, validation_df = train_test_split(train_df, test_size=0.2, stratify=train_df[target], random_state=0)
print("Train shape:", training_df.shape)
print("Validation shape:", validation_df.shape)
print(training_df.head(30))

Train shape: (234497, 20)
Validation shape: (58625, 20)
        num_employees                                      industry_desc  \
2311        -1.379750  Drycleaning and Laundry Services (except Coin-...   
259077      -0.001787              Nitrogenous Fertilizer Manufacturing    
91518        0.448028            Musical Instrument and Supplies Stores    
212468       0.123678  Services for the Elderly and Persons with Disa...   
38033       -0.922919  Teleproduction and Other Postproduction Services    
208259      -0.922919  Oil and Gas Pipeline and Related Structures Co...   
185294       0.150343                         Drilling Oil and Gas Wells   
21961        0.653408               Frozen Specialty Food Manufacturing    
182748       0.918463                                    Radio Networks    
197064      -0.534197                                       Nail Salons    
66624       -0.904600                                      Linen Supply    
126151      -1.067914           

In [72]:
# Oversample the training set defaults
from imblearn.over_sampling import SMOTE

numeric_features = ["A_score", "B_score", "C_score", "D_score", "num_employees",
                    "country_CN", "country_EN", "country_NL", "country_TR", "country_US",
                    "revenue2014", "revenue2015", "revenue2016"]

numeric_features = ["A_score", "B_score", "C_score", "D_score", "num_employees",
                    "country_CN", "country_EN", "country_NL", "country_TR", "country_US",
                    "revenue2014", "revenue2016", "rev_slope",
                    'industry_score_A', 'industry_score_B', 'industry_score_C', 'industry_score_D']

#numeric_features = ["A_score", "B_score", "C_score", "D_score","num_employees",
#                    "country_CN", "country_EN", "country_NL", "country_TR", "country_US",
#                    "revenue2016","rev_slope"]

sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(training_df[numeric_features], training_df[target])
print("Train shape:", x_train_res.shape)



Train shape: (431158, 17)


In [73]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver="liblinear")
print("training...")
lr.fit(x_train_res, y_train_res)

print("Feature weights:")
for feature, weight in zip(numeric_features, lr.coef_[0].tolist()):
    print(feature, weight)

training...
Feature weights:
A_score -0.06777215291451928
B_score -0.06131253709324207
C_score 0.019595112555444938
D_score -0.09055786170490317
num_employees 0.868856141913005
country_CN 0.25731483782466635
country_EN -0.5086707732194572
country_NL 0.4078029723605832
country_TR 0.09452081173107928
country_US -0.2592105680793795
revenue2014 0.10891324730544641
revenue2016 -0.6067809674387074
rev_slope -0.13054073732652324
industry_score_A -1.626529271517716
industry_score_B -0.6446188393953758
industry_score_C 0.2755390852600605
industry_score_D 1.9873663062688407


In [74]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score

validation_df["pred"] = lr.predict_proba(validation_df[numeric_features])[:, 1]

print("Validation score:", roc_auc_score(validation_df[target], validation_df["pred"]))
#print("Validation score:", recall_score(validation_df[target], validation_df["pred"]))

Validation score: 0.870111902263955


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [75]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(n_estimators=50, random_state=0)
clf_rf.fit(x_train_res, y_train_res)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [76]:
validation_df["pred"] = clf_rf.predict(validation_df[numeric_features])

print("Validation score:", roc_auc_score(validation_df[target], validation_df["pred"]))

Validation score: 0.8478493143134985


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [77]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

dt_stump = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1)
ada_discrete = AdaBoostClassifier(base_estimator=dt_stump, learning_rate=1, n_estimators=50, algorithm="SAMME")
ada_discrete.fit(x_train_res, y_train_res)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=50, random_state=None)

In [78]:
validation_df["pred"] = ada_discrete.predict(validation_df[numeric_features])

print("Validation score:", roc_auc_score(validation_df[target], validation_df["pred"]))

Validation score: 0.8330371266500303


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [83]:
print(x_train_res[0:5])

[[ 2.07387239e-01 -9.64507187e-01  2.19037756e-01 -6.98225735e-01
  -1.37974998e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  1.45894978e+00  1.28679709e+00
  -5.05250600e-01  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-4.79867713e-01 -2.40214395e-01  2.19037756e-01 -6.98225735e-01
  -1.78704026e-03  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -3.21433378e-01 -8.49391977e-01
  -5.47783253e-01  1.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [-1.85437762e+00  4.84078397e-01 -5.08215033e-01  1.26132545e+00
   4.48027680e-01  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00 -1.94653473e-01  4.44800332e-01
   8.38291364e-01  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00]
 [ 1.58189714e+00  1.93266398e+00  9.46290546e-01  1.26132545e+00
   1.23677651e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  

In [85]:
from sklearn import svm

clf = svm.SVC(gamma='auto')
clf.fit(x_train_res, y_train_res)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [86]:
validation_df["pred"] = clf.predict(validation_df[numeric_features])

print("Validation score:", roc_auc_score(validation_df[target], validation_df["pred"]))

Validation score: 0.8587933588664985


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [None]:
test_df[target] = lr.predict_proba(test_df[numeric_features])[:, 1]

print("creating submission...")
test_df[["id", target]].to_csv("lr_submission.csv", index=False)