In [2]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd
import requests
import csv
import kaggle 
import pickle
from joblib import dump, load

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn import preprocessing 


# Modelling Helpers
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import  Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import auc, roc_curve, roc_auc_score
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [3]:
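# Optional one-time download of the dataset into data/ through the Kaggle API.
# Left commented out; uncomment both lines to fetch the files
# (kaggle.api.authenticate() needs Kaggle credentials to be configured).
#kaggle.api.authenticate()
#kaggle.api.dataset_download_files('glebsolomennikov/main-task', path='data/', unzip=True)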

In [4]:
# Read the two input tables from the local data/ directory.
data, model = map(pd.read_csv, ('data/data.csv', 'data/model.csv'))

In [5]:
# Attach the model.csv columns to the data.csv rows, keeping only the
# incidents that are present in both tables.
data = pd.merge(data, model, on='incident', how='inner')

In [6]:
# Preview the merged frame (pandas truncates the rendered rows and columns).
data

Unnamed: 0,incident,flg_90_12_add,APPLICATION_MONTH,Category_Feature_0,Category_Feature_1,Category_Feature_2,Category_Feature_3,Category_Feature_4,Category_Feature_5,Category_Feature_6,...,Feature_1879,Feature_1880,Feature_1881,Feature_1882,Feature_1883,Feature_1884,Feature_1885,Feature_1886,Feature_1887,PD
0,14511110,0.0,2018-10,6.0,2,25,2.0,0.0,3,1,...,,,,,0,0,0,1,20184,0.022191
1,15018800,0.0,2018-12,1.0,2,49,4.0,0.0,3,1,...,0.259385,0.547065,0.287680,0.287680,0,1,1,1,20184,0.019972
2,14499737,0.0,2018-10,6.0,2,24,1.0,0.0,3,0,...,0.420321,0.242687,,,0,1,0,1,20184,0.043884
3,14953427,0.0,2018-12,4.0,2,26,1.0,0.0,3,1,...,,,,,0,1,1,1,20184,0.013412
4,14744453,0.0,2018-11,6.0,2,30,4.0,2.0,3,1,...,0.045863,,,,0,1,0,1,20184,0.033354
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82612,20559842,,2020-01,1.0,2,34,2.0,2.0,3,0,...,0.358156,0.301572,0.187271,,0,0,0,1,20201,
82613,20380255,,2020-01,10.0,2,66,2.0,0.0,5,1,...,0.391072,0.391072,0.391072,0.391072,0,0,0,0,20201,
82614,20348581,,2020-01,4.0,1,32,1.0,0.0,3,0,...,,,,,0,1,1,1,20201,
82615,21118162,,2020-02,5.0,2,62,5.0,0.0,6,1,...,,,,,0,0,0,0,20201,


In [7]:
# Number of trailing rows held out for the final prediction step; the preview
# of the merged frame shows its last rows without flg_90_12_add / PD values.
N_PREDICT_ROWS = 1000

# Take explicit copies so both pieces are independent frames. Adding the
# `target` column below then writes to a real frame rather than to a slice of
# the merged one (the chained-assignment warning pandas would raise for that is
# hidden by the global warnings filter in this notebook).
for_pred = data.iloc[-N_PREDICT_ROWS:].copy()
data = data.iloc[:-N_PREDICT_ROWS].copy()

# Regression target: absolute gap between the observed flag and the PD column.
data['target'] = (data['flg_90_12_add'] - data['PD']).abs()

# Every remaining missing value is treated as 0.
data = data.fillna(0)

In [8]:
# Summary statistics of the numeric columns after the NaN -> 0 replacement.
data.describe()

Unnamed: 0,incident,flg_90_12_add,Category_Feature_0,Category_Feature_1,Category_Feature_2,Category_Feature_3,Category_Feature_4,Category_Feature_5,Category_Feature_6,Category_Feature_7,...,Feature_1880,Feature_1881,Feature_1882,Feature_1883,Feature_1884,Feature_1885,Feature_1886,Feature_1887,PD,target
count,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,...,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0,81617.0
mean,17603140.0,0.049401,5.575542,1.48311,41.008013,1.953649,0.531997,3.981156,0.442604,1.911783,...,0.35101,0.314185,0.353443,0.102185,0.390213,0.384332,0.512651,20193.098288,0.052692,0.086874
std,2006843.0,0.216706,5.094582,0.499718,13.024303,1.044419,1.201335,1.137605,0.496698,0.7794,...,39.161097,39.328381,47.358424,0.302893,0.487801,0.48644,0.499843,4.180258,0.076001,0.187584
min,14320480.0,0.0,0.0,1.0,21.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20184.0,0.006152,0.006152
25%,15963770.0,0.0,1.0,1.0,30.0,1.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20192.0,0.017027,0.017081
50%,17202160.0,0.0,4.0,1.0,39.0,2.0,0.0,3.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,20193.0,0.027629,0.027917
75%,19204920.0,0.0,9.0,2.0,51.0,2.0,1.0,5.0,1.0,2.0,...,0.169804,0.059304,0.0,0.0,1.0,1.0,1.0,20194.0,0.054658,0.058062
max,21796880.0,1.0,15.0,2.0,75.0,6.0,255.0,7.0,1.0,4.0,...,11031.295761,11031.295761,11031.295761,1.0,1.0,1.0,1.0,20201.0,0.983462,0.991814


In [9]:
# `target` was derived from these two columns, so remove them to keep them out
# of the feature set.
data = data.drop(columns=['flg_90_12_add', 'PD'])

In [10]:
# One shared encoder instance; it is re-fit on every column it is applied to.
le = LabelEncoder()

In [11]:
from sklearn.decomposition import PCA

# Label-encode the feature columns only. Encoding the whole frame would also
# replace the continuous `target` with integer rank codes, so the regressors
# would be trained on (and predict) ranks instead of the |flag - PD| values.
feature_codes = data.drop(columns='target').apply(le.fit_transform)

# Re-attach the untouched target as the last column, matching the layout the
# following cell expects when it splits `target` off from X.
X = feature_codes.assign(target=data['target'])

# One-component PCA over the transposed matrix (one sample per column of X).
# NOTE(review): XPCAreduced is not referenced by any later cell shown here.
pca = PCA(n_components=1)
XPCAreduced = pca.fit_transform(np.transpose(X))

In [12]:
# Split the regression target off from the feature matrix.
y = X.pop('target')

# Hold out 25% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
X_train.shape, X_test.shape

((61212, 1909), (20405, 1909))

In [13]:
# First stacking setup: ridge and linear SVR as base learners, blended by a
# small random forest.
# NOTE: `estimators` and `linreg` are both reassigned by the next cell before
# anything is fitted.
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=421))]
meta_model = RandomForestRegressor(n_estimators=10, random_state=42)
linreg = StackingRegressor(estimators=estimators, final_estimator=meta_model)

In [14]:
import inspect

from sklearn.ensemble import (
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LarsCV, LinearRegression

# The `normalize` keyword was deprecated and later removed from LarsCV in newer
# scikit-learn releases. Pass it only when the installed version still accepts
# it: older versions keep the explicit "no normalisation" setting, newer ones
# do not fail with a TypeError on the unknown argument.
lars_kwargs = {'cv': 5}
if 'normalize' in inspect.signature(LarsCV).parameters:
    lars_kwargs['normalize'] = False

# Base learners of the stack: two tree ensembles and two linear models.
estimators = [
    ('regr', RandomForestRegressor(max_depth=5, random_state=0)),
    ('lr', LinearRegression()),
    ('ls', LarsCV(**lars_kwargs)),
    ('extr', ExtraTreesRegressor(n_estimators=100, random_state=0)),
]

# A small random forest combines the base learners' predictions.
linreg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10,
                                          random_state=42))

In [None]:
%%time
# Fit the stacked ensemble on the training split; the %%time cell magic above
# reports how long this cell takes to run.
linreg.fit(X_train, y_train)

In [None]:
# `linreg.score` returns the coefficient of determination (R^2) for a
# regressor, not the mean squared error, so compute the MSE explicitly and
# report R^2 under its own label.
train_pred = linreg.predict(X_train)
test_pred = linreg.predict(X_test)

print('TRAIN MSE:', metrics.mean_squared_error(y_train, train_pred))
print('TEST MSE:', metrics.mean_squared_error(y_test, test_pred))
print('TRAIN R^2:', metrics.r2_score(y_train, train_pred))
print('TEST R^2:', metrics.r2_score(y_test, test_pred))

In [None]:
# Refit the same ensemble on every labelled row (train and test together).
linreg.fit(X, y)

In [None]:
# The model has just been refit on all of X, so the former test rows are now
# part of the training data: both figures below are in-sample and are not an
# estimate of generalisation error. `linreg.score` would return R^2, so the
# MSE is computed explicitly.
train_pred_refit = linreg.predict(X_train)
test_pred_refit = linreg.predict(X_test)

print('TRAIN MSE (in-sample):',
      metrics.mean_squared_error(y_train, train_pred_refit))
print('TEST MSE (in-sample after refit on all rows):',
      metrics.mean_squared_error(y_test, test_pred_refit))

In [None]:
# The training rows had every NaN replaced by 0 before they were encoded; do
# the same here. Otherwise missing values in the prediction rows would be
# encoded as their own (highest) code instead of the code of 0.
# NOTE(review): the encoder is re-fit column by column on these rows alone, so
# the integer codes are not guaranteed to line up with the codes the training
# frame received. Confirm whether the fitted per-column encoders should be
# kept and reused here.
for_pred_mod = for_pred.fillna(0).apply(le.fit_transform)

In [30]:
# Remove the two columns that were also removed from the training frame, so
# the prediction features match the columns the model was fitted on.
for_pred_mod = for_pred_mod.drop(columns=['flg_90_12_add', 'PD'])

In [32]:
# Predict the target for the held-out rows with the fitted ensemble.
res = linreg.predict(for_pred_mod)

1000