In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd
import requests
import csv
import kaggle 
import pickle
from joblib import dump, load

# Modelling Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier , GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn import preprocessing 


# Modelling Helpers
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import  Normalizer , scale
from sklearn.model_selection import train_test_split , StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import auc, roc_curve, roc_auc_score
# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 8 , 6

In [2]:
#kaggle.api.authenticate()
#kaggle.api.dataset_download_files('glebsolomennikov/main-task', path='data/', unzip=True)

In [3]:
data = pd.read_csv('data/data.csv')
model = pd.read_csv('data/model.csv')

In [4]:
data = data.merge(model, how='inner', left_on = 'incident', right_on = 'incident')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82617 entries, 0 to 82616
Columns: 1911 entries, incident to PD
dtypes: float64(1838), int64(72), object(1)
memory usage: 1.2+ GB


In [6]:
for_pred = data[-1000:]
data = data[:-1000]
data['target'] = abs(data.flg_90_12_add - data.PD)
data = data.replace(np.nan, 0)

In [7]:
del data['flg_90_12_add']
del data['PD']

In [8]:
le = preprocessing.LabelEncoder()

In [9]:
X = data.apply(le.fit_transform)
from sklearn.decomposition import PCA
pca = PCA(n_components = 1)
XPCAreduced = pca.fit_transform(np.transpose(X))

In [10]:
y =X['target']
del X['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
X_train.shape, X_test.shape

((61212, 1909), (20405, 1909))

In [11]:
estimators = [
    ('lr', RidgeCV()),
    ('svr', LinearSVR(random_state=421) )]
linreg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10,
                                          random_state=42))

In [12]:
%%time
linreg.fit(X_train, y_train)

Wall time: 7min 15s


StackingRegressor(estimators=[('lr', RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                              ('svr', LinearSVR(random_state=421))],
                  final_estimator=RandomForestRegressor(n_estimators=10,
                                                        random_state=42))

In [13]:
for_pred_mod = for_pred.apply(le.transform)
del for_pred_mod['flg_90_12_add']
del for_pred_mod['PD']


ValueError: y contains previously unseen labels: [20264239, 20264362, 20264564, 20266924, 20268570, 20268684, 20269502, 20270248, 20271314, 20272681, 20272839, 20272945, 20273658, 20276555, 20277303, 20277337, 20282461, 20282864, 20283051, 20283238, 20283404, 20284017, 20284370, 20284955, 20285052, 20286298, 20288859, 20290047, 20290596, 20292154, 20293464, 20293899, 20296354, 20296713, 20296714, 20296949, 20300980, 20301523, 20301891, 20302856, 20303170, 20303324, 20304985, 20305297, 20306632, 20307040, 20307310, 20307689, 20307785, 20310375, 20311581, 20311635, 20313051, 20314745, 20315398, 20315821, 20316598, 20318515, 20319470, 20321060, 20322190, 20322655, 20323796, 20323885, 20324008, 20325270, 20327088, 20328803, 20329876, 20330172, 20333464, 20333622, 20338377, 20338457, 20339975, 20345113, 20346420, 20347296, 20347499, 20348084, 20348394, 20348581, 20348797, 20350683, 20352121, 20354900, 20356916, 20358015, 20358028, 20359041, 20359663, 20360080, 20360149, 20360992, 20361111, 20361140, 20361427, 20361869, 20365215, 20365274, 20370962, 20372393, 20372921, 20375183, 20375205, 20375621, 20378378, 20379375, 20379679, 20380255, 20381168, 20381438, 20382356, 20383164, 20383258, 20383936, 20386051, 20389283, 20390519, 20391012, 20408788, 20411327, 20412854, 20415874, 20416206, 20421233, 20424423, 20428171, 20432590, 20436790, 20437580, 20437852, 20446342, 20449578, 20451618, 20459540, 20460532, 20479250, 20486904, 20488053, 20494163, 20499766, 20500798, 20501217, 20501357, 20501636, 20502677, 20502857, 20505057, 20508629, 20510151, 20510796, 20511076, 20512144, 20513301, 20516454, 20517278, 20520293, 20521373, 20521536, 20521721, 20523502, 20524072, 20524976, 20526151, 20526173, 20526263, 20527199, 20527351, 20528011, 20528878, 20529632, 20529772, 20529971, 20530209, 20531810, 20532325, 20532624, 20533566, 20534240, 20534383, 20534464, 20536745, 20537827, 20538024, 20538304, 20540832, 20542683, 20543884, 20547273, 20547510, 20547724, 20548376, 20549013, 20550185, 20550510, 20551024, 20551529, 20552076, 20553738, 20553847, 20556929, 20557786, 20558638, 20559842, 20564078, 20564200, 20565108, 20567158, 20568760, 20570326, 20570470, 20572340, 20572617, 20573213, 20573969, 20574029, 20574262, 20576427, 20576817, 20577880, 20578494, 20580671, 20580787, 20580909, 20581075, 20581314, 20581505, 20582592, 20583964, 20585109, 20587627, 20587806, 20591607, 20592198, 20592539, 20593383, 20601697, 20605945, 20606602, 20607467, 20609472, 20615766, 20616217, 20617280, 20618329, 20619372, 20621145, 20621876, 20622248, 20622733, 20624177, 20625539, 20625566, 20625677, 20625694, 20626211, 20626286, 20626401, 20630020, 20631472, 20631744, 20635252, 20636532, 20637710, 20638435, 20640427, 20643356, 20643430, 20646378, 20646415, 20648695, 20649181, 20649636, 20649674, 20650837, 20652117, 20654538, 20654913, 20659325, 20660509, 20661718, 20665594, 20666025, 20668460, 20668619, 20668954, 20670529, 20671627, 20671959, 20674190, 20674269, 20675558, 20675787, 20678095, 20679887, 20682068, 20682775, 20683355, 20684327, 20684666, 20684934, 20684984, 20686588, 20690648, 20695301, 20696095, 20696647, 20698306, 20698386, 20699079, 20701535, 20702227, 20702371, 20703537, 20705829, 20706298, 20706991, 20707403, 20707458, 20707959, 20708510, 20710076, 20712807, 20715492, 20716172, 20716869, 20717579, 20720630, 20721012, 20722815, 20725210, 20726097, 20728336, 20729903, 20730113, 20730505, 20731010, 20731144, 20731408, 20734815, 20741546, 20743143, 20745598, 20746218, 20746790, 20748450, 20748579, 20749212, 20749299, 20749523, 20749769, 20750401, 20750769, 20752403, 20753310, 20754584, 20756904, 20758012, 20760113, 20762520, 20763686, 20764108, 20765650, 20766096, 20766151, 20766557, 20767541, 20768787, 20768940, 20769759, 20771650, 20771691, 20774136, 20776537, 20779915, 20780729, 20780894, 20781883, 20784234, 20785820, 20787198, 20788093, 20789060, 20789311, 20792017, 20792104, 20793018, 20793369, 20793816, 20793974, 20796660, 20798357, 20801365, 20804262, 20804273, 20804859, 20805117, 20808584, 20809770, 20811090, 20811788, 20813349, 20813515, 20813748, 20814664, 20814717, 20816046, 20816322, 20817713, 20820927, 20820947, 20823556, 20829709, 20829876, 20830649, 20831265, 20831921, 20832102, 20833470, 20833849, 20833889, 20834616, 20834711, 20836899, 20838343, 20838353, 20838524, 20838774, 20839171, 20840631, 20840733, 20840990, 20841968, 20844438, 20844915, 20845430, 20847107, 20847714, 20848068, 20848298, 20848380, 20852840, 20854935, 20855939, 20855972, 20857869, 20858402, 20859344, 20859868, 20860731, 20861879, 20862457, 20862985, 20864161, 20866367, 20866372, 20867163, 20868087, 20868266, 20868282, 20869735, 20869762, 20872863, 20873408, 20876498, 20877762, 20878677, 20881154, 20885897, 20887642, 20891932, 20892368, 20892736, 20899379, 20899569, 20900365, 20903391, 20905521, 20910390, 20916514, 20916683, 20917173, 20918678, 20919985, 20924006, 20931753, 20934823, 20935732, 20936096, 20936174, 20939408, 20940975, 20941789, 20944654, 20945699, 20947187, 20947206, 20948932, 20955237, 20957003, 20959013, 20960043, 20960217, 20960549, 20962605, 20963297, 20965875, 20968937, 20969929, 20970028, 20972886, 20974767, 20975971, 20978579, 20978721, 20982654, 20984668, 20988357, 20989877, 20996894, 20998781, 21000721, 21001147, 21001976, 21003292, 21004309, 21007978, 21009826, 21010045, 21010463, 21011395, 21012233, 21013363, 21016045, 21017953, 21019105, 21020737, 21020749, 21022116, 21022351, 21023787, 21025011, 21025740, 21026805, 21028027, 21035151, 21036058, 21036252, 21038262, 21038999, 21039558, 21039947, 21040299, 21041381, 21041876, 21043668, 21046999, 21047640, 21050478, 21050811, 21051162, 21051746, 21052499, 21053093, 21053460, 21056310, 21056464, 21056531, 21057825, 21063589, 21069733, 21071315, 21071886, 21072532, 21073617, 21075796, 21075844, 21076096, 21076462, 21077766, 21080660, 21082118, 21084988, 21086935, 21087931, 21087951, 21090343, 21091059, 21094255, 21094397, 21096244, 21096555, 21097848, 21098132, 21099196, 21099726, 21100429, 21102719, 21103528, 21104629, 21107344, 21112337, 21112558, 21112843, 21113031, 21113724, 21115359, 21116956, 21117109, 21117624, 21118162, 21120427, 21122185, 21122786, 21123454, 21123558, 21123736, 21124384, 21124782, 21128108, 21129024, 21131114, 21131589, 21131719, 21133919, 21134062, 21136056, 21136413, 21136489, 21137482, 21140133, 21142216, 21143574, 21144765, 21146479, 21148213, 21150490, 21150558, 21151127, 21151161, 21151529, 21151552, 21152237, 21157065, 21158353, 21161648, 21161712, 21162458, 21163799, 21164879, 21166376, 21171374, 21172294, 21174240, 21175903, 21176595, 21178364, 21180105, 21180922, 21181431, 21183392, 21186124, 21191209, 21191378, 21192793, 21194230, 21195512, 21197783, 21198039, 21199056, 21200629, 21200867, 21209705, 21210288, 21210849, 21210863, 21211716, 21213406, 21213705, 21214952, 21216777, 21217968, 21220679, 21221967, 21222795, 21224067, 21224582, 21226643, 21226666, 21227174, 21228031, 21228865, 21233187, 21233356, 21234432, 21234468, 21235377, 21236399, 21236758, 21237316, 21238346, 21239096, 21239875, 21239969, 21240215, 21242319, 21244111, 21244223, 21248388, 21248556, 21248733, 21250366, 21251130, 21251212, 21251774, 21254209, 21255147, 21255685, 21256021, 21256091, 21259394, 21259893, 21260118, 21264146, 21264189, 21265081, 21266638, 21267231, 21269567, 21270752, 21275591, 21276286, 21277143, 21278128, 21278301, 21278349, 21278643, 21279270, 21282602, 21288312, 21288617, 21289094, 21289175, 21290625, 21291501, 21292589, 21293590, 21293826, 21293865, 21294545, 21298798, 21298947, 21301482, 21301489, 21302126, 21304987, 21307928, 21310091, 21312027, 21314438, 21315122, 21315311, 21320053, 21321429, 21321630, 21321783, 21322820, 21324090, 21324202, 21327517, 21335552, 21338055, 21341614, 21342511, 21343202, 21343729, 21344433, 21345409, 21352309, 21356761, 21357181, 21360745, 21361619, 21363375, 21363768, 21364744, 21366250, 21366463, 21368971, 21369172, 21370358, 21370518, 21370948, 21371777, 21372145, 21374955, 21375925, 21377102, 21377658, 21378048, 21381000, 21386166, 21387551, 21388444, 21391085, 21391413, 21392265, 21392868, 21393485, 21399222, 21399630, 21400062, 21405521, 21407636, 21408048, 21413117, 21426450, 21434842, 21434844, 21435501, 21436348, 21436381, 21437426, 21443429, 21445030, 21445418, 21447778, 21448719, 21449406, 21449434, 21452003, 21452197, 21453347, 21453494, 21453940, 21457217, 21458114, 21458474, 21460429, 21461647, 21462698, 21463412, 21464768, 21473886, 21474018, 21476458, 21476563, 21477438, 21480912, 21481638, 21482104, 21488393, 21491026, 21493752, 21503499, 21503710, 21509293, 21512507, 21514008, 21516485, 21520439, 21521230, 21522135, 21524476, 21525644, 21527306, 21530126, 21530435, 21531281, 21534426, 21538351, 21539669, 21541226, 21541411, 21542432, 21543483, 21543728, 21545021, 21555201, 21558596, 21563893, 21564118, 21566032, 21570039, 21570103, 21570447, 21570496, 21575477, 21576195, 21576415, 21583785, 21584587, 21584827, 21592107, 21596282, 21596410, 21597069, 21597609, 21598014, 21600350, 21601062, 21604118, 21605038, 21608431, 21617226, 21618212, 21628442, 21631133, 21632589, 21632934, 21633094, 21633508, 21635756, 21637616, 21638216, 21639610, 21641962, 21643952, 21646131, 21646140, 21646141, 21649604, 21652866, 21653290, 21654632, 21655838, 21657081, 21658215, 21659521, 21660445, 21660916, 21662966, 21664051, 21670400, 21671078, 21673278, 21674311, 21681264, 21684760, 21685794, 21686218, 21686346, 21686830, 21688724, 21691291, 21693055, 21693347, 21693862, 21695276, 21696461, 21696720, 21701184, 21701892, 21708882, 21710711, 21713938, 21715275, 21715487, 21717236, 21720726, 21720933, 21721741, 21722387, 21725579, 21725762, 21726774, 21726819, 21728605, 21729644, 21730541, 21731743, 21732415, 21733011, 21733169, 21733746, 21735780, 21736131, 21738071, 21738843, 21740459, 21743426, 21750692, 21753972, 21755325, 21755519, 21757294, 21759356, 21761430, 21762527, 21763000, 21763280, 21763445, 21763881, 21765687, 21765972, 21768486, 21779313, 21787442, 21792767]

In [None]:

print('TRAIN MSE:', linreg.score(X_train, y_train))
print('TEST MSE:', linreg.score(X_test, y_test))
res = linreg.predict(for_pred_mod)
prediction = pd.DataFrame(res, columns=['predictions']).to_csv('ans_on_x_y.csv',encoding='utf-8', index=False)

In [None]:
len(res)

In [None]:
linreg.fit(X, y)

In [None]:
print('TRAIN MSE:', linreg.score(X_train, y_train))
print('TEST MSE:', linreg.score(X_test, y_test))

In [None]:
res = linreg.predict(for_pred_mod)
prediction = pd.DataFrame(res, columns=['predictions']).to_csv('ans_on_X.csv',encoding='utf-8', index=False)