In [1]:
# Install CatBoost into the runtime via a shell command (no version is pinned;
# the recorded run below resolved to catboost 0.17.5).
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/21/41/75b28629b9f2668548f431efe0236062aec12cd0a9a647313d7f2d1c9221/catboost-0.17.5-cp36-none-manylinux1_x86_64.whl (62.7MB)
[K     |████████████████████████████████| 62.7MB 830kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.17.5


In [2]:
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [0]:
def weight(event):
    """Return the metric weight associated with an event class.

    Events 1, 2, 3 and 4 map to -10, -0.1, 0.1 and 0.5 respectively.
    Any other value yields ``None``.
    """
    weight_table = ((1, -10), (2, -0.1), (3, 0.1), (4, 0.5))
    # Compare with == (not hashing) so numeric look-alikes such as 1.0 match.
    for code, value in weight_table:
        if event == code:
            return value
    return None

def class_value(pred1, pred2, pred3):
    """Fold three binary indicators into a single class label in 1..4.

    Each argument must support ``astype(bool)`` and element-wise ``&``
    (e.g. NumPy arrays or pandas Series). The label is 1 where ``pred1`` is
    false, 2 where only ``pred1`` holds, 3 where ``pred1`` and ``pred2``
    hold but not ``pred3``, and 4 where all three hold.
    """
    passed_first = pred1.astype(bool)
    passed_second = passed_first & pred2.astype(bool)
    passed_third = passed_second & pred3.astype(bool)
    # Every consecutive indicator that holds bumps the label by one from 1.
    return 1 + passed_first.astype(int) + passed_second.astype(int) + passed_third.astype(int)

def pred(X_test, models=None):
    """Predict a class label in 1..4 for every row of ``X_test``.

    Args:
        X_test: Feature matrix accepted by each model's ``predict``.
        models: Optional sequence of three fitted binary classifiers, in
            order. When omitted, the notebook-level ``model1``, ``model2``
            and ``model3`` are used, which is the original behaviour.

    Returns:
        The labels produced by ``class_value`` from the three models'
        binary predictions.
    """
    if models is None:
        # Fall back to the globals the notebook cells create before calling pred().
        models = (model1, model2, model3)
    first, second, third = models
    # class_value already holds the indicator-to-label formula; reuse it
    # instead of keeping a second copy here.
    return class_value(first.predict(X_test), second.predict(X_test), third.predict(X_test))

In [0]:
# Code to read csv file into Colaboratory:
# Quietly install/upgrade PyDrive with a shell command before importing it.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client a later cell uses to fetch a file by its Drive id.
drive = GoogleDrive(gauth)

In [0]:
# Google Drive link whose `id` query parameter identifies the file that a later
# cell downloads as train_dataset.csv.
link = 'https://drive.google.com/open?id=1AHDxRu5tLjSVt9KR02MCz2x4MVYjvzrf'

In [6]:
# Read the file id from the link's query string. Unpacking link.split('=')
# raised ValueError as soon as the URL carried a second parameter
# (e.g. "...?id=XYZ&usp=sharing").
fluff = link.split('=', 1)[0]  # unused; kept so the name stays defined as before
# NOTE: `id` shadows the builtin, but the download cell reads this name.
id = parse_qs(urlparse(link).query)['id'][0]
print(id)

1AHDxRu5tLjSVt9KR02MCz2x4MVYjvzrf


In [0]:
# Fetch the Drive file identified by `id` and save it locally as train_dataset.csv.
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('train_dataset.csv')  
# Load the downloaded CSV into memory.
data = pd.read_csv('train_dataset.csv')
# Dataset is now stored in a Pandas Dataframe

In [0]:
# Remove the 'Unnamed: 0' column and every row containing a missing value.
# Rebinding `data` and tolerating an already-removed column keeps this cell
# safe to re-run; the in-place version raised KeyError on a second execution.
data = data.drop(columns='Unnamed: 0', errors='ignore').dropna()

In [0]:
# Separate the 'event' target column from the remaining feature columns.
y = data['event']
X = data.drop(columns=['event'])

In [0]:
# Stratified, shuffled hold-out split. The fixed random_state makes the split
# (and every score computed on X_test below) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, stratify=y, random_state=42
)

In [11]:
# Display the (rows, columns) shape of the training split.
X_train.shape

(325284, 578)

In [12]:
# Randomly oversample the training split; the fixed random_state makes the
# resampled set reproducible.
X_train_OS, y_train_OS = RandomOverSampler(random_state=42).fit_resample(X_train, y_train)
# The un-resampled training split is not used again; drop the references to free memory.
del X_train
del y_train
X_train_OS.shape

(665024, 578)

In [26]:
%%time
ns = [500, 800, 1000, 1200, 1500]

for n in ns:
    model1 = CatBoostClassifier(iterations=n, verbose=False, task_type='GPU', devices='0')
    model2 = CatBoostClassifier(iterations=n, verbose=False, task_type='GPU', devices='0')
    model3 = CatBoostClassifier(iterations=n, verbose=False, task_type='GPU', devices='0')
    model1.fit(X_train_OS, (y_train_OS > 1).astype(float))
    print("1 fit successful")
    model2.fit(X_train_OS, (y_train_OS > 2).astype(float))
    print("2 fit successful")
    model3.fit(X_train_OS, (y_train_OS > 3).astype(float))
    print("3 fit successful")
    print("Binary scores:", accuracy_score(model1.predict(X_test), y_test > 1), accuracy_score(model2.predict(X_test), y_test > 2), accuracy_score(model3.predict(X_test), y_test > 3))
    print("Multiclass score:", accuracy_score(pred(X_test), class_value(y_test > 1, y_test > 2, y_test > 3)))
    ps = np.array([1 - model1.predict_proba(X_test)[:, 1], model1.predict_proba(X_test)[:, 1] - model2.predict_proba(X_test)[:, 1],
                  model2.predict_proba(X_test)[:, 1] - model3.predict_proba(X_test)[:, 1], model3.predict_proba(X_test)[:, 1]])
    ans = np.tanh(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Tanh answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    ans = np.sign(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Signum answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    del model1
    del model2
    del model3
    del ps
    del ans

1 fit successful
2 fit successful
3 fit successful
Binary scores: 0.9718338436566201 0.647157560777659 0.8847161249861659
Multiclass score: 0.5606762090972811
Tanh answer: 0.27231987744725333
Signum answer: 0.30464188865894853
1 fit successful
2 fit successful
3 fit successful
Binary scores: 0.9742686390969122 0.6513907846681669 0.8866067805363929
Multiclass score: 0.5664772936879773
Tanh answer: 0.2661589961244913
Signum answer: 0.29881001754711495
1 fit successful
2 fit successful
3 fit successful
Binary scores: 0.9748588925369831 0.6519441472682332 0.8872062566864648
Multiclass score: 0.5671136606780537
Tanh answer: 0.27622541644610804
Signum answer: 0.3062306587977261
1 fit successful
2 fit successful
3 fit successful
Binary scores: 0.9767772162172133 0.6540745932784889 0.8876212786365145
Multiclass score: 0.5702678274984322
Tanh answer: 0.27323600435382595
Signum answer: 0.3041314404215881
1 fit successful
2 fit successful
3 fit successful
Binary scores: 0.9768233297672188 0.65572

In [0]:
%%time
ns = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

for n in ns:
    print(f"n={n}")
    del model1
    del model2
    del model3
    del ps
    del ans
    model1 = RandomForestClassifier(n)
    model2 = RandomForestClassifier(n)
    model3 = RandomForestClassifier(n)
    model1.fit(X_train_OS, y_train_OS > 1)
    model2.fit(X_train_OS, y_train_OS > 2)
    model3.fit(X_train_OS, y_train_OS > 3)
    print("Binary scores:", accuracy_score(model1.predict(X_test), y_test > 1), accuracy_score(model2.predict(X_test), y_test > 2), accuracy_score(model3.predict(X_test), y_test > 3))
    print("Multiclass score:", accuracy_score(pred(X_test), class_value(y_test > 1, y_test > 2, y_test > 3)))
    ps = np.array([1 - model1.predict_proba(X_test)[:, 1], model1.predict_proba(X_test)[:, 1] - model2.predict_proba(X_test)[:, 1],
                   model2.predict_proba(X_test)[:, 1] - model3.predict_proba(X_test)[:, 1], model3.predict_proba(X_test)[:, 1]])
    ans = np.tanh(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Tanh answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / len(ans))
    ans = np.sign(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Signum answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / len(ans))
    
# WRONG METRIC CALCULATION, IT HAS TO BE DIVIDED BY SUM OF ABSOLUTE WEIGHTS
# also, i forgot there is a way to use several cores

n=10
Binary scores: 0.9866362932083964 0.6201350204744162 0.8878795145165456
Multiclass score: 0.5363743682443649
Tanh answer: 0.045762697314815345
Signum answer: 0.04037610211383775
n=20
Binary scores: 0.9871250968384551 0.6395857158667503 0.8929888958571587
Multiclass score: 0.5520898660862508
Tanh answer: 0.04959326223046025
Signum answer: 0.05694746744384292
n=30
Binary scores: 0.9872634374884716 0.6471298926476556 0.8939019441472682
Multiclass score: 0.5576327147969159
Tanh answer: 0.05037911662885898
Signum answer: 0.06974489984138167
n=40
Binary scores: 0.9872265466484672 0.6504224001180506 0.8943446342273214
Multiclass score: 0.5602058508872247
Tanh answer: 0.052244641252820585
Signum answer: 0.08057051684068255
n=50
Binary scores: 0.9872726601984727 0.6534013354484082 0.8948611059873833
Multiclass score: 0.5625760873575091
Tanh answer: 0.05264631449014688
Signum answer: 0.07889106134947783
n=60
Binary scores: 0.9872726601984727 0.6546925148485631 0.8948979968273878
Multiclass 

In [0]:
%%time
ns = [10, 20, 40, 60, 90, 120]

for n in ns:
    print(f"n={n}")
    model1 = RandomForestClassifier(n, n_jobs=-1)
    model2 = RandomForestClassifier(n, n_jobs=-1)
    model3 = RandomForestClassifier(n, n_jobs=-1)
    model1.fit(X_train_OS, y_train_OS > 1)
    model2.fit(X_train_OS, y_train_OS > 2)
    model3.fit(X_train_OS, y_train_OS > 3)
    print("Binary scores:", accuracy_score(model1.predict(X_test), y_test > 1), accuracy_score(model2.predict(X_test), y_test > 2), accuracy_score(model3.predict(X_test), y_test > 3))
    print("Multiclass score:", accuracy_score(pred(X_test), class_value(y_test > 1, y_test > 2, y_test > 3)))
    ps = np.array([1 - model1.predict_proba(X_test)[:, 1], model1.predict_proba(X_test)[:, 1] - model2.predict_proba(X_test)[:, 1],
                   model2.predict_proba(X_test)[:, 1] - model3.predict_proba(X_test)[:, 1], model3.predict_proba(X_test)[:, 1]])
    ans = np.tanh(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Tanh answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    ans = np.sign(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Signum answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    del model1
    del model2
    del model3
    del ps
    del ans

In [0]:
%%time
ns = [150]

for n in ns:
    print(f"n={n}")
    model1 = RandomForestClassifier(n, n_jobs=-1)
    model2 = RandomForestClassifier(n, n_jobs=-1)
    model3 = RandomForestClassifier(n, n_jobs=-1)
    model1.fit(X_train_OS, y_train_OS > 1)
    model2.fit(X_train_OS, y_train_OS > 2)
    model3.fit(X_train_OS, y_train_OS > 3)
    print("Binary scores:", accuracy_score(model1.predict(X_test), y_test > 1), accuracy_score(model2.predict(X_test), y_test > 2), accuracy_score(model3.predict(X_test), y_test > 3))
    print("Multiclass score:", accuracy_score(pred(X_test), class_value(y_test > 1, y_test > 2, y_test > 3)))
    ps = np.array([1 - model1.predict_proba(X_test)[:, 1], model1.predict_proba(X_test)[:, 1] - model2.predict_proba(X_test)[:, 1],
                   model2.predict_proba(X_test)[:, 1] - model3.predict_proba(X_test)[:, 1], model3.predict_proba(X_test)[:, 1]])
    ans = np.tanh(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Tanh answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    ans = np.sign(-10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3])
    print("Signum answer:", sum([weight(event) * answer for event, answer in zip(y_test, ans)]) / sum([abs(weight(event)) for event in y_test]))
    del model1
    del model2
    del model3
    del ps
    del ans

n=150
Binary scores: 0.9873925554284871 0.6598018961891762 0.8948518832773822
Multiclass score: 0.5672888921680747
Tanh answer: 0.18308748881585615
Signum answer: 0.32087414260704306
n=300


KeyboardInterrupt: ignored

In [27]:
# Fit the three binary CatBoost models (event > 1, event > 2, event > 3) with
# 1500 iterations and keep them in scope: later cells use model1..model3 to
# score the test dataset.
n = 1500
model1 = CatBoostClassifier(iterations=n, task_type='GPU', devices='0')
model2 = CatBoostClassifier(iterations=n, task_type='GPU', devices='0')
model3 = CatBoostClassifier(iterations=n, task_type='GPU', devices='0')
model1.fit(X_train_OS, (y_train_OS > 1).astype(float))
print("1 fit successful")
model2.fit(X_train_OS, (y_train_OS > 2).astype(float))
print("2 fit successful")
model3.fit(X_train_OS, (y_train_OS > 3).astype(float))
print("3 fit successful")
print("Binary scores:", accuracy_score(model1.predict(X_test), y_test > 1), accuracy_score(model2.predict(X_test), y_test > 2), accuracy_score(model3.predict(X_test), y_test > 3))
print("Multiclass score:", accuracy_score(pred(X_test), class_value(y_test > 1, y_test > 2, y_test > 3)))
# Score each model exactly once (previously predict_proba ran twice per model).
p1 = model1.predict_proba(X_test)[:, 1]
p2 = model2.predict_proba(X_test)[:, 1]
p3 = model3.predict_proba(X_test)[:, 1]
ps = np.array([1 - p1, p1 - p2, p2 - p3, p3])
# Linear score using the same coefficients as weight().
score = -10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3]
# Per-sample metric weights and their normaliser (sum of absolute weights).
event_weights = np.array([weight(event) for event in y_test])
weight_norm = np.abs(event_weights).sum()
ans = np.tanh(score)
print("Tanh answer:", np.dot(event_weights, ans) / weight_norm)
ans = np.sign(score)
print("Signum answer:", np.dot(event_weights, ans) / weight_norm)

Learning rate set to 0.073467
0:	learn: 0.6658964	total: 84.7ms	remaining: 2m 6s
1:	learn: 0.6440331	total: 179ms	remaining: 2m 14s
2:	learn: 0.6234848	total: 278ms	remaining: 2m 18s
3:	learn: 0.6069639	total: 368ms	remaining: 2m 17s
4:	learn: 0.5923875	total: 460ms	remaining: 2m 17s
5:	learn: 0.5799548	total: 555ms	remaining: 2m 18s
6:	learn: 0.5691078	total: 658ms	remaining: 2m 20s
7:	learn: 0.5602103	total: 770ms	remaining: 2m 23s
8:	learn: 0.5516151	total: 887ms	remaining: 2m 27s
9:	learn: 0.5446513	total: 985ms	remaining: 2m 26s
10:	learn: 0.5386450	total: 1.1s	remaining: 2m 29s
11:	learn: 0.5332410	total: 1.2s	remaining: 2m 29s
12:	learn: 0.5266192	total: 1.31s	remaining: 2m 29s
13:	learn: 0.5227307	total: 1.41s	remaining: 2m 30s
14:	learn: 0.5186031	total: 1.53s	remaining: 2m 31s
15:	learn: 0.5156439	total: 1.64s	remaining: 2m 31s
16:	learn: 0.5127059	total: 1.71s	remaining: 2m 29s
17:	learn: 0.5101964	total: 1.8s	remaining: 2m 28s
18:	learn: 0.5072887	total: 1.9s	remaining: 2m 

In [0]:
# Drop the references to the training-stage frames/arrays in one statement so
# their memory can be reclaimed; the fitted models are kept.
del data, X, y, X_train_OS, y_train_OS

In [29]:
# Echo model1 to confirm the fitted model object is still defined.
model1

<catboost.core.CatBoostClassifier at 0x7f09719eda58>

In [30]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
link = 'https://drive.google.com/open?id=1h9zZM-wclbjIDWLIvSYtMNpot7rMDPUR'
fluff, id = link.split('=')
print (id)
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test_dataset.csv')  
data = pd.read_csv('test_dataset.csv')
# Dataset is now stored in a Pandas Dataframe

1h9zZM-wclbjIDWLIvSYtMNpot7rMDPUR


In [31]:
# Preview the first rows of the freshly loaded test frame.
data.head()

Unnamed: 0.1,Unnamed: 0,answer_id,age,full_years,gender,product_0_CLS,product_0_NaN,product_0_UTL,product_1_CLS,product_1_NaN,product_1_OPN,product_1_UTL,product_2_CLS,product_2_NaN,product_2_OPN,product_2_UTL,product_3_CLS,product_3_NaN,product_3_UTL,product_4_CLS,product_4_NaN,product_4_OPN,product_4_UTL,product_5_CLS,product_5_NaN,product_5_OPN,product_5_UTL,product_6_CLS,product_6_NaN,product_6_OPN,product_6_UTL,marital_status_cd_CIV,marital_status_cd_DIV,marital_status_cd_DLW,marital_status_cd_MAR,marital_status_cd_NaN,marital_status_cd_UNM,marital_status_cd_WID,job_position_cd_1,job_position_cd_2,...,coord260,coord261,coord262,coord263,coord264,coord265,coord266,coord267,coord268,coord269,coord270,coord271,coord272,coord273,coord274,coord275,coord276,coord277,coord278,coord279,coord280,coord281,coord282,coord283,coord284,coord285,coord286,coord287,coord288,coord289,coord290,coord291,coord292,coord293,coord294,coord295,coord296,coord297,coord298,coord299
0,0,0,25.0,2.0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,...,0.005123,0.024197,-0.058282,0.007024,-0.039734,0.029438,-0.006146,-0.028294,-0.040146,0.00715,0.000429,0.029343,0.008134,0.03695,-0.009188,0.056968,0.052076,0.015773,-0.007375,0.00701,-0.049727,-0.02625,-0.01147,-0.02862,-0.002693,0.006334,-0.004347,-0.054856,0.011038,0.037431,0.00924,0.020367,0.003448,0.011735,-0.025896,-0.040107,0.023583,0.030698,-0.022525,0.002432
1,1,1,25.0,2.0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,...,-0.024783,0.021015,-0.050751,0.013025,-0.030274,0.019388,0.007604,-0.033712,-0.026759,-0.028097,0.025944,0.028333,0.019599,0.064329,-0.011714,0.03699,0.026414,0.002427,-0.00324,-0.023047,-0.037153,-0.009166,-0.018715,-0.013719,-0.003098,0.011552,-0.008348,-0.035734,-0.013084,-0.004529,0.010601,-0.000924,-0.015405,0.02268,-0.031062,-0.022008,0.02989,0.006448,-0.010427,0.010316
2,2,2,25.0,1.0,1,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,...,-0.015661,0.038601,-0.066301,-0.000626,-0.02559,0.023087,0.005542,-0.025718,-0.001172,0.020753,0.01315,0.041343,0.013823,0.01725,-0.014985,0.005705,0.015268,0.015882,0.044541,0.022796,-0.051697,-0.030849,-0.019309,-0.00138,-0.005866,0.000532,0.003278,0.022488,0.01998,0.024087,-0.01466,0.020797,-0.008639,0.005238,-0.005248,-0.01452,0.047162,0.001721,-0.0488,0.012543
3,3,3,30.0,1.0,0,0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,...,-0.022959,0.017009,-0.062764,-0.014338,-0.033903,0.016967,-0.020229,-0.011473,-0.001042,-0.012925,0.022262,0.028878,-0.000129,0.028645,0.023044,0.019966,-0.003169,0.023608,0.029418,0.006427,-0.018562,0.001513,0.007357,-0.006433,0.029866,-0.006083,-0.013533,0.009892,-0.012969,0.035268,-0.000867,0.003601,-0.003817,-0.015421,-0.032705,-0.01762,0.037112,0.005482,-0.051731,-0.008525
4,4,4,30.0,7.0,1,0,0,1,0,1,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,...,-0.008125,0.004922,-0.048499,0.020274,-0.039889,0.051245,-0.007227,-0.037216,-0.029321,-0.008985,0.007017,0.021847,0.021325,0.036199,-0.006801,0.05692,0.052238,0.018618,-0.015878,0.009901,-0.037796,-0.025908,-0.004817,-0.031321,0.000944,-0.002224,-0.021467,-0.027055,-0.00452,0.04099,0.008177,0.017992,-0.002303,0.007081,-0.033907,-0.047662,0.026817,0.030919,-0.018454,0.025909


In [0]:
# Remove the 'Unnamed: 0' column. Rebinding `data` and ignoring an
# already-removed column keeps this cell safe to re-run; the in-place drop
# raised KeyError on a second execution.
data = data.drop(columns='Unnamed: 0', errors='ignore')
# Keep the row identifiers aside and use every other column as features.
answer_ids = data.answer_id
X = data.drop('answer_id', axis=1)

In [33]:
# Shape the frame would have if rows containing any NaN were removed
# (`data` itself is not modified by this expression).
data.dropna().shape

(169838, 579)

In [34]:
# Impute each missing value with the mean of its own column, in place, then
# display the shape that remains once NaN rows are dropped.
column_means = data.mean()
data.fillna(value=column_means, inplace=True)
data.dropna().shape

(172049, 579)

In [0]:
# Rebuild the feature matrix and the id column from the imputed frame.
X = data.drop(columns=['answer_id'])
answer_ids = data['answer_id']

In [0]:
# Score each model exactly once (previously predict_proba ran twice per model).
p1 = model1.predict_proba(X)[:, 1]
p2 = model2.predict_proba(X)[:, 1]
p3 = model3.predict_proba(X)[:, 1]
# Rows: 1 - p1, p1 - p2, p2 - p3, p3 (differences of the three positive-class probabilities).
ps = np.array([1 - p1, p1 - p2, p2 - p3, p3])
# Linear score using the same coefficients as weight(); computed once and
# shared by both answer variants.
score = -10 * ps[0] - 0.1 * ps[1] + 0.1 * ps[2] + 0.5 * ps[3]
th_ans = np.tanh(score)
sgn_ans = np.sign(score)

In [0]:
# Assemble both answer variants into one frame indexed by answer_id.
answer_df = pd.DataFrame(
    data={'tanh': th_ans, 'sgn': sgn_ans},
    index=answer_ids,
)

In [38]:
# Preview only the first rows instead of echoing the whole answer frame.
answer_df.head(10)

Unnamed: 0_level_0,tanh,sgn
answer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.203695,-1.0
1,-0.111093,-1.0
2,-0.840543,-1.0
3,-0.085432,-1.0
4,-0.123665,-1.0
5,0.043137,1.0
6,-0.001735,-1.0
7,-0.157245,-1.0
8,-0.166007,-1.0
9,-0.246131,-1.0


In [39]:
# Import the Colab drive module under an alias: binding it to `drive` replaced
# the PyDrive client of the same name, so re-running a download cell
# (drive.CreateFile) after this one failed.
from google.colab import drive as colab_drive
# Mount Google Drive at the relative path 'drive'.
colab_drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at drive


In [0]:
# Write the answers to a local CSV (the answer_id index is written as well).
answer_df.to_csv('answer_CB_1500iters.csv')
# Copy the file into the mounted Google Drive folder via a shell command.
!cp answer_CB_1500iters.csv drive/My\ Drive/