In [21]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from xgboost import XGBClassifier



## Process Training Data

In [2]:
def checkthisvb(vb, data=None):
    """Summarise the risk rate and row count for each value of ``vb``.

    Parameters
    ----------
    vb : str or list of str
        Column name(s) to group by.
    data : pandas.DataFrame, optional
        Frame containing ``vb`` and an ``is_risk`` column. When omitted, the
        notebook-level ``origdata`` frame is used, so that frame must already
        have been loaded before calling this helper without ``data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``vb`` with two columns: ``riskprob`` (mean
        of ``is_risk`` in the group) and ``count`` (number of non-null
        ``is_risk`` rows in the group), sorted by ``riskprob`` descending.
    """
    if data is None:
        # Backward-compatible fallback to the notebook global.
        data = origdata
    spm = data.groupby(vb)['is_risk'].agg(['mean', 'count'])
    spm = spm.rename(columns={"mean": "riskprob"})
    # Sort on the renamed column; the previous sort key "is_risk" no longer
    # existed after the rename and raised a KeyError.
    spm = spm.sort_values(by="riskprob", ascending=False)
    return spm
def changehour(x):
    """Collapse the ``hour`` field of a record into a three-level code.

    Returns 3 when the hour is between 3 and 5 inclusive, 2 when it is
    exactly 2 or 6, and 1 for every other value.
    """
    hour = x['hour']
    is_three_to_five = hour >= 3 and hour <= 5
    if is_three_to_five:
        return 3
    is_two_or_six = hour == 2 or hour == 6
    return 2 if is_two_or_six else 1

In [17]:
def trainprocess(data):
    """Engineer model features for the training frame.

    Steps, in order:

    1. Replace ``hour`` with the bucket code returned by ``changehour``.
    2. For each of ``device``, ``id``, ``city``, ``ip``, ``log_from`` and
       ``result``, attach the mean of ``is_risk`` over the rows sharing that
       value as ``dev``, ``idp``, ``ct``, ``ipa``, ``log`` and ``res``.
       ``device``, ``id`` and ``city`` are dropped afterwards; ``ct`` is
       binarised at ``> 0.07`` and ``ipa`` / ``log`` at ``> 0``.
    3. One-hot encode ``type``, ``result``, ``weekdays`` and ``log_from`` and
       drop the source columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named above plus ``is_risk``. The caller's
        frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the engineered columns appended.

    Notes
    -----
    NOTE(review): the group means are computed from the same rows they are
    attached to, so every row's own label contributes to its ``dev`` / ``idp``
    / ``res`` values. Consider out-of-fold encoding if this leaks the target.
    """
    # Map the column directly instead of building a Series per row; assign()
    # returns a new frame so the caller's data is not altered.
    data = data.assign(hour=data["hour"].map(lambda h: changehour({"hour": h})))

    # (key column, new feature name, drop key afterwards, binarisation threshold)
    encodings = [
        ("device", "dev", True, None),
        ("id", "idp", True, None),
        ("city", "ct", True, 0.07),
        ("ip", "ipa", False, 0),
        ("log_from", "log", False, 0),
        ("result", "res", False, None),
    ]
    for key, feature, drop_key, threshold in encodings:
        rate = data.groupby(key)['is_risk'].mean().to_frame(feature)
        data = data.merge(rate, left_on=key, right_index=True, how="left")
        if drop_key:
            # Keyword form: positional ``axis`` is rejected by pandas >= 2.0.
            data = data.drop(columns=key)
        if threshold is not None:
            # NaN compares False, so missing rates become 0 as before.
            data[feature] = (data[feature] > threshold).astype(int)

    columns_one_hot = ['type', 'result', 'weekdays', "log_from"]
    for col in columns_one_hot:
        data = data.join(pd.get_dummies(data[col], prefix=col))
    data = data.drop(columns=columns_one_hot)
    return data


In [18]:
# Read the raw training CSV, run feature engineering, and save the result
# for the modelling step.
traindata = trainprocess(pd.read_csv("Dec10/train11.csv", index_col=0))
traindata.to_csv('train_aft.csv', sep=',', encoding='utf-8')

## Process Testing Data

In [13]:
def testprocess(data, origdata):
    data['hour'] = data.apply(changehour, axis = 1)

    dv = pd.DataFrame({"dev":origdata.groupby("device")['is_risk'].mean()})
    data = data.merge(dv, left_on="device", right_index=True, how = "left")
    data.drop("device", 1, inplace=True)
    # data['dev'] = data.apply(lambda x : 1 if x["dev"] > 0.7 else 0, axis = 1)

    iddt = pd.DataFrame({"idp":origdata.groupby("id")['is_risk'].mean()})
    data = data.merge(iddt, left_on="id", right_index=True, how = "left")
    data.drop("id", 1, inplace=True)

    city = pd.DataFrame({"ct":origdata.groupby("city")['is_risk'].mean()})
    data = data.merge(city, left_on="city", right_index=True, how = "left")
    data.drop("city", 1, inplace=True)
    data['ct'] = data.apply(lambda x : 1 if x["ct"] > 0.07 else 0, axis = 1)

    ip = pd.DataFrame({"ipa":origdata.groupby("ip")['is_risk'].mean()})
    data = data.merge(ip, left_on="ip", right_index=True, how = "left")
    # data.drop("ip", 1, inplace=True)
    data['ipa'] = data.apply(lambda x : 1 if x["ipa"] > 0 else 0, axis = 1)

    log = pd.DataFrame({"log":origdata.groupby("log_from")['is_risk'].mean()})
    data = data.merge(log, left_on="log_from", right_index=True, how = "left")
    # data.drop("log_from", 1, inplace=True)
    data['log'] = data.apply(lambda x : 1 if x["log"] > 0 else 0, axis = 1)

    # data.weekdays = data.weekdays.replace([1,0,2,4,5,6,3], [0,0,0,1,0,0,0])
    res = pd.DataFrame({"res":origdata.groupby("result")['is_risk'].mean()})
    data = data.merge(res, left_on="result", right_index=True, how = "left")
    columns_one_hot = ['type', 'result','weekdays', "log_from"]
    for col in columns_one_hot:      
        data = data.join(pd.get_dummies(data[col], prefix=col))
    data.drop(columns_one_hot, axis=1, inplace=True)
    return data

In [15]:
# The labelled training file supplies the per-group risk statistics that
# testprocess() looks up for the test rows.
origdata = pd.read_csv("Dec10/train11.csv", index_col=0)
testdata = testprocess(pd.read_csv("Dec10/test11.csv", index_col=0), origdata)
testdata.to_csv('test_aft.csv', sep=',', encoding='utf-8')

## Modelling

In [27]:
def model(train, test, output_path="Dec10/test1xg.csv"):
    """Fit an XGBoost classifier on ``train`` and score ``test``.

    Parameters
    ----------
    train : pandas.DataFrame
        Engineered training frame. Must contain ``is_risk`` plus every column
        present in ``test``.
    test : pandas.DataFrame
        Engineered test frame. Must contain ``dev``, ``idp``, ``ct``,
        ``time_dif``, ``time_pd_x`` and ``time_pd_y``. The caller's frame is
        not modified.
    output_path : str, optional
        Where the scored test frame is written as CSV. Defaults to the
        original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        A copy of ``test`` with missing ``dev`` / ``idp`` / ``ct`` imputed,
        ``time_pd_x`` / ``time_pd_y`` converted to timestamps, and two added
        columns: ``dif`` (whole days between the two timestamps) and ``prxg``
        (predicted probability of the positive class).
    """
    # Impute missing test-set rates with the mean of the strictly positive
    # training values of the same column. fillna() returns a new frame, so
    # the assignments below never touch the caller's data.
    fill_values = {
        name: train.loc[train[name] > 0, name].mean()
        for name in ("dev", "idp", "ct")
    }
    test = test.fillna(fill_values)

    # Align the training features to the test columns (same set, same order).
    # copy() gives an independent frame so the assignments below are not made
    # on a slice of ``train``.
    feature_cols = test.columns.tolist()
    X_train = train.drop(columns=["is_risk"])[feature_cols].copy()
    Y_train = train.is_risk

    for frame in (X_train, test):
        for col in ("time_pd_x", "time_pd_y"):
            # Element-wise parse; unlike a .loc lookup per index label this
            # also works when the index contains duplicates.
            frame[col] = frame[col].map(pd.Timestamp)
        frame["dif"] = (frame["time_pd_x"] - frame["time_pd_y"]).dt.days

    # Keyword form: positional ``axis`` is rejected by pandas >= 2.0.
    not_features = ["time_dif", "time_pd_x", "time_pd_y"]
    X_train = X_train.drop(columns=not_features)
    X_test = test.drop(columns=not_features)

    xg = XGBClassifier(learning_rate =0.11,
     n_estimators=150,
     max_depth=10,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=27)
    xg.fit(X_train, Y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    test['prxg'] = xg.predict_proba(X_test)[:, 1]
    test.to_csv(output_path)
    return test


In [28]:
# Reload the engineered frames written by the processing cells, then fit
# the classifier and score the test set.
train, test = (
    pd.read_csv(path, index_col=0)
    for path in ("train_aft.csv", "test_aft.csv")
)
result = model(train, test)

In [29]:
# Display the predicted positive-class probabilities that model() stored in
# the `prxg` column of the scored test frame.
result.prxg

rowkey
24622     0.000031
39036     0.000042
62674     0.000041
163053    0.000054
50616     0.000045
68355     0.000244
23239     0.000004
94869     0.000003
79351     0.007103
81268     0.139674
172474    0.000029
55344     0.000463
111852    0.000429
111911    0.000429
1124      0.000011
61060     0.000011
83879     0.000044
109683    0.000004
38326     0.000059
43939     0.000029
69235     0.000029
125353    0.000030
54412     0.000333
154863    0.000004
101241    0.000635
66271     0.000051
166319    0.000043
118449    0.000404
119214    0.000404
136995    0.000176
            ...   
78404     0.000008
164721    0.000004
82822     0.000069
82864     0.000069
108438    0.000219
148460    0.000422
155564    0.000542
4236      0.000006
58202     0.000007
9028      0.000337
62456     0.000126
42239     0.000020
85560     0.000004
91511     0.000005
145621    0.000005
46113     0.000006
60326     0.000005
128092    0.000029
128215    0.000029
153848    0.000028
22946     0.000007
13835