In [6]:
import lightgbm as lgbm
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder


def Gini(y_true, y_pred):
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # sort rows on prediction column
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:, 0].argsort()][::-1, 0]
    pred_order = arr[arr[:, 1].argsort()][::-1, 0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) * 1. / np.sum(true_order)
    L_pred = np.cumsum(pred_order) * 1. / np.sum(pred_order)
    L_ones = np.linspace(1 / n_samples, 1, n_samples)

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return G_pred * 1. / G_true

In [28]:
cv_Only = True
save_cv  = True
full_train = False

In [29]:

def evalerror(preds, dtrain):
    labels = dtrain.get_labels()
    return 'gini', Gini(labels,)


In [30]:

train = pd.read_csv('./porto-seguro-safe-driver-prediction/train.csv' , na_values=-1)
test = pd.read_csv('./porto-seguro-safe-driver-prediction/test.csv' , na_values=-1)


In [31]:
train_label = train['target']
trian_id = train['id']

test_id = test['id']

In [32]:
NFOLDS = 5

kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

In [33]:
y=train['target'].values

drop_feature= [
    'id',
    'target'
]

In [34]:
X = train.drop(drop_feature, axis = 1)

feature_names=  X.columns.tolist()



In [35]:
# catc는 의미가 없다고 판단되어서 제거한다

In [36]:
cat_features  = [ c for c  in feature_names if ('cat' in c and 'count' not in c)]
num_features = [c for c in feature_names if ('cat' not in c and 'calc' not in c)]


# 원래 카테고리에서의 의미를 잃어버리지 않기 위해서 string으로 단순히 더한 feature를 만드는 것이다

train['missing'] = (train ==-1).sum(axis=1).astype(float)
test['missing'] = (test ==-1).sum(axis=1).astype(float)
num_features.append('missing')

In [37]:
cat_features


['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

In [38]:
le = LabelEncoder()

In [39]:

train

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,missing
0,7,0,2,2.0,5,1.0,0.0,0,1,0,...,1,5,8,0,1,1,0,0,1,0.0
1,9,0,1,1.0,7,0.0,0.0,0,0,1,...,1,1,9,0,1,1,0,1,0,0.0
2,13,0,5,4.0,9,1.0,0.0,0,0,1,...,2,7,7,0,1,1,0,1,0,0.0
3,16,0,0,1.0,2,0.0,0.0,1,0,0,...,2,4,9,0,0,0,0,0,0,0.0
4,17,0,0,2.0,0,1.0,0.0,1,0,0,...,1,1,3,0,0,0,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,1488013,0,3,1.0,10,0.0,0.0,0,0,0,...,1,9,6,0,1,1,0,1,1,0.0
595208,1488016,0,5,1.0,3,0.0,0.0,0,0,0,...,1,3,8,1,0,1,0,1,1,0.0
595209,1488017,0,1,1.0,10,0.0,0.0,1,0,0,...,2,2,6,0,0,1,0,0,0,0.0
595210,1488021,0,5,2.0,3,1.0,0.0,0,0,1,...,1,4,2,0,1,1,1,0,0,0.0


In [40]:

# label 의 unique count 만큼 encoding을 해준다
for c in cat_features:
    le = LabelEncoder()
    le.fit(train[c])
    train[c] = le.transform(train[c])
    test[c] = le.transform(test[c])

In [41]:

train

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,missing
0,7,0,2,1,5,1,0,0,1,0,...,1,5,8,0,1,1,0,0,1,0.0
1,9,0,1,0,7,0,0,0,0,1,...,1,1,9,0,1,1,0,1,0,0.0
2,13,0,5,3,9,1,0,0,0,1,...,2,7,7,0,1,1,0,1,0,0.0
3,16,0,0,0,2,0,0,1,0,0,...,2,4,9,0,0,0,0,0,0,0.0
4,17,0,0,1,0,1,0,1,0,0,...,1,1,3,0,0,0,1,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,1488013,0,3,0,10,0,0,0,0,0,...,1,9,6,0,1,1,0,1,1,0.0
595208,1488016,0,5,0,3,0,0,0,0,0,...,1,3,8,1,0,1,0,1,1,0.0
595209,1488017,0,1,0,10,0,0,1,0,0,...,2,2,6,0,0,1,0,0,0,0.0
595210,1488021,0,5,1,3,1,0,0,0,1,...,1,4,2,0,1,1,1,0,0,0.0


In [42]:
pd.concat([(train==-1).sum(axis=1), train['target']],axis=1).groupby(0).mean()

Unnamed: 0_level_0,target
0,Unnamed: 1_level_1
0,0.036448


In [43]:
le = LabelEncoder()
le.fit(train[c])

LabelEncoder()

In [44]:
enc = OneHotEncoder()
enc.fit(train[cat_features])
X_cat = enc.transform(train[cat_features])
X_t_cat =  enc.transform(test[cat_features])


In [46]:
ind_features = [c for c in feature_names if 'ind' in c ]

count=0

In [47]:
for c  in ind_features:
    if count ==0:
        train['new_ind'] = train[c].astype(str)+'_'
        test['new_ind'] = test[c].astype(str)+'_'
        count+=1
    else:
        train['new_ind'] += train[c].astype(str)+'_'
        test['new_ind'] += test[c].astype(str)+'_'

In [48]:
cat_count_features = []

In [49]:
train['new_ind'].value_counts().shape

(93282,)

In [50]:
cat_count_features = []

for c in cat_features + ['new_ind']:
    d =pd.concat([train[c], test[c]]).value_counts().to_dict()
    train['%s_count'%c] = train[c].apply(lambda x:d.get(x,0))
    test['%s_count'%c] = test[c].apply(lambda x:d.get(x,0))
    cat_count_features.append('%s_count'%c)

In [51]:
train_list = train[num_features + cat_count_features]

In [52]:
# to sparse matrix hstack 하기위하여 trainlist를 이와같은 형태로 만든다 compress sparse row -> coordianate format -> csr
train_list  = [train[num_features + cat_count_features].values ,X_cat,]
test_list = [test[num_features + cat_count_features].values, X_t_cat,]

In [53]:
train[num_features + cat_count_features]

Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_car_03_cat_count,ps_car_04_cat_count,ps_car_05_cat_count,ps_car_06_cat_count,ps_car_07_cat_count,ps_car_08_cat_count,ps_car_09_cat_count,ps_car_10_cat_count,ps_car_11_cat_count,new_ind_count
0,2,5,0,1,0,0,0,0,0,0,...,1028142,1241334,431560,77845,1383070,249663,486510,1475460,18326,6
1,1,7,0,0,1,0,0,0,0,0,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,12535,36
2,5,9,0,0,1,0,0,0,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,19943,24
3,0,2,1,0,0,0,0,0,0,0,...,183044,1241334,431560,329890,1383070,1238365,36798,1475460,212989,2784
4,0,0,1,0,0,0,0,0,0,0,...,1028142,1241334,666910,147714,1383070,1238365,883326,1475460,26161,258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595207,3,10,0,0,0,1,0,0,0,0,...,1028142,1241334,431560,295574,1383070,1238365,486510,1475460,13143,117
595208,5,3,0,0,0,1,0,0,0,0,...,1028142,1241334,666910,54151,76138,249663,883326,1475460,2722,153
595209,1,10,1,0,0,0,0,0,0,0,...,1028142,1241334,666910,295574,1383070,1238365,883326,1475460,13143,382
595210,5,3,0,0,1,0,0,0,0,0,...,1028142,1241334,666910,329890,1383070,1238365,883326,1475460,18416,65


In [54]:
X_cat

<595212x184 sparse matrix of type '<class 'numpy.float64'>'
	with 8332968 stored elements in Compressed Sparse Row format>

In [55]:
train_list

[array([[2.00000e+00, 5.00000e+00, 0.00000e+00, ..., 1.47546e+06,
         1.83260e+04, 6.00000e+00],
        [1.00000e+00, 7.00000e+00, 0.00000e+00, ..., 1.47546e+06,
         1.25350e+04, 3.60000e+01],
        [5.00000e+00, 9.00000e+00, 0.00000e+00, ..., 1.47546e+06,
         1.99430e+04, 2.40000e+01],
        ...,
        [1.00000e+00, 1.00000e+01, 1.00000e+00, ..., 1.47546e+06,
         1.31430e+04, 3.82000e+02],
        [5.00000e+00, 3.00000e+00, 0.00000e+00, ..., 1.47546e+06,
         1.84160e+04, 6.50000e+01],
        [0.00000e+00, 8.00000e+00, 1.00000e+00, ..., 1.47546e+06,
         1.17740e+04, 6.99000e+02]]),
 <595212x184 sparse matrix of type '<class 'numpy.float64'>'
 	with 8332968 stored elements in Compressed Sparse Row format>]

In [56]:
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

In [57]:
# ind feature들을 하나로 묶어서 새로운 카테고리를 생성
# 각 카테고리가 몇개씩 나오는 지로 frequency encoding

train[c].apply(lambda x: d.get(x,0))

0            6
1           36
2           24
3         2784
4          258
          ... 
595207     117
595208     153
595209     382
595210      65
595211     699
Name: new_ind, Length: 595212, dtype: int64

In [58]:
#Model development


learning_rate = 0.1
num_leaves = 15
min_data_ind_leaf = 2000
feature_fraction = 0.6
num_boost_round = 10000


params = {'objective' : 'binary' ,
            'boosting_type' : 'gbdt',
             'learning_rate' : learning_rate,
                'num_leavers' : num_leaves,
                'mix_bin' : 256,
                'feature_fraction' : feature_fraction,
                'verbosity':0,
                'drop_rate' : 0.1,
                'is_unbalance' : False,
                'max_drop' : 50,
                'min_child_samples' :10,
                'min_split_gain':0,
                'subsample' :0.9

          }

In [59]:


np.zeros(len(train_label))

array([0., 0., 0., ..., 0., 0., 0.])

In [60]:
x_score =  []
final_cv_train =  np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))


In [61]:
for s in np.arange(16):
    break

In [62]:
cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))

In [63]:
#return generator indices
kf = kfold.split(X, train_label)

In [64]:
best_trees  =[]
fold_scores =[]

In [65]:
for i, (train_fold, validate) in enumerate(kf):
    break

In [66]:
X_train , label_train =  X[train_fold,:], train_label[train_fold]
X_validate , label_validate = X[validate, :], train_label[validate]

NameError: name 'lgbm' is not defined

In [None]:
#%5
dtrain = lgbm.Dataset(X_train, label_train)
dvalid = lgbm.Dataset(X_validate, label_validate,reference= dtrain)


bst = lgbm.train(params,dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror,
           verbose_eval=100, early_stopping_rounds =100)

In [4]:
params['seed'] = s

NameError: name 's' is not defined

In [2]:
bst.best_iteration


NameError: name 'bst' is not defined

In [4]:
best_trees.append(bst.best_iteration)

cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

cv_train[validate] += bst.predict(X_validate)

NameError: name 'best_trees' is not defined

In [5]:
score = Gini(label_validate, cv_train[validate])

NameError: name 'Gini' is not defined

In [None]:
train = pd.read_csv(path+'train.csv')
train_label = train['target']
train_id = train['id']
test = pd.read_csv(path+'test.csv')
test_id = test['id']

In [None]:
score

In [7]:
cv_only = True
save_cv = True
full_train = False

NameError: name 'xrange' is not defined

In [None]:
# cross validation stacking

In [None]:
#

final_cv_pred = np.zeros(len(test_id))
for s in range(16):
    # cross validation temp
    cv_train = np.zeros(len(train_label))
    cv_pred = np.zeros(len(test_id))

    params['seed'] = s

    if cv_only:
        #return train idx, validate idx
        kf = kfold.split(X, train_label)

        best_trees = []
        fold_scores = []

        for i, (train_fold, validate) in enumerate(kf):
            X_train, X_validate, label_train, label_validate = \
                X[train_fold, :], X[validate, :], train_label[train_fold], train_label[validate]
            dtrain = lgbm.Dataset(X_train, label_train)
            dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
            bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                            early_stopping_rounds=100)
            best_trees.append(bst.best_iteration)

            cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)

            cv_train[validate] += bst.predict(X_validate)
            # 실제 Validate 와 학습한  bst 를 가지고 validate 를 예측한 값을 지니계수로 score구하

            score = Gini(label_validate, cv_train[validate])
            print(score)
            fold_scores.append(score)


        # test값에 대한 nfold 값으로 나눠서 평균 구한다
        cv_pred /= NFOLDS
        final_cv_train += cv_train
        # 5개의 fold모델 마다의 최종 cv_pred 더하기
        final_cv_pred += cv_pred

        print("cv score:")
        print Gini(train_label, cv_train)
        # 모델 train step 만큼 (1 step에 5번 )
        print "current score:", Gini(train_label, final_cv_train / (s + 1.)), s+1
        print(fold_scores)
        print(best_trees, np.mean(best_trees))

        x_score.append(Gini(train_label, cv_train))

print(x_score)
pd.DataFrame({'id': test_id, 'target': final_cv_pred / 16.}).to_csv('../model/lgbm3_pred_avg.csv', index=False)
pd.DataFrame({'id': train_id, 'target': final_cv_train / 16.}).to_csv('../model/lgbm3_cv_avg.csv', index=False)

In [8]:
number_dict= {}

In [9]:
number_dict["111"] = 3

In [13]:
for i in number_dict.keys():
    print(i)

111
