In [74]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
seed = 42
import math

import seaborn as sns # statistical data visualization

In [75]:
train = pd.read_csv("../datasets/xgb-train.csv")
test = pd.read_csv("../datasets/xgb-test.csv")

In [76]:
import re
train = train.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test = test.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [77]:
target = "target"
features = list(train.columns)
features.remove(target)
features.remove("Opportunity_ID")

#VANILLA NO
# features.remove("delivery_delay")
# features.remove("opportunity_lifetime")
# features.remove("converted_taxable_amount")
# features.remove("last_modified_to_delivery")
# features.remove("currency_conversion_rate")
# features.remove("Occur")
# features.remove("delivery_window")
# features.remove("account_creation_to_created_opp")

#VANILLA SI
features.remove('Total_Taxable_Amount')
#features.remove('ASP_(converted)')
features.remove("ASP")
features.remove("Total_Amount")
features.remove("Delivery_Year")
# features.remove("Week_Day")

#CONSIDERO REMOVIBLES
# features.remove("created_blocknum")   #FECHA!!
features.remove("late_delivery_blocknum")
features.remove("early_delivery_blocknum")
features.remove("last_modified_blocknum")
features.remove("account_creation_blocknum") #OJO CCON HIPOTESIS DEL CLIENTE VIEEJO
print("Features totales: {}".format(len(features)))

Features totales: 116


In [78]:
X_train, y_train = train.loc[:,features], train.loc[:,target]
X_test_Opp = test.loc[:,"Opportunity_ID"]
X_test = test.loc[:,features]

print("Set de entrenamiento (size) {}".format(X_train.shape))
print("Set de testing (size) {}".format(X_test.shape))

Set de entrenamiento (size) (16947, 116)
Set de testing (size) (2551, 116)


In [79]:
percent_80 = train["created_blocknum"].describe()['75%']/0.945
beta_test = train[train["created_blocknum"]>percent_80]
beta_train = train[train["created_blocknum"]<percent_80]

In [80]:
A_train, b_train = beta_train.loc[:,features], beta_train.loc[:,target]
# A_test_Opp = beta_test.loc[:,"Opportunity_ID"]
A_test, b_test = beta_test.loc[:,features], beta_test.loc[:,target]

print("Set de entrenamiento secundario (size) {}".format(A_train.shape))
print("Set de testing secundario (size) {}".format(A_test.shape))

Set de entrenamiento secundario (size) (14078, 116)
Set de testing secundario (size) (2869, 116)


In [81]:
lgb_train = lgb.Dataset(X_train, y_train)

In [82]:
#------------------------------------------------------------------------------------------
#----------------------------- C O N F I G U R A C I O N ----------------------------------
#------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------

nfolds = 7
rounds = 160
early_stopping_rounds = 10

params = {"objective": "binary",
          'colsample_bytree': 0.3,
          'subsample': 1.0,
          'learning_rate': 0.1,
          'max_depth': 6,
          'min_child_weight': 1,
          'metric': "binary_logloss"
          }
beta_params = params.copy()
beta_params['n_estimators'] = 100


enaable_gridsearch_for_tree = True
enaable_gridsearch_for_sampling = True
enaable_gridsearch_for_learning = True
final_cv = True
enable_parcial_training = True

print("Usando nfolds={},  num_boost_round={}"
      "  y  early_stopping_rounds={}".format(nfolds, rounds, early_stopping_rounds))

#------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------
#------------------------------------------------------------------------------------------

Usando nfolds=7,  num_boost_round=160  y  early_stopping_rounds=10


In [83]:
#FUNCIONES

def find_best_param_tuple(lgb_train, params, param_tuple):
    # Define initial best params and LogLoss
    min_logloss = float("Inf")
    best_params = None
    for param0, param1 in param_tuple['grid']:
        print("CV with {}={}, {}={}".format(param_tuple['names'][0],
                                 param0,
                                 param_tuple['names'][1],
                                 param1))
        # Update our parameters
        params[param_tuple['names'][0]] = param0
        if param_tuple['names'][1]: params[param_tuple['names'][1]] = param1
        # Run CV
        cv_results = lgb.cv(
            params,
            lgb_train,
            num_boost_round=rounds,
            seed=1234,
            nfold=nfolds,
            metrics='logloss',
            early_stopping_rounds=early_stopping_rounds
        )
        # Update best LogLoss
        mean_logloss = cv_results['test-logloss-mean'].min()
        boost_rounds = cv_results['test-logloss-mean'].argmin()
        print("\tLL {} for {} rounds".format(mean_logloss, boost_rounds+1))
        if mean_logloss < min_logloss:
            min_logloss = mean_logloss
            best_params = (param0, param1)
    print("Best {},{}: {}, {}, LogLoss: {}".format(param_tuple['names'][0],param_tuple['names'][1],best_params[0], best_params[1], min_logloss))
    return best_params

Tuneo de: 
max_depth
min_child_weight

In [84]:
if enaable_gridsearch_for_tree:

    tree_params_tuple = {
        'names': ('max_depth', 'min_child_weight'),
        'grid': [
            (max_depth, min_child_weight)
            for max_depth in range(20,26,1)
            for min_child_weight in range(1,2)
        ]
    }

    best_tree_params = find_best_param_tuple(lgb_train, params, tree_params_tuple)

    params['max_depth'] = best_tree_params[0]
    params['min_child_weight'] = best_tree_params[1]

CV with max_depth=20, min_child_weight=1
[LightGBM] [Info] Number of positive: 8172, number of negative: 6354
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 14526, number of used features: 112
[LightGBM] [Info] Number of positive: 8171, number of negative: 6355
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 14526, number of used features: 112
[LightGBM] [Info] Number of positive: 8171, number of negative: 6355
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2383
[LightGBM] [Info] Number of data points in the train set: 14526, number of used features: 112
[L

ValueError: For early stopping, at least one dataset and eval metric is required for evaluation

Best max_depth,min_child_weight: 23, 1, LogLoss: 0.15631257142857144

In [None]:
if enaable_gridsearch_for_sampling:

    sample_params_tuple = {
        'names': ('subsample', 'colsample_bytree'),
        'grid': [
            (subsample, colsample_bytree)
            for subsample in [i/10. for i in range(8,11)]
            for colsample_bytree in  [i/10. for i in range(3,5)]
        ]
    }

    best_sample_params = find_best_param_tuple(lgb_train, params, sample_params_tuple)

    params['subsample'] = best_sample_params[0]
    params['colsample_bytree'] = best_sample_params[1]

In [None]:
X_valid, y_valid = train.loc[:,features], train.loc[:,target]


In [None]:
gridParams = { 
    'learning_rate': [0.005],
    'n_estimators': [40],
    'num_leaves': [16,32, 64], 
    'random_state' : [501],
    'num_boost_round' : [3000],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4], 
    }

lgb_estimator = lgb.LGBMClassifier(boosting_type = 'gbdt', 
                                   objective = 'binary',
                                   n_estimators=175, 
                                   learning_rate =  0.1,
                                   num_leaves =  31,
                                   subsample = 1.0,
                                   colsample_bytree = 0.3,
                                   eval_metric  = 'logloss',
                                   early_stopping_rounds=10)

#g_lgbm = GridSearchCV(estimator=lgb_estimator, param_grid=gridParams, n_jobs = 3, cv= 3)

lgb_model = lgb_estimator.fit(X=X_train, y=y_train, eval_set = (X_valid, y_valid))


In [90]:
ID_submit_new = test["Opportunity_ID"]
pred = lgb_model.predict_proba(X_test)
pred_df = pd.DataFrame(pred, columns=["Other","Target"])
pred_df["Opportunity_ID"] = ID_submit_new
pred_df.drop(columns="Other",inplace=True)
pred_df = pred_df.drop_duplicates("Opportunity_ID")
pred_df[["Opportunity_ID","Target"]].to_csv("../submits/lgm_tunned_classifier.csv",index=False)
pred_df[["Opportunity_ID","Target"]]

Unnamed: 0,Opportunity_ID,Target
0,10689,0.929982
3,10690,0.697446
8,10691,0.428577
9,10692,0.281747
15,10693,0.967341
...,...,...
2545,12364,0.985580
2547,12365,0.055137
2548,12366,0.027312
2549,12367,0.763994


In [86]:
y_true = pd.read_csv("../datasets/Y_TRUE.csv")
A = y_true["Target"]
A

0       0.994320
1       0.990209
2       0.669885
3       0.829021
4       0.998438
          ...   
1562    0.997522
1563    0.932449
1564    0.914969
1565    0.978395
1566    0.852016
Name: Target, Length: 1567, dtype: float64

In [87]:
B = pred_df["Target"].astype('float').values
B

array([0.92998159, 0.8973892 , 0.93294381, ..., 0.0273117 , 0.76399373,
       0.01899432])

In [None]:
from sklearn.metrics import log_loss
log_loss(round(A),B)

In [None]:
accuracy=accuracy_score(y_pred, y_test)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred)))

In [None]:
y_pred_train = clf.predict(X_train)
print('Training-set accuracy score: {0:0.4f}'. format(accuracy_score(y_train, y_pred_train)))

In [None]:
print('Training set score: {:.4f}'.format(clf.score(X_train, y_train)))

print('Test set score: {:.4f}'.format(clf.score(X_test, y_test)))

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix\n\n', cm)
print('\nTrue Positives(TP) = ', cm[0,0])
print('\nTrue Negatives(TN) = ', cm[1,1])
print('\nFalse Positives(FP) = ', cm[0,1])
print('\nFalse Negatives(FN) = ', cm[1,0])

In [None]:
cm_matrix = pd.DataFrame(data=cm, columns=['Actual Positive:1', 'Actual Negative:0'], 
                                 index=['Predict Positive:1', 'Predict Negative:0'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 8,
    'num_leaves': 100,
    'learning_rate': 0.1,
    'verbose': 0, 
    'early_stopping_round': 1000}
n_estimators=99999999

In [None]:
d_train = lgb.Dataset(X_train.values, label=y_train.values)
d_valid = lgb.Dataset(X_val.values, label=y_val.values)
watchlist = [d_valid]
reg = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=500)

In [None]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(y_val.values)
mean_absolute_error(Y_val,Y_pred)

In [None]:

test_predict = reg.predict(X_test)
f = np.vectorize(math.exp)
test_predict = f(test_predict)
test_predict
#escribir_respuesta(ids, test_predict)