In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import re
import lightgbm as lgb
import datetime as dt
import catboost as ctb
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import xgboost as xgb
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [2]:
clear = ["Opportunity_Name","ID","Last_Activity", "Brand", "Product_Type", "ASP_converted_Currency", 
          "Prod_Category_A", "Product_Category_B", "ASP_converted_Currency", "Product_Name", 
         "Delivery_Year", "Month", "TRF", "Submitted_for_Approval", "Account_Type", "Delivery_Terms", "Size", 
         "Price", "ASP_Currency", "Total_Amount_Currency", "Total_Taxable_Amount_Currency","Quote_Type", "Opportunity_Type",
         "Product_Family", "Account_Name"]
dates = ["Account_Created_Date", "Opportunity_Created_Date", "Quote_Expiry_Date", "Last_Modified_Date", 
         "Planned_Delivery_Start_Date", "Planned_Delivery_End_Date", "Last_Activity", "Actual_Delivery_Date"]
target = ["Opportunity_ID", "Stage", "Sales_Contract_No"]

In [3]:
data = pd.read_csv("Entrenamieto_ECI_2020.csv")

In [4]:
def preprocess(data, ada = False):
    
    # elimino caracteres prohibidos en los headers
    
    data = data.rename(columns = lambda x:re.sub("[^A-Za-z0-9_]+", "", x))
        
    for d in dates:
        data[d] = pd.to_datetime(data[d])

    # agrego features
    
    data["Contacts"] = data.groupby("Opportunity_ID", sort = False)["Opportunity_ID"].transform("count")
    data["Delivery_Difference"] = (data["Planned_Delivery_End_Date"] - data["Planned_Delivery_Start_Date"]).dt.days
    data["Same_Owner"] = (data.Opportunity_Owner == data.Account_Owner) & (data.Opportunity_Owner == data.Last_Modified_By)
    data["Has_Brand"] = data.Brand != "None"
    data["Has_Contract"] = data.Sales_Contract_No != "None"
    data["Different_Country"] = (data.Billing_Country != data.Territory) & (data.Territory != "None")    
    data.loc[data.TRF == 0, "TRF_Cat"] = 0
    data.loc[(1 <= data.TRF) & (data.TRF <= 7), "TRF_Cat"] = 1
    data.loc[data.TRF > 7, "TRF_Cat"] = 2
    data["Sales"] = data.groupby("Account_Name", sort = False)["Account_Name"].transform("count")
    
    data["Concrete_Offer"] = (data["Planned_Delivery_End_Date"] - data["Opportunity_Created_Date"]).dt.days
    
    data["Offer_Duration"] = (data["Quote_Expiry_Date"] - data["Opportunity_Created_Date"]).dt.days
    
    # casteo a categoricas varias columnas
    categorical = [x for x in data.columns if data[x].dtype == "object"]
    for c in categorical:       
        data[c] = data[c].astype('category')
        
    # limpio columnas
    
    data = data.drop(clear + dates, axis = 1)
    
    if "Stage" in data:
        data = data[(data.Stage == "Closed Won") | (data.Stage == "Closed Lost")]
        data.Stage = data.Stage.replace({"Closed Won": 1, "Closed Lost": 0})
    
    # numeric encode de categoricals
    
    if ada:    
        label_encoder = LabelEncoder()
        cat_vars = [x for x in data.select_dtypes("category").columns if x != "Stage"]
        for col in cat_vars:
            data[col] = label_encoder.fit_transform(data[col])

        # elimino nans. strategy --> mean
        imputer = SimpleImputer(strategy = "mean")
        cols = data.columns
        data = imputer.fit_transform(data)
        data = pd.DataFrame(data, columns = cols)

    return (data)

In [5]:
df = preprocess(data)

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df, df.Stage, test_size = 0.3, random_state = 0, stratify = df.Stage)

In [7]:
categorical = [x for x in df.drop(target, axis = 1).select_dtypes('category').columns if x != 'Stage'] + ["TRF_Cat"]
#categorical = [x for x in cat_vars if x in df.columns]
train_data = lgb.Dataset(data = x_train.drop(target, axis = 1), label = x_train.Stage, categorical_feature = categorical)
test_data =  lgb.Dataset(data = x_test.drop(target, axis = 1), label = x_test.Stage)

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': True,
    'boosting': 'gbdt',
    'num_leaves': 30,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'max_depth': 7,
    'learning_rate': 0.015,
    'max_bin': 300,
    'verbose': 0
}

model1 = lgb.train(parameters, train_data, valid_sets = test_data, num_boost_round = 5000, early_stopping_rounds = 500)

New categorical_feature is ['Account_Owner', 'Billing_Country', 'Bureaucratic_Code', 'Currency', 'Delivery_Quarter', 'Last_Modified_By', 'Opportunity_Owner', 'Region', 'Source', 'TRF_Cat', 'Territory']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[1]	valid_0's binary_logloss: 0.671856
Training until validation scores don't improve for 500 rounds
[2]	valid_0's binary_logloss: 0.65931
[3]	valid_0's binary_logloss: 0.652116
[4]	valid_0's binary_logloss: 0.640123
[5]	valid_0's binary_logloss: 0.633375
[6]	valid_0's binary_logloss: 0.626865
[7]	valid_0's binary_logloss: 0.615711
[8]	valid_0's binary_logloss: 0.609701
[9]	valid_0's binary_logloss: 0.603663
[10]	valid_0's binary_logloss: 0.593042
[11]	valid_0's binary_logloss: 0.588571
[12]	valid_0's binary_logloss: 0.58282
[13]	valid_0's binary_logloss: 0.572935
[14]	valid_0's binary_logloss: 0.563203
[15]	valid_0's binary_logloss: 0.553586
[16]	valid_0's binary_logloss: 0.544417
[17]	valid_0's binary_logloss: 0.535537
[18]	valid_0's binary_logloss: 0.530831
[19]	valid_0's binary_logloss: 0.52636
[20]	valid_0's binary_logloss: 0.521711
[21]	valid_0's binary_logloss: 0.513271
[22]	valid_0's binary_logloss: 0.505056
[23]	valid_0's binary_logloss: 0.500727
[24]	valid_0's binary_logloss:

[219]	valid_0's binary_logloss: 0.135374
[220]	valid_0's binary_logloss: 0.134582
[221]	valid_0's binary_logloss: 0.134389
[222]	valid_0's binary_logloss: 0.134149
[223]	valid_0's binary_logloss: 0.133336
[224]	valid_0's binary_logloss: 0.132545
[225]	valid_0's binary_logloss: 0.131785
[226]	valid_0's binary_logloss: 0.131029
[227]	valid_0's binary_logloss: 0.130249
[228]	valid_0's binary_logloss: 0.129574
[229]	valid_0's binary_logloss: 0.129336
[230]	valid_0's binary_logloss: 0.129106
[231]	valid_0's binary_logloss: 0.128935
[232]	valid_0's binary_logloss: 0.128747
[233]	valid_0's binary_logloss: 0.128585
[234]	valid_0's binary_logloss: 0.128373
[235]	valid_0's binary_logloss: 0.127732
[236]	valid_0's binary_logloss: 0.127055
[237]	valid_0's binary_logloss: 0.126878
[238]	valid_0's binary_logloss: 0.126152
[239]	valid_0's binary_logloss: 0.125487
[240]	valid_0's binary_logloss: 0.124799
[241]	valid_0's binary_logloss: 0.124169
[242]	valid_0's binary_logloss: 0.124018
[243]	valid_0's 

[429]	valid_0's binary_logloss: 0.0864002
[430]	valid_0's binary_logloss: 0.0862325
[431]	valid_0's binary_logloss: 0.0861808
[432]	valid_0's binary_logloss: 0.0861683
[433]	valid_0's binary_logloss: 0.0860111
[434]	valid_0's binary_logloss: 0.0860022
[435]	valid_0's binary_logloss: 0.085947
[436]	valid_0's binary_logloss: 0.0858487
[437]	valid_0's binary_logloss: 0.0858087
[438]	valid_0's binary_logloss: 0.0857445
[439]	valid_0's binary_logloss: 0.0855636
[440]	valid_0's binary_logloss: 0.0855107
[441]	valid_0's binary_logloss: 0.0854526
[442]	valid_0's binary_logloss: 0.0853413
[443]	valid_0's binary_logloss: 0.0852515
[444]	valid_0's binary_logloss: 0.0851005
[445]	valid_0's binary_logloss: 0.08498
[446]	valid_0's binary_logloss: 0.0849182
[447]	valid_0's binary_logloss: 0.084818
[448]	valid_0's binary_logloss: 0.0847755
[449]	valid_0's binary_logloss: 0.0847385
[450]	valid_0's binary_logloss: 0.0846001
[451]	valid_0's binary_logloss: 0.0845733
[452]	valid_0's binary_logloss: 0.0844

[632]	valid_0's binary_logloss: 0.0750375
[633]	valid_0's binary_logloss: 0.0750235
[634]	valid_0's binary_logloss: 0.0749955
[635]	valid_0's binary_logloss: 0.0749782
[636]	valid_0's binary_logloss: 0.074936
[637]	valid_0's binary_logloss: 0.0749097
[638]	valid_0's binary_logloss: 0.0749012
[639]	valid_0's binary_logloss: 0.074886
[640]	valid_0's binary_logloss: 0.074876
[641]	valid_0's binary_logloss: 0.0748114
[642]	valid_0's binary_logloss: 0.0748147
[643]	valid_0's binary_logloss: 0.0747675
[644]	valid_0's binary_logloss: 0.0747209
[645]	valid_0's binary_logloss: 0.0746352
[646]	valid_0's binary_logloss: 0.074602
[647]	valid_0's binary_logloss: 0.074555
[648]	valid_0's binary_logloss: 0.074563
[649]	valid_0's binary_logloss: 0.0745603
[650]	valid_0's binary_logloss: 0.0745435
[651]	valid_0's binary_logloss: 0.0745008
[652]	valid_0's binary_logloss: 0.0745177
[653]	valid_0's binary_logloss: 0.0745009
[654]	valid_0's binary_logloss: 0.0744816
[655]	valid_0's binary_logloss: 0.074464

[829]	valid_0's binary_logloss: 0.0711957
[830]	valid_0's binary_logloss: 0.0712044
[831]	valid_0's binary_logloss: 0.0711882
[832]	valid_0's binary_logloss: 0.0711653
[833]	valid_0's binary_logloss: 0.0711483
[834]	valid_0's binary_logloss: 0.0711569
[835]	valid_0's binary_logloss: 0.0711656
[836]	valid_0's binary_logloss: 0.0711796
[837]	valid_0's binary_logloss: 0.0711432
[838]	valid_0's binary_logloss: 0.0711415
[839]	valid_0's binary_logloss: 0.0711356
[840]	valid_0's binary_logloss: 0.0711295
[841]	valid_0's binary_logloss: 0.0711156
[842]	valid_0's binary_logloss: 0.0710641
[843]	valid_0's binary_logloss: 0.0710367
[844]	valid_0's binary_logloss: 0.0710406
[845]	valid_0's binary_logloss: 0.0710422
[846]	valid_0's binary_logloss: 0.0710459
[847]	valid_0's binary_logloss: 0.0710492
[848]	valid_0's binary_logloss: 0.0710276
[849]	valid_0's binary_logloss: 0.0710081
[850]	valid_0's binary_logloss: 0.071035
[851]	valid_0's binary_logloss: 0.071
[852]	valid_0's binary_logloss: 0.07097

[1032]	valid_0's binary_logloss: 0.0690781
[1033]	valid_0's binary_logloss: 0.0690332
[1034]	valid_0's binary_logloss: 0.068983
[1035]	valid_0's binary_logloss: 0.0689561
[1036]	valid_0's binary_logloss: 0.0689399
[1037]	valid_0's binary_logloss: 0.0689235
[1038]	valid_0's binary_logloss: 0.0689245
[1039]	valid_0's binary_logloss: 0.0689063
[1040]	valid_0's binary_logloss: 0.068862
[1041]	valid_0's binary_logloss: 0.0688518
[1042]	valid_0's binary_logloss: 0.0688243
[1043]	valid_0's binary_logloss: 0.0688098
[1044]	valid_0's binary_logloss: 0.0687915
[1045]	valid_0's binary_logloss: 0.068803
[1046]	valid_0's binary_logloss: 0.0688089
[1047]	valid_0's binary_logloss: 0.068823
[1048]	valid_0's binary_logloss: 0.0688306
[1049]	valid_0's binary_logloss: 0.0688264
[1050]	valid_0's binary_logloss: 0.0688117
[1051]	valid_0's binary_logloss: 0.0687975
[1052]	valid_0's binary_logloss: 0.0688031
[1053]	valid_0's binary_logloss: 0.0688181
[1054]	valid_0's binary_logloss: 0.0688291
[1055]	valid_0'

[1237]	valid_0's binary_logloss: 0.0694502
[1238]	valid_0's binary_logloss: 0.069468
[1239]	valid_0's binary_logloss: 0.0694894
[1240]	valid_0's binary_logloss: 0.0694991
[1241]	valid_0's binary_logloss: 0.0694615
[1242]	valid_0's binary_logloss: 0.069454
[1243]	valid_0's binary_logloss: 0.0694558
[1244]	valid_0's binary_logloss: 0.0694721
[1245]	valid_0's binary_logloss: 0.0694215
[1246]	valid_0's binary_logloss: 0.0694246
[1247]	valid_0's binary_logloss: 0.0694223
[1248]	valid_0's binary_logloss: 0.0694033
[1249]	valid_0's binary_logloss: 0.0694467
[1250]	valid_0's binary_logloss: 0.0694688
[1251]	valid_0's binary_logloss: 0.0694788
[1252]	valid_0's binary_logloss: 0.0694672
[1253]	valid_0's binary_logloss: 0.0694619
[1254]	valid_0's binary_logloss: 0.0694932
[1255]	valid_0's binary_logloss: 0.0694948
[1256]	valid_0's binary_logloss: 0.0695274
[1257]	valid_0's binary_logloss: 0.0695495
[1258]	valid_0's binary_logloss: 0.0695671
[1259]	valid_0's binary_logloss: 0.0695501
[1260]	valid_

[1432]	valid_0's binary_logloss: 0.0718219
[1433]	valid_0's binary_logloss: 0.0718531
[1434]	valid_0's binary_logloss: 0.071873
[1435]	valid_0's binary_logloss: 0.0718754
[1436]	valid_0's binary_logloss: 0.0719063
[1437]	valid_0's binary_logloss: 0.0719358
[1438]	valid_0's binary_logloss: 0.0719531
[1439]	valid_0's binary_logloss: 0.0719939
[1440]	valid_0's binary_logloss: 0.0720306
[1441]	valid_0's binary_logloss: 0.0720012
[1442]	valid_0's binary_logloss: 0.0720095
[1443]	valid_0's binary_logloss: 0.0719696
[1444]	valid_0's binary_logloss: 0.0719696
[1445]	valid_0's binary_logloss: 0.071973
[1446]	valid_0's binary_logloss: 0.0720044
[1447]	valid_0's binary_logloss: 0.0719637
[1448]	valid_0's binary_logloss: 0.0719469
[1449]	valid_0's binary_logloss: 0.0719324
[1450]	valid_0's binary_logloss: 0.0719085
[1451]	valid_0's binary_logloss: 0.0718864
[1452]	valid_0's binary_logloss: 0.07188
[1453]	valid_0's binary_logloss: 0.0718902
[1454]	valid_0's binary_logloss: 0.0718922
[1455]	valid_0'

In [8]:
model0 = lgb.LGBMClassifier(n_estimators = 1000,
                            learning_rate = 0.015,
                            colsample_bytree = 0.5,
                            objective = "binary",
                            max_depth = 6, 
                            eval_metric = 'Logloss',
                            random_state = 42)

In [9]:
model0.fit(x_train.drop(target, axis = 1), y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.5,
        eval_metric='Logloss', importance_type='split',
        learning_rate=0.015, max_depth=6, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=42,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [12]:
y_pred = model0.predict_proba(x_test.drop(target, axis = 1))[:,1]

In [13]:
log_loss(y_test, y_pred)

0.07104022543659584

In [14]:
validation_file = "Validacion_ECI_2020.csv"
vali = pd.read_csv(validation_file)
validation = preprocess(vali)
leak = ["Opportunity_ID", "Sales_Contract_No"]
pred = model1.predict(validation.drop(leak, axis = 1))

pred = pd.DataFrame(pred, index = validation.index, columns = ["Prediction"])
val_booster = validation.join(pred)

answer = pd.DataFrame(val_booster.groupby("Opportunity_ID", as_index = False)["Prediction"].mean())

In [21]:
prev = pd.read_csv("acceptable/submission_11.csv", names = ["Opportunity_ID", "Prediction"])
prev["Prediction"].corr(answer["Prediction"])

0.9933923480363895

In [22]:
prev = pd.read_csv("acceptable/submission_23.csv", names = ["Opportunity_ID", "Prediction"])
prev["Prediction"].corr(answer["Prediction"])

0.9931464106713234

In [23]:
prev = pd.read_csv("submisson_ensamble_6.csv", names = ["Opportunity_ID", "Prediction"])
prev["Prediction"].corr(answer["Prediction"])

0.9944970207744297

In [24]:
answer.to_csv("lgbclass_intento.csv", index = False, header = False)

In [20]:
pred = model0.predict_proba(validation.drop(leak, axis = 1))[:,1]

pred = pd.DataFrame(pred, index = validation.index, columns = ["Prediction"])
val_classi = validation.join(pred)

answer = pd.DataFrame(val_classi.groupby("Opportunity_ID", as_index = False)["Prediction"].mean())