In [1]:
import datetime as dt
import re

import catboost as ctb
import lightgbm as lgb
import matplotlib as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
# Widen the pandas display limits so wide/long frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Columns removed from the frame by preprocess() (together with `dates`).
# Each name is listed once; "Last_Activity" also appears in `dates` below.
clear = [
    "Opportunity_Name", "ID", "Last_Activity", "Brand", "Product_Type", "ASP_converted_Currency",
    "Prod_Category_A", "Product_Category_B", "Product_Name",
    "Delivery_Year", "Month", "TRF", "Submitted_for_Approval", "Account_Type", "Delivery_Terms", "Size",
    "Price", "ASP_Currency", "Total_Amount_Currency", "Total_Taxable_Amount_Currency", "Quote_Type",
    "Opportunity_Type", "Product_Family", "Account_Name",
]

# Columns parsed with pd.to_datetime in preprocess(), used for the day-difference
# features, and then dropped.
dates = [
    "Account_Created_Date", "Opportunity_Created_Date", "Quote_Expiry_Date", "Last_Modified_Date",
    "Planned_Delivery_Start_Date", "Planned_Delivery_End_Date", "Last_Activity", "Actual_Delivery_Date",
]

# Columns dropped from the feature matrix right before fitting / predicting.
target = ["Opportunity_ID", "Stage", "Sales_Contract_No"]

In [3]:
# Raw training set, read as-is; all cleaning happens in preprocess().
data = pd.read_csv(filepath_or_buffer = "Entrenamieto_ECI_2020.csv")

In [4]:
def preprocess(data, ada = False):
    """Clean a raw opportunities frame and add the engineered features.

    The input frame is not modified: the first step (``rename``) produces a new
    frame and every later assignment is made on that copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as read from the CSV. Must contain the columns listed in the
        module-level ``dates`` and ``clear`` lists plus the columns referenced
        below (``Opportunity_ID``, ``Sales_Contract_No``, ``Territory``, ...).
    ada : bool, default False
        When True, every categorical column is label-encoded and missing values
        are mean-imputed. In that case the result is rebuilt from the imputer
        output, so it has a fresh default index.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. If a ``Stage`` column is present, only rows with
        stage "Closed Won" / "Closed Lost" are kept and ``Stage`` becomes an
        int64 column (1 = won, 0 = lost).
    """

    # Strip characters that are not allowed in the column headers.
    data = data.rename(columns = lambda x: re.sub("[^A-Za-z0-9_]+", "", x))

    for d in dates:
        data[d] = pd.to_datetime(data[d])

    # --- Engineered features -------------------------------------------------

    # Number of rows sharing the same opportunity.
    data["Contacts"] = data.groupby("Opportunity_ID", sort = False)["Opportunity_ID"].transform("count")
    data["Delivery_Difference"] = (data["Planned_Delivery_End_Date"] - data["Planned_Delivery_Start_Date"]).dt.days
    data["Same_Owner"] = (data.Opportunity_Owner == data.Account_Owner) & (data.Opportunity_Owner == data.Last_Modified_By)
    data["Has_Brand"] = data.Brand != "None"
    data["Has_Contract"] = data.Sales_Contract_No != "None"
    data["Different_Country"] = (data.Billing_Country != data.Territory) & (data.Territory != "None")

    # TRF bucket: 0 -> 0, 1..7 -> 1, >7 -> 2. Any other value (e.g. missing) stays NaN.
    # The column is created explicitly so it is always a float column.
    data["TRF_Cat"] = np.nan
    data.loc[data["TRF"] == 0, "TRF_Cat"] = 0
    data.loc[data["TRF"].between(1, 7), "TRF_Cat"] = 1
    data.loc[data["TRF"] > 7, "TRF_Cat"] = 2

    # Number of rows sharing the same account.
    data["Sales"] = data.groupby("Account_Name", sort = False)["Account_Name"].transform("count")

    data["Concrete_Offer"] = (data["Planned_Delivery_End_Date"] - data["Opportunity_Created_Date"]).dt.days

    data["Offer_Duration"] = (data["Quote_Expiry_Date"] - data["Opportunity_Created_Date"]).dt.days

    # Features tagged "fabri" in the original notebook (presumably the contributor's name).

    data["Territory_Defined"] = data.Territory != "None"
    data["Past_Quote"] = (data["Last_Modified_Date"] - data["Quote_Expiry_Date"]).dt.days

    # Cast every object column to a pandas categorical.
    categorical = [x for x in data.columns if data[x].dtype == "object"]
    for c in categorical:
        data[c] = data[c].astype('category')

    # Drop the columns that must not reach the model.
    data = data.drop(clear + dates, axis = 1)

    if "Stage" in data:
        # Keep only closed opportunities and encode the outcome as 1 (won) / 0 (lost).
        # An explicit copy avoids assigning through a filtered slice
        # (SettingWithCopyWarning), and the integer column is built directly because
        # replace() on a categorical column does not give an integer dtype on every
        # pandas version.
        closed = data["Stage"].isin(["Closed Won", "Closed Lost"])
        data = data.loc[closed].copy()
        data["Stage"] = (data["Stage"] == "Closed Won").astype("int64")

    # Numeric encoding of the categorical columns (only on request).

    if ada:
        label_encoder = LabelEncoder()
        cat_vars = [x for x in data.select_dtypes("category").columns if x != "Stage"]
        for col in cat_vars:
            # The encoder is re-fitted for each column, so codes are per-column.
            data[col] = label_encoder.fit_transform(data[col])

        # Replace NaNs with the column mean.
        imputer = SimpleImputer(strategy = "mean")
        cols = data.columns
        data = imputer.fit_transform(data)
        data = pd.DataFrame(data, columns = cols)

    return data

In [5]:
# Model-ready training frame; categorical columns are kept as pandas categoricals (ada off).
df = preprocess(data, ada = False)

In [11]:
# Split the rows by the Has_Contract flag; a separate model is trained on each part.
has_contract = df["Has_Contract"]
df_contract = df.loc[has_contract]
df_nocontract = df.loc[~has_contract]

In [13]:
# Class balance among rows that have a contract number.
df_contract["Stage"].value_counts()

1    9395
0     579
Name: Stage, dtype: int64

In [14]:
# Class balance among rows without a contract number.
df_nocontract["Stage"].value_counts()

0    6771
1     138
Name: Stage, dtype: int64

In [38]:
# Row-wise 70/30 hold-out of the contract subset. The hold-out is used both for
# early stopping and for the evaluation in the following cells.
# NOTE(review): rows that share an Opportunity_ID can land on both sides of this
# split, which may make the hold-out loss optimistic -- consider a grouped split.
x_train, x_test, y_train, y_test = train_test_split(
    df_contract, df_contract.Stage, test_size = 0.3, random_state = 0
)

# Categorical features: every pandas-categorical feature column plus the TRF bucket.
categorical = list(df.drop(target, axis = 1).select_dtypes('category').columns) + ["TRF_Cat"]

train_data = lgb.Dataset(data = x_train.drop(target, axis = 1), label = y_train, categorical_feature = categorical)
# Tie the validation set to the training set explicitly via `reference`.
test_data = lgb.Dataset(data = x_test.drop(target, axis = 1), label = y_test, reference = train_data)

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': True,
    'boosting': 'gbdt',
    'num_leaves': 30,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.6,
    'bagging_freq': 20,
    'learning_rate': 0.015,
    'max_bin': 300,
    'verbose': 0
}

# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4, while the callback is accepted by
# both older and newer releases.
model_contract = lgb.train(
    parameters,
    train_data,
    valid_sets = [test_data],
    num_boost_round = 5000,
    callbacks = [lgb.early_stopping(stopping_rounds = 500)],
)

[1]	valid_0's binary_logloss: 0.201806
Training until validation scores don't improve for 500 rounds
[2]	valid_0's binary_logloss: 0.19754
[3]	valid_0's binary_logloss: 0.193783
[4]	valid_0's binary_logloss: 0.191341
[5]	valid_0's binary_logloss: 0.189253
[6]	valid_0's binary_logloss: 0.187392
[7]	valid_0's binary_logloss: 0.185671
[8]	valid_0's binary_logloss: 0.184283
[9]	valid_0's binary_logloss: 0.18291
[10]	valid_0's binary_logloss: 0.18185
[11]	valid_0's binary_logloss: 0.180741
[12]	valid_0's binary_logloss: 0.179944
[13]	valid_0's binary_logloss: 0.179654
[14]	valid_0's binary_logloss: 0.17863
[15]	valid_0's binary_logloss: 0.177951
[16]	valid_0's binary_logloss: 0.177349
[17]	valid_0's binary_logloss: 0.176875
[18]	valid_0's binary_logloss: 0.176525
[19]	valid_0's binary_logloss: 0.175948
[20]	valid_0's binary_logloss: 0.175583
[21]	valid_0's binary_logloss: 0.175365
[22]	valid_0's binary_logloss: 0.175295
[23]	valid_0's binary_logloss: 0.175538
[24]	valid_0's binary_logloss: 

[230]	valid_0's binary_logloss: 0.152224
[231]	valid_0's binary_logloss: 0.151952
[232]	valid_0's binary_logloss: 0.151639
[233]	valid_0's binary_logloss: 0.151302
[234]	valid_0's binary_logloss: 0.150964
[235]	valid_0's binary_logloss: 0.15075
[236]	valid_0's binary_logloss: 0.150528
[237]	valid_0's binary_logloss: 0.150164
[238]	valid_0's binary_logloss: 0.149863
[239]	valid_0's binary_logloss: 0.149485
[240]	valid_0's binary_logloss: 0.149289
[241]	valid_0's binary_logloss: 0.148995
[242]	valid_0's binary_logloss: 0.148634
[243]	valid_0's binary_logloss: 0.148433
[244]	valid_0's binary_logloss: 0.148361
[245]	valid_0's binary_logloss: 0.14821
[246]	valid_0's binary_logloss: 0.148035
[247]	valid_0's binary_logloss: 0.147855
[248]	valid_0's binary_logloss: 0.147623
[249]	valid_0's binary_logloss: 0.147372
[250]	valid_0's binary_logloss: 0.14716
[251]	valid_0's binary_logloss: 0.146757
[252]	valid_0's binary_logloss: 0.146625
[253]	valid_0's binary_logloss: 0.146399
[254]	valid_0's bin

[443]	valid_0's binary_logloss: 0.107976
[444]	valid_0's binary_logloss: 0.107926
[445]	valid_0's binary_logloss: 0.107778
[446]	valid_0's binary_logloss: 0.107629
[447]	valid_0's binary_logloss: 0.107471
[448]	valid_0's binary_logloss: 0.107298
[449]	valid_0's binary_logloss: 0.107152
[450]	valid_0's binary_logloss: 0.106897
[451]	valid_0's binary_logloss: 0.106699
[452]	valid_0's binary_logloss: 0.106546
[453]	valid_0's binary_logloss: 0.106447
[454]	valid_0's binary_logloss: 0.106272
[455]	valid_0's binary_logloss: 0.106043
[456]	valid_0's binary_logloss: 0.105907
[457]	valid_0's binary_logloss: 0.105735
[458]	valid_0's binary_logloss: 0.105538
[459]	valid_0's binary_logloss: 0.105364
[460]	valid_0's binary_logloss: 0.105365
[461]	valid_0's binary_logloss: 0.105218
[462]	valid_0's binary_logloss: 0.104962
[463]	valid_0's binary_logloss: 0.104837
[464]	valid_0's binary_logloss: 0.104694
[465]	valid_0's binary_logloss: 0.104493
[466]	valid_0's binary_logloss: 0.104303
[467]	valid_0's 

[665]	valid_0's binary_logloss: 0.0884334
[666]	valid_0's binary_logloss: 0.0884241
[667]	valid_0's binary_logloss: 0.0882781
[668]	valid_0's binary_logloss: 0.0881999
[669]	valid_0's binary_logloss: 0.0881545
[670]	valid_0's binary_logloss: 0.0880742
[671]	valid_0's binary_logloss: 0.0879783
[672]	valid_0's binary_logloss: 0.0879598
[673]	valid_0's binary_logloss: 0.087909
[674]	valid_0's binary_logloss: 0.0878238
[675]	valid_0's binary_logloss: 0.0877454
[676]	valid_0's binary_logloss: 0.0877194
[677]	valid_0's binary_logloss: 0.0876925
[678]	valid_0's binary_logloss: 0.0876277
[679]	valid_0's binary_logloss: 0.0876345
[680]	valid_0's binary_logloss: 0.0876525
[681]	valid_0's binary_logloss: 0.0876074
[682]	valid_0's binary_logloss: 0.0875755
[683]	valid_0's binary_logloss: 0.0875612
[684]	valid_0's binary_logloss: 0.0874989
[685]	valid_0's binary_logloss: 0.08746
[686]	valid_0's binary_logloss: 0.0873972
[687]	valid_0's binary_logloss: 0.0873524
[688]	valid_0's binary_logloss: 0.087

[877]	valid_0's binary_logloss: 0.0818133
[878]	valid_0's binary_logloss: 0.081801
[879]	valid_0's binary_logloss: 0.0817617
[880]	valid_0's binary_logloss: 0.0817326
[881]	valid_0's binary_logloss: 0.0816779
[882]	valid_0's binary_logloss: 0.0816867
[883]	valid_0's binary_logloss: 0.081727
[884]	valid_0's binary_logloss: 0.0816903
[885]	valid_0's binary_logloss: 0.0816552
[886]	valid_0's binary_logloss: 0.0816677
[887]	valid_0's binary_logloss: 0.0816126
[888]	valid_0's binary_logloss: 0.0816297
[889]	valid_0's binary_logloss: 0.0816396
[890]	valid_0's binary_logloss: 0.0816053
[891]	valid_0's binary_logloss: 0.0816291
[892]	valid_0's binary_logloss: 0.0815799
[893]	valid_0's binary_logloss: 0.0815946
[894]	valid_0's binary_logloss: 0.0816013
[895]	valid_0's binary_logloss: 0.0815732
[896]	valid_0's binary_logloss: 0.0815895
[897]	valid_0's binary_logloss: 0.0815289
[898]	valid_0's binary_logloss: 0.0815409
[899]	valid_0's binary_logloss: 0.0815223
[900]	valid_0's binary_logloss: 0.08

[1089]	valid_0's binary_logloss: 0.0790914
[1090]	valid_0's binary_logloss: 0.0791072
[1091]	valid_0's binary_logloss: 0.0791056
[1092]	valid_0's binary_logloss: 0.079137
[1093]	valid_0's binary_logloss: 0.0791434
[1094]	valid_0's binary_logloss: 0.0791602
[1095]	valid_0's binary_logloss: 0.0791763
[1096]	valid_0's binary_logloss: 0.0792192
[1097]	valid_0's binary_logloss: 0.0792561
[1098]	valid_0's binary_logloss: 0.0792771
[1099]	valid_0's binary_logloss: 0.0793288
[1100]	valid_0's binary_logloss: 0.0793573
[1101]	valid_0's binary_logloss: 0.0792991
[1102]	valid_0's binary_logloss: 0.079278
[1103]	valid_0's binary_logloss: 0.079238
[1104]	valid_0's binary_logloss: 0.0792045
[1105]	valid_0's binary_logloss: 0.0791532
[1106]	valid_0's binary_logloss: 0.0791593
[1107]	valid_0's binary_logloss: 0.0791578
[1108]	valid_0's binary_logloss: 0.0791206
[1109]	valid_0's binary_logloss: 0.0791371
[1110]	valid_0's binary_logloss: 0.0791288
[1111]	valid_0's binary_logloss: 0.0791484
[1112]	valid_0

[1323]	valid_0's binary_logloss: 0.0806039
[1324]	valid_0's binary_logloss: 0.0806253
[1325]	valid_0's binary_logloss: 0.0806892
[1326]	valid_0's binary_logloss: 0.0806825
[1327]	valid_0's binary_logloss: 0.0806572
[1328]	valid_0's binary_logloss: 0.0807073
[1329]	valid_0's binary_logloss: 0.0807009
[1330]	valid_0's binary_logloss: 0.0807396
[1331]	valid_0's binary_logloss: 0.0807411
[1332]	valid_0's binary_logloss: 0.0807415
[1333]	valid_0's binary_logloss: 0.0807503
[1334]	valid_0's binary_logloss: 0.0807913
[1335]	valid_0's binary_logloss: 0.0807905
[1336]	valid_0's binary_logloss: 0.0807808
[1337]	valid_0's binary_logloss: 0.08082
[1338]	valid_0's binary_logloss: 0.0807984
[1339]	valid_0's binary_logloss: 0.0808071
[1340]	valid_0's binary_logloss: 0.0808144
[1341]	valid_0's binary_logloss: 0.0808076
[1342]	valid_0's binary_logloss: 0.0808087
[1343]	valid_0's binary_logloss: 0.0808066
[1344]	valid_0's binary_logloss: 0.0808566
[1345]	valid_0's binary_logloss: 0.0808908
[1346]	valid_

[1549]	valid_0's binary_logloss: 0.0831421
[1550]	valid_0's binary_logloss: 0.0831735
[1551]	valid_0's binary_logloss: 0.083194
[1552]	valid_0's binary_logloss: 0.0831816
[1553]	valid_0's binary_logloss: 0.0831664
[1554]	valid_0's binary_logloss: 0.0831756
[1555]	valid_0's binary_logloss: 0.08314
[1556]	valid_0's binary_logloss: 0.0831413
[1557]	valid_0's binary_logloss: 0.0831173
[1558]	valid_0's binary_logloss: 0.0831143
[1559]	valid_0's binary_logloss: 0.0831225
[1560]	valid_0's binary_logloss: 0.0831198
[1561]	valid_0's binary_logloss: 0.0831804
[1562]	valid_0's binary_logloss: 0.0831836
[1563]	valid_0's binary_logloss: 0.0832342
[1564]	valid_0's binary_logloss: 0.0832549
[1565]	valid_0's binary_logloss: 0.0832147
[1566]	valid_0's binary_logloss: 0.0832268
[1567]	valid_0's binary_logloss: 0.08325
[1568]	valid_0's binary_logloss: 0.0832649
[1569]	valid_0's binary_logloss: 0.0832526
[1570]	valid_0's binary_logloss: 0.08328
[1571]	valid_0's binary_logloss: 0.0833051
[1572]	valid_0's b

In [39]:
# Predicted win probability for every held-out contract row.
y_pred = model_contract.predict(x_test.drop(columns = target))

In [40]:
# Row-level log-loss of the contract model on its hold-out.
log_loss(y_true = y_test, y_pred = y_pred)

0.07893440480359541

In [41]:
# Attach the predictions to the hold-out rows and list the lost opportunities,
# to see which of them the model scored high.
pred = pd.DataFrame({"Prediction": y_pred}, index = x_test.index)
x_test_pred = x_test.join(pred)
is_lost = x_test_pred["Stage"] == 0
x_test_pred.loc[is_lost, ["Opportunity_ID", "Stage", "Prediction"]]

Unnamed: 0,Opportunity_ID,Stage,Prediction
10156,6382,0,0.000386
15117,9516,0,0.980103
7952,4943,0,0.988310
13580,8270,0,0.990285
14410,8881,0,0.014032
15161,9565,0,0.260712
12182,7415,0,0.001734
9431,5965,0,0.949699
8090,5063,0,0.832246
15574,10019,0,0.871971


In [42]:
# Hard labels at the 0.5 probability threshold (strictly greater -> 1).
y_class = (y_pred > 0.5).astype(int)

In [44]:
# Confusion matrix of the contract model on its hold-out (rows: true class, columns: predicted).
# `confusion_matrix` is imported with the other dependencies at the top of the notebook.
confusion_matrix(y_test, y_class)

array([[ 116,   42],
       [  18, 2817]])

In [45]:
# Row-wise 70/30 hold-out of the no-contract subset. The hold-out is used both for
# early stopping and for the evaluation in the following cells.
# NOTE(review): rows that share an Opportunity_ID can land on both sides of this
# split, which may make the hold-out loss optimistic -- consider a grouped split.
x_trainnc, x_testnc, y_trainnc, y_testnc = train_test_split(
    df_nocontract, df_nocontract.Stage, test_size = 0.3, random_state = 0
)

# Categorical features: every pandas-categorical feature column plus the TRF bucket.
categorical = list(df.drop(target, axis = 1).select_dtypes('category').columns) + ["TRF_Cat"]

train_data = lgb.Dataset(data = x_trainnc.drop(target, axis = 1), label = y_trainnc, categorical_feature = categorical)
# Tie the validation set to the training set explicitly via `reference`.
test_data = lgb.Dataset(data = x_testnc.drop(target, axis = 1), label = y_testnc, reference = train_data)

parameters = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'is_unbalance': True,
    'boosting': 'gbdt',
    'num_leaves': 30,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.6,
    'bagging_freq': 20,
    'learning_rate': 0.015,
    'max_bin': 300,
    'verbose': 0
}

# Early stopping is requested through the callback API: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4, while the callback is accepted by
# both older and newer releases.
model_nocontract = lgb.train(
    parameters,
    train_data,
    valid_sets = [test_data],
    num_boost_round = 5000,
    callbacks = [lgb.early_stopping(stopping_rounds = 500)],
)

[1]	valid_0's binary_logloss: 0.0887822
Training until validation scores don't improve for 500 rounds
[2]	valid_0's binary_logloss: 0.0852904
[3]	valid_0's binary_logloss: 0.0825863
[4]	valid_0's binary_logloss: 0.08091
[5]	valid_0's binary_logloss: 0.0790948
[6]	valid_0's binary_logloss: 0.0775597
[7]	valid_0's binary_logloss: 0.0762747
[8]	valid_0's binary_logloss: 0.0752846
[9]	valid_0's binary_logloss: 0.0743158
[10]	valid_0's binary_logloss: 0.0733107
[11]	valid_0's binary_logloss: 0.0725967
[12]	valid_0's binary_logloss: 0.0718512
[13]	valid_0's binary_logloss: 0.0712622
[14]	valid_0's binary_logloss: 0.0705728
[15]	valid_0's binary_logloss: 0.0700595
[16]	valid_0's binary_logloss: 0.0695992
[17]	valid_0's binary_logloss: 0.0690686
[18]	valid_0's binary_logloss: 0.0684786
[19]	valid_0's binary_logloss: 0.06784
[20]	valid_0's binary_logloss: 0.06747
[21]	valid_0's binary_logloss: 0.0664946
[22]	valid_0's binary_logloss: 0.0661046
[23]	valid_0's binary_logloss: 0.0653751
[24]	valid

[225]	valid_0's binary_logloss: 0.0339966
[226]	valid_0's binary_logloss: 0.0338904
[227]	valid_0's binary_logloss: 0.0338416
[228]	valid_0's binary_logloss: 0.0338405
[229]	valid_0's binary_logloss: 0.0337892
[230]	valid_0's binary_logloss: 0.0337312
[231]	valid_0's binary_logloss: 0.0336717
[232]	valid_0's binary_logloss: 0.0335932
[233]	valid_0's binary_logloss: 0.0334991
[234]	valid_0's binary_logloss: 0.0334156
[235]	valid_0's binary_logloss: 0.0333888
[236]	valid_0's binary_logloss: 0.0333149
[237]	valid_0's binary_logloss: 0.033237
[238]	valid_0's binary_logloss: 0.0332399
[239]	valid_0's binary_logloss: 0.0331882
[240]	valid_0's binary_logloss: 0.0331271
[241]	valid_0's binary_logloss: 0.0331321
[242]	valid_0's binary_logloss: 0.0330667
[243]	valid_0's binary_logloss: 0.0330489
[244]	valid_0's binary_logloss: 0.0330081
[245]	valid_0's binary_logloss: 0.0330444
[246]	valid_0's binary_logloss: 0.0330944
[247]	valid_0's binary_logloss: 0.0330818
[248]	valid_0's binary_logloss: 0.0

[430]	valid_0's binary_logloss: 0.0314196
[431]	valid_0's binary_logloss: 0.0313599
[432]	valid_0's binary_logloss: 0.0313494
[433]	valid_0's binary_logloss: 0.0313669
[434]	valid_0's binary_logloss: 0.0313901
[435]	valid_0's binary_logloss: 0.031422
[436]	valid_0's binary_logloss: 0.0314224
[437]	valid_0's binary_logloss: 0.0314493
[438]	valid_0's binary_logloss: 0.0314106
[439]	valid_0's binary_logloss: 0.0314438
[440]	valid_0's binary_logloss: 0.0314493
[441]	valid_0's binary_logloss: 0.0315266
[442]	valid_0's binary_logloss: 0.0315476
[443]	valid_0's binary_logloss: 0.0315516
[444]	valid_0's binary_logloss: 0.0315343
[445]	valid_0's binary_logloss: 0.0315922
[446]	valid_0's binary_logloss: 0.0316573
[447]	valid_0's binary_logloss: 0.0316823
[448]	valid_0's binary_logloss: 0.0317021
[449]	valid_0's binary_logloss: 0.031716
[450]	valid_0's binary_logloss: 0.031767
[451]	valid_0's binary_logloss: 0.0318009
[452]	valid_0's binary_logloss: 0.0318523
[453]	valid_0's binary_logloss: 0.031

[655]	valid_0's binary_logloss: 0.0365605
[656]	valid_0's binary_logloss: 0.0365217
[657]	valid_0's binary_logloss: 0.036536
[658]	valid_0's binary_logloss: 0.0365306
[659]	valid_0's binary_logloss: 0.03655
[660]	valid_0's binary_logloss: 0.0366078
[661]	valid_0's binary_logloss: 0.03664
[662]	valid_0's binary_logloss: 0.0367304
[663]	valid_0's binary_logloss: 0.0367395
[664]	valid_0's binary_logloss: 0.0367355
[665]	valid_0's binary_logloss: 0.0367224
[666]	valid_0's binary_logloss: 0.036761
[667]	valid_0's binary_logloss: 0.0367842
[668]	valid_0's binary_logloss: 0.0368378
[669]	valid_0's binary_logloss: 0.0368652
[670]	valid_0's binary_logloss: 0.0369349
[671]	valid_0's binary_logloss: 0.0369943
[672]	valid_0's binary_logloss: 0.0370174
[673]	valid_0's binary_logloss: 0.0369869
[674]	valid_0's binary_logloss: 0.0369817
[675]	valid_0's binary_logloss: 0.0370157
[676]	valid_0's binary_logloss: 0.0370571
[677]	valid_0's binary_logloss: 0.0370672
[678]	valid_0's binary_logloss: 0.037130

In [46]:
# Predicted win probability for every held-out no-contract row.
y_prednc = model_nocontract.predict(x_testnc.drop(columns = target))

In [47]:
# Row-level log-loss of the no-contract model on its hold-out.
log_loss(y_true = y_testnc, y_pred = y_prednc)

0.03085227410501338

In [48]:
# Attach the predictions to the hold-out rows and list the won opportunities,
# to see which of them the model scored low.
pred = pd.DataFrame({"Prediction": y_prednc}, index = x_testnc.index)
x_test_prednc = x_testnc.join(pred)
is_won = x_test_prednc["Stage"] == 1
x_test_prednc.loc[is_won, ["Opportunity_ID", "Stage", "Prediction"]]

Unnamed: 0,Opportunity_ID,Stage,Prediction
16112,12374,1,0.725612
9546,6045,1,0.571023
13629,8295,1,0.022065
16111,12374,1,0.7358
16136,12390,1,0.974714
16862,12770,1,0.97334
75,28,1,0.988355
12146,7394,1,0.032234
16624,12643,1,0.207132
16809,12732,1,0.990517


In [51]:
# Hard labels at the 0.5 probability threshold (strictly greater -> 1).
y_classnc = (y_prednc > 0.5).astype(int)

In [52]:
# Confusion matrix of the no-contract model on its hold-out (rows: true class, columns: predicted).
confusion_matrix(y_true = y_testnc, y_pred = y_classnc)

array([[2032,    1],
       [  15,   25]])

In [54]:
# Stack the scored hold-out rows of both models.
# The contract part is rebuilt from x_test / y_pred instead of reusing x_test_pred,
# so re-running this cell does not append the no-contract rows a second time.
x_test_pred = pd.concat([x_test.assign(Prediction = y_pred), x_test_prednc])

In [55]:
# Show a bounded preview rather than the whole combined frame.
x_test_pred.head(10)

Unnamed: 0,Region,Territory,PricingDelivery_Terms_Quote_Appr,PricingDelivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Bureaucratic_Code,Source,Billing_Country,Opportunity_ID,Sales_Contract_No,Account_Owner,Opportunity_Owner,Currency,Last_Modified_By,ASP,ASP_converted,Delivery_Quarter,Total_Amount,Total_Taxable_Amount,Stage,Contacts,Delivery_Difference,Same_Owner,Has_Brand,Has_Contract,Different_Country,TRF_Cat,Sales,Concrete_Offer,Offer_Duration,Territory_Defined,Past_Quote,Prediction
12573,APAC,Australia,0,0,0,0,Bureaucratic_Code_4,,Australia,7605,3900,Person_Name_43,Person_Name_43,,Person_Name_43,0.4000,0.40000,Q1,82600.00,1008000.00,1,12,29.0,True,False,True,False,0.0,231,57.0,,True,,0.990473
14758,Americas,NW America,0,0,0,0,Bureaucratic_Code_4,,United States,9174,4791,Person_Name_64,Person_Name_64,,Person_Name_33,0.4550,0.45500,Q3,23887.50,24887.50,1,2,11.0,False,False,True,True,0.0,8,32.0,18.0,True,19.0,0.967516
14814,Americas,NW America,0,0,0,0,Bureaucratic_Code_4,,United States,9223,4825,Person_Name_64,Person_Name_64,,Person_Name_64,0.5200,0.52000,Q3,21840.00,21840.01,1,2,0.0,True,False,True,True,0.0,18,31.0,10.0,True,-10.0,0.993084
2344,Japan,,1,1,1,1,Bureaucratic_Code_4,Source_3,Japan,1323,153,Person_Name_61,Person_Name_66,,Person_Name_47,66.0000,0.59337,Q1,12012000.00,12012000.00,1,1,3.0,False,False,True,False,0.0,10,25.0,,False,,0.999894
7510,APAC,Vietnam,0,0,0,0,Bureaucratic_Code_4,Source_9,Viet Nam,4628,3076,Person_Name_65,Person_Name_44,,Person_Name_47,0.4000,0.40000,Q3,5400.00,5400.00,1,1,7.0,False,False,True,True,0.0,15,25.0,18.0,True,131.0,0.868348
4749,APAC,China (PRC),1,1,1,1,Bureaucratic_Code_4,,China,2803,851,Person_Name_65,Person_Name_19,,Person_Name_47,0.1700,0.17000,Q4,2996.25,17523.60,1,5,74.0,False,False,True,True,0.0,215,74.0,74.0,True,-27.0,0.999377
4359,Japan,,1,1,0,0,Bureaucratic_Code_4,Source_13,Japan,2544,724,Person_Name_32,Person_Name_50,,Person_Name_47,0.0000,0.00000,Q3,1400.00,69700.00,1,5,0.0,False,False,True,False,0.0,586,5.0,2.0,False,72.0,0.999889
725,Americas,NE America,0,0,0,0,Bureaucratic_Code_4,Source_11,United States,342,1259,Person_Name_8,Person_Name_8,,Person_Name_47,0.4450,0.44500,Q1,11961.60,0.00,1,1,0.0,False,False,True,True,0.0,752,3.0,30.0,True,368.0,0.999878
10156,APAC,Australia,1,1,1,1,Bureaucratic_Code_4,Source_9,Australia,6382,3480,Person_Name_43,Person_Name_43,,Person_Name_43,0.4200,0.42000,Q1,85260.00,413700.00,0,5,4.0,True,False,True,False,0.0,397,108.0,,True,,0.000386
9261,EMEA,Spain,1,1,1,1,Bureaucratic_Code_4,Source_9,Spain,5815,2937,Person_Name_42,Person_Name_23,,Person_Name_47,0.3500,0.39588,Q3,16231.25,131808.70,1,6,4.0,False,False,True,False,0.0,71,36.0,56.0,True,133.0,0.996178


In [56]:
# Aggregate to one row per opportunity (mean label, mean predicted probability)
# and score at that level.
# Several columns must be selected with a list: the tuple form
# groupby(...)["Stage", "Prediction"] is deprecated and raises on pandas >= 2.
answer = x_test_pred.groupby("Opportunity_ID")[["Stage", "Prediction"]].mean()
log_loss(answer["Stage"], answer["Prediction"])

0.06989312231509576

In [57]:
# Columns to drop from the validation features: the `target` columns except "Stage"
# (the validation frame is indexed without a Stage column below).
leak = [column for column in target if column != "Stage"]

# Load the validation file and run it through the same preprocessing as the training data.
validation_file = "Validacion_ECI_2020.csv"
vali = pd.read_csv(filepath_or_buffer = validation_file)
validation = preprocess(vali, ada = False)

In [58]:
# Route validation rows to the matching model using the same Has_Contract flag as in training.
validation_has_contract = validation["Has_Contract"]
validation_contract = validation.loc[validation_has_contract]
validation_nocontract = validation.loc[~validation_has_contract]

In [61]:
# Score the validation rows that have a contract number.
pred_con = model_contract.predict(validation_contract.drop(columns = leak))

In [62]:
# Score the validation rows without a contract number.
pred_nc = model_nocontract.predict(validation_nocontract.drop(columns = leak))

In [66]:
# Wrap each prediction vector in a one-column frame indexed like its source rows ...
pred_con = pd.DataFrame(data = pred_con, index = validation_contract.index, columns = ["Prediction"])
pred_nc = pd.DataFrame(data = pred_nc, index = validation_nocontract.index, columns = ["Prediction"])

# ... and attach it to those rows (index join).
answer_con = validation_contract.join(pred_con)
answer_nc = validation_nocontract.join(pred_nc)

In [67]:
# Stack the scored rows of both models back into a single frame.
answer = pd.concat(objs = [answer_con, answer_nc], axis = 0)

In [73]:
# One prediction per opportunity: the mean over all of its rows.
res = (
    answer[["Opportunity_ID", "Prediction"]]
    .groupby("Opportunity_ID", as_index = False)
    .mean()
)

In [75]:
# Sanity check against an earlier submission: correlation between its predictions and the new ones.
prev = pd.read_csv("acceptable/submission_23.csv", names = ["Opportunity_ID", "Prediction"])
# Pair the rows on Opportunity_ID; Series.corr on the raw columns would pair them
# by row position, which is only right if both files list the same ids in the same order.
paired = prev.merge(res, on = "Opportunity_ID", suffixes = ("_prev", "_new"))
paired["Prediction_prev"].corr(paired["Prediction_new"])

0.9766286071925385

In [77]:
# Sanity check against an earlier submission: correlation between its predictions and the new ones.
prev = pd.read_csv("acceptable/submission_11.csv", names = ["Opportunity_ID", "Prediction"])
# Pair the rows on Opportunity_ID; Series.corr on the raw columns would pair them
# by row position, which is only right if both files list the same ids in the same order.
paired = prev.merge(res, on = "Opportunity_ID", suffixes = ("_prev", "_new"))
paired["Prediction_prev"].corr(paired["Prediction_new"])

0.9753483165067466

In [78]:
# Sanity check against an earlier ensemble submission: correlation between its predictions and the new ones.
prev = pd.read_csv("submisson_ensamble_6.csv", names = ["Opportunity_ID", "Prediction"])
# Pair the rows on Opportunity_ID; Series.corr on the raw columns would pair them
# by row position, which is only right if both files list the same ids in the same order.
paired = prev.merge(res, on = "Opportunity_ID", suffixes = ("_prev", "_new"))
paired["Prediction_prev"].corr(paired["Prediction_new"])

0.9782733957210261

In [79]:
# Sanity check against an earlier ensemble submission: correlation between its predictions and the new ones.
prev = pd.read_csv("submission_ensambler_1.csv", names = ["Opportunity_ID", "Prediction"])
# Pair the rows on Opportunity_ID; Series.corr on the raw columns would pair them
# by row position, which is only right if both files list the same ids in the same order.
paired = prev.merge(res, on = "Opportunity_ID", suffixes = ("_prev", "_new"))
paired["Prediction_prev"].corr(paired["Prediction_new"])

0.9797257027572639

In [80]:
# Write the submission: two columns (Opportunity_ID, Prediction), no header row and no index column.
res.to_csv("splitlgbm.csv", header = False, index = False)