In [87]:
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
import random
import time
import numpy as np


In [88]:
tweets = pd.read_csv("data/train_pre_processing_true_false.csv")
x_train, target = tweets.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],tweets.iloc[:,-1]
x_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_train.columns]


In [89]:
def get_dic_acc():
    results = {}
    results['accuracy'] = []
    results['std'] = []
    results['time'] = []
    results['features'] = []
    results['model'] = []
    
    return results

In [90]:
principal_features_list = range(0,42)
secondary_features_list = range(42,len(x_train.columns))

def get_feature_sample():
    principal_features_sample_list = random.sample(principal_features_list, 35)
    secondary_features_sample_list = random.sample(secondary_features_list, 65)
    features_sample_list = []
    features_sample_list.extend(principal_features_sample_list)
    features_sample_list.extend(secondary_features_sample_list)
    features_sample_list.sort()
    
    return features_sample_list

In [91]:
def fit_model_cv(model,train_set,target):
    start = time.time()
    cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=5,random_state=1)
    
    n_scores = cross_val_score(model, train_set, target, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    return np.mean(n_scores),np.std(n_scores), (end - start)


In [92]:
def get_models_for_fit():
    xgb_model = xgb.XGBClassifier(random_state=1)
    light_model = LGBMClassifier(random_state = 1)
    rf_model = RandomForestClassifier(random_state=1)

    return [xgb_model,light_model,rf_model]

In [93]:
sample_list = []

results_dict = get_dic_acc()

for x in range(0,500):
    features_sample_list = []
    
    while not features_sample_list in sample_list:
        features_sample_list = get_feature_sample()
        
        if not features_sample_list in sample_list:
            sample_list.append(features_sample_list)

    models = get_models_for_fit()
    train_set = x_train.iloc[:,features_sample_list]
    
    for model in models:
        mean, std, timed = fit_model_cv(model,train_set,target)
        
        results_dict['accuracy'].append(mean)
        results_dict['std'].append(std)
        results_dict['time'].append(timed)
        results_dict['features'].append(features_sample_list)
        results_dict['model'].append(str(model))
    
results_df = pd.DataFrame(results_dict) 

    

In [132]:
results_df

Unnamed: 0,accuracy,std,time,features,model
0,0.790699,0.013486,92.759706,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1,0.788571,0.015089,6.990095,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","LGBMClassifier(boosting_type='gbdt', class_wei..."
2,0.789938,0.012442,25.098975,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","RandomForestClassifier(bootstrap=True, ccp_alp..."
3,0.792696,0.012842,69.955751,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","XGBClassifier(base_score=0.5, booster='gbtree'..."
4,0.791172,0.013941,6.225833,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
...,...,...,...,...,...
1495,0.791224,0.011108,5.547646,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
1496,0.791041,0.011719,21.192583,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","RandomForestClassifier(bootstrap=True, ccp_alp..."
1497,0.786390,0.013611,63.288443,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1498,0.784839,0.012647,6.059586,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","LGBMClassifier(boosting_type='gbdt', class_wei..."


In [147]:
#results_df.to_csv('data/tree_emsamble_default.csv', index=False)

best_features_default = results_df.groupby('model').apply(lambda x: x.nlargest(10,'accuracy')).features.to_frame().reset_index()

best_features_lgbm = []
best_features_xgb = []
best_features_rf = []

for model in best_features_default.iterrows():
    if 'LGBM' in model[1]['model']:
        best_features_lgbm.append(model[1]['features'])
        
    if 'Random' in model[1]['model']:
        best_features_rf.append(model[1]['features']) 
        
    if 'XGB' in model[1]['model']:
        best_features_xgb.append(model[1]['features']) 

# Pruebas buscando mejores hiperparámetros con LigthGBM

In [148]:
def get_grid_lgbm():
    n_estimators = [100,200,300,400,500] 
    learning_rate = [0.05,0.08,0.1]
    subsample = [0.1,0.5,1]
    max_depth = [-1,30,20,10]
    min_split_gain_leaf = [0.1,0.3,0.5]
    
    return {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain_leaf': min_split_gain_leaf,
               'num_leaves': num_leaves,
               'subsample': subsample}

In [144]:
grid = get_grid_lgbm()
results_lgbm_df = pd.DataFrame({})

for features in best_features_lgbm:
    
    light_model = LGBMClassifier(random_state = 1)
    cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=5,random_state=1)
    grid_serch_CV = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
    grid_serch_CV.fit(x_train.iloc[:,features], target)
    results_lgbm_partial_df = pd.DataFrame(grid_serch_CV.cv_results_)
    results_lgbm_partial_df['index_features'] = best_features_lgbm.index(features)
    results_lgbm_df = results_lgbm_df.append(results_lgbm_partial_df,ignore_index=True)
    

In [145]:
results_lgbm_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split44_test_score,split45_test_score,split46_test_score,split47_test_score,split48_test_score,split49_test_score,mean_test_score,std_test_score,rank_test_score,index_features
0,0.084775,0.021956,0.006517,0.000998,100,{'n_estimators': 100},0.604987,0.62336,0.597113,0.60184,...,0.622865,0.622865,0.624179,0.605782,0.632063,0.638633,0.61382,0.015741,1,0
1,0.081791,0.01742,0.006407,0.002968,100,{'n_estimators': 100},0.603675,0.641732,0.610236,0.626807,...,0.633377,0.636005,0.626807,0.617608,0.622865,0.600526,0.622224,0.014018,1,1
