In [87]:
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
import random
import time
import numpy as np


In [88]:
tweets = pd.read_csv("data/train_pre_processing_true_false.csv")
x_train, target = tweets.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],tweets.iloc[:,-1]
#elimina caracteres del nombre del feature que los modelos no pueden manejar
x_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_train.columns]


In [89]:
def get_dic_acc():
    results = {}
    results['accuracy'] = []
    results['std'] = []
    results['time'] = []
    results['features'] = []
    results['model'] = []
    
    return results

Se crea una función que elija al azar 100 features del total conseguido (tomando en cuenta que la mayoría son generados por un one hot encoding)

In [90]:
principal_features_list = range(0,42)
secondary_features_list = range(42,len(x_train.columns))

def get_feature_sample():
    principal_features_sample_list = random.sample(principal_features_list, 35)
    secondary_features_sample_list = random.sample(secondary_features_list, 65)
    features_sample_list = []
    features_sample_list.extend(principal_features_sample_list)
    features_sample_list.extend(secondary_features_sample_list)
    features_sample_list.sort()
    
    return features_sample_list

Para entrenar cualquier modelo con CV 

In [91]:
def fit_model_cv(model,train_set,target):
    start = time.time()
    cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=5,random_state=1)
    
    n_scores = cross_val_score(model, train_set, target, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    return np.mean(n_scores),np.std(n_scores), (end - start)


Modelos a entrenar

In [92]:
def get_models_for_fit():
    xgb_model = xgb.XGBClassifier(random_state=1)
    light_model = LGBMClassifier(random_state = 1)
    rf_model = RandomForestClassifier(random_state=1)

    return [xgb_model,light_model,rf_model]

In [93]:
sample_list = []

results_dict = get_dic_acc()

for x in range(0,500):
    features_sample_list = []
    
    while not features_sample_list in sample_list:
        features_sample_list = get_feature_sample()
        
        if not features_sample_list in sample_list:
            sample_list.append(features_sample_list)

    models = get_models_for_fit()
    train_set = x_train.iloc[:,features_sample_list]
    
    for model in models:
        mean, std, timed = fit_model_cv(model,train_set,target)
        
        results_dict['accuracy'].append(mean)
        results_dict['std'].append(std)
        results_dict['time'].append(timed)
        results_dict['features'].append(features_sample_list)
        results_dict['model'].append(str(model))
    
results_df = pd.DataFrame(results_dict) 

    

In [132]:
results_df

Unnamed: 0,accuracy,std,time,features,model
0,0.790699,0.013486,92.759706,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1,0.788571,0.015089,6.990095,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","LGBMClassifier(boosting_type='gbdt', class_wei..."
2,0.789938,0.012442,25.098975,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","RandomForestClassifier(bootstrap=True, ccp_alp..."
3,0.792696,0.012842,69.955751,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","XGBClassifier(base_score=0.5, booster='gbtree'..."
4,0.791172,0.013941,6.225833,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
...,...,...,...,...,...
1495,0.791224,0.011108,5.547646,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
1496,0.791041,0.011719,21.192583,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","RandomForestClassifier(bootstrap=True, ccp_alp..."
1497,0.786390,0.013611,63.288443,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1498,0.784839,0.012647,6.059586,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","LGBMClassifier(boosting_type='gbdt', class_wei..."


In [147]:
#results_df.to_csv('data/tree_emsamble_default.csv', index=False)

best_features_default = results_df.groupby('model').apply(lambda x: x.nlargest(10,'accuracy')).features.to_frame().reset_index()

best_features_lgbm = []
best_features_xgb = []
best_features_rf = []

for model in best_features_default.iterrows():
    if 'LGBM' in model[1]['model']:
        best_features_lgbm.append(model[1]['features'])
        
    if 'Random' in model[1]['model']:
        best_features_rf.append(model[1]['features']) 
        
    if 'XGB' in model[1]['model']:
        best_features_xgb.append(model[1]['features']) 

# Pruebas buscando mejores hiperparámetros con LigthGBM

In [150]:
def get_grid_lgbm():
    n_estimators = [100,200,300,400,500] 
    learning_rate = [0.05,0.08,0.1]
    subsample = [0.1,0.5,1.0]
    max_depth = [-1,30,20,10]
    min_split_gain = [0.1,0.0,0.5]
    
    return {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain': min_split_gain,
               'subsample': subsample}

In [151]:
grid = get_grid_lgbm()
results_lgbm_df = pd.DataFrame({})

for features in best_features_lgbm:
    
    light_model = LGBMClassifier(random_state = 1)
    cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=5,random_state=1)
    grid_serch_CV = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
    grid_serch_CV.fit(x_train.iloc[:,features], target)
    results_lgbm_partial_df = pd.DataFrame(grid_serch_CV.cv_results_)
    results_lgbm_partial_df['index_features'] = best_features_lgbm.index(features)
    results_lgbm_df = results_lgbm_df.append(results_lgbm_partial_df,ignore_index=True)
    

In [158]:
#results_lgbm_df.to_csv('data/tree_emsamble_lgbm.csv', index=False)
results_lgbm_df.nlargest(10,'mean_test_score')


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_split_gain,param_n_estimators,param_subsample,params,...,split44_test_score,split45_test_score,split46_test_score,split47_test_score,split48_test_score,split49_test_score,mean_test_score,std_test_score,rank_test_score,index_features
5010,0.271299,0.03443,0.011451,0.001265,0.05,10,0.0,100,0.1,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.806833,0.801577,0.781866,0.822602,0.78318,0.791064,0.798002,0.012358,1,9
5011,0.27494,0.060042,0.011338,0.001421,0.05,10,0.0,100,0.5,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.806833,0.801577,0.781866,0.822602,0.78318,0.791064,0.798002,0.012358,1,9
5012,0.271106,0.033904,0.011475,0.001801,0.05,10,0.0,100,1.0,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.806833,0.801577,0.781866,0.822602,0.78318,0.791064,0.798002,0.012358,1,9
5025,0.285479,0.056002,0.0117,0.001088,0.05,10,0.5,100,0.1,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.810775,0.804205,0.788436,0.814717,0.781866,0.788436,0.797898,0.011715,4,9
5026,0.340606,0.165106,0.012522,0.001923,0.05,10,0.5,100,0.5,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.810775,0.804205,0.788436,0.814717,0.781866,0.788436,0.797898,0.011715,4,9
5027,0.266436,0.025918,0.011375,0.000958,0.05,10,0.5,100,1.0,"{'learning_rate': 0.05, 'max_depth': 10, 'min_...",...,0.810775,0.804205,0.788436,0.814717,0.781866,0.788436,0.797898,0.011715,4,9
4860,0.267616,0.031216,0.011078,0.001588,0.05,-1,0.1,100,0.1,"{'learning_rate': 0.05, 'max_depth': -1, 'min_...",...,0.805519,0.805519,0.78318,0.816032,0.781866,0.793693,0.797372,0.010918,7,9
4861,0.269755,0.0266,0.01097,0.000885,0.05,-1,0.1,100,0.5,"{'learning_rate': 0.05, 'max_depth': -1, 'min_...",...,0.805519,0.805519,0.78318,0.816032,0.781866,0.793693,0.797372,0.010918,7,9
4862,0.322414,0.090547,0.014055,0.012059,0.05,-1,0.1,100,1.0,"{'learning_rate': 0.05, 'max_depth': -1, 'min_...",...,0.805519,0.805519,0.78318,0.816032,0.781866,0.793693,0.797372,0.010918,7,9
4905,0.267597,0.027037,0.011381,0.002096,0.05,30,0.1,100,0.1,"{'learning_rate': 0.05, 'max_depth': 30, 'min_...",...,0.805519,0.805519,0.78318,0.816032,0.781866,0.793693,0.797372,0.010918,7,9


# Pruebas buscando mejores hiperparámetros con RF


In [153]:
def get_grid_rf():
    n_estimators = [100,500] 
    max_samples = [0.3,None,0.5]
    max_depth = [None,30,20,10]
    max_features = [5,10,25,'auto']
    
    return {
               'n_estimators': n_estimators,
               'max_samples': max_samples,
               'max_depth': max_depth,
               'max_features': max_features}

In [154]:
grid = get_grid_rf()
results_rf_df = pd.DataFrame({})

for features in best_features_rf:
    
    rf_model = RandomForestClassifier(random_state = 1)
    cv = RepeatedStratifiedKFold(n_splits=10,n_repeats=5,random_state=1)
    grid_serch_CV = GridSearchCV(estimator = rf_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
    grid_serch_CV.fit(x_train.iloc[:,features], target)
    results_rf_partial_df = pd.DataFrame(grid_serch_CV.cv_results_)
    results_rf_partial_df['index_features'] = best_features_rf.index(features)
    results_rf_df = results_rf_df.append(results_rf_partial_df,ignore_index=True)

KeyboardInterrupt: 

In [None]:
results_rf_df

In [149]:
results_df.groupby('model').count()

Unnamed: 0_level_0,accuracy,std,time,features
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,\n importance_type='split', learning_rate=0.1, max_depth=-1,\n min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,\n n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,\n random_state=1, reg_alpha=0.0, reg_lambda=0.0, silent=True,\n subsample=1.0, subsample_for_bin=200000, subsample_freq=0)",500,500,500,500
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n criterion='gini', max_depth=None, max_features='auto',\n max_leaf_nodes=None, max_samples=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators=100,\n n_jobs=None, oob_score=False, random_state=1, verbose=0,\n warm_start=False)",500,500,500,500
"XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0,\n learning_rate=0.1, max_delta_step=0, max_depth=3,\n min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,\n nthread=None, objective='binary:logistic', random_state=1,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n silent=None, subsample=1, verbosity=1)",500,500,500,500
