In [87]:
from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
import random
import time
import numpy as np


In [88]:
# Load the pre-processed tweets. The last column of the CSV is used as the label.
tweets = pd.read_csv("data/train_pre_processing_true_false.csv")
target = tweets.iloc[:, -1]
# Split the label off by position *before* filtering on dtype. Filtering first
# and then dropping the last surviving column only removes the label when the
# label itself is float64/int64/bool; otherwise a real feature would be lost.
x_train = tweets.iloc[:, :-1].select_dtypes(include=['float64', 'int64', 'bool'])
# Replace every non-alphanumeric character in a column name with "_"
# (presumably because some of the models reject special characters in
# feature names -- TODO confirm).
x_train.columns = ["".join(ch if ch.isalnum() else "_" for ch in str(col)) for col in x_train.columns]


In [89]:
def get_dic_acc():
    """Build an empty results accumulator.

    Returns:
        dict: maps each of 'accuracy', 'std', 'time', 'features' and 'model'
        (in that order) to its own new empty list.
    """
    column_names = ('accuracy', 'std', 'time', 'features', 'model')
    return {name: [] for name in column_names}

In [90]:
# Column positions of x_train are split into two pools: positions 0..41 form
# the "principal" pool and every remaining position forms the "secondary" pool.
principal_features_list = range(42)
secondary_features_list = range(42, x_train.shape[1])

def get_feature_sample(n_principal=35, n_secondary=65):
    """Draw a random subset of feature column positions.

    Positions are sampled without replacement from the module-level pools
    ``principal_features_list`` and ``secondary_features_list``.

    Args:
        n_principal: how many positions to draw from the principal pool.
        n_secondary: how many positions to draw from the secondary pool.

    Returns:
        list[int]: the ``n_principal + n_secondary`` drawn positions, sorted
        in ascending order.

    Raises:
        ValueError: raised by ``random.sample`` when a pool holds fewer
        positions than requested.
    """
    # Principal pool is sampled first, then the secondary pool.
    chosen = random.sample(principal_features_list, n_principal)
    chosen += random.sample(secondary_features_list, n_secondary)
    chosen.sort()
    return chosen

In [91]:
def fit_model_cv(model, train_set, target, n_splits=10, n_repeats=5, random_state=1):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    Args:
        model: estimator passed to ``cross_val_score``.
        train_set: feature matrix passed to ``cross_val_score``.
        target: labels passed to ``cross_val_score``.
        n_splits: number of folds per repetition.
        n_repeats: number of times the k-fold split is repeated.
        random_state: seed handed to ``RepeatedStratifiedKFold``.

    Returns:
        tuple: (mean accuracy, standard deviation of the accuracies,
        elapsed seconds) over all ``n_splits * n_repeats`` fits.

    Raises:
        Any exception raised while fitting/scoring a fold is propagated,
        because ``error_score='raise'`` is used.
    """
    # perf_counter is a monotonic high-resolution clock, so the measured
    # duration cannot be distorted by system clock adjustments.
    start = time.perf_counter()
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)

    n_scores = cross_val_score(model, train_set, target, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    elapsed = time.perf_counter() - start

    return np.mean(n_scores), np.std(n_scores), elapsed


In [92]:
def get_models_for_fit():
    """Create the three classifiers to compare, each with random_state=1.

    Returns:
        list: new XGBClassifier, LGBMClassifier and RandomForestClassifier
        instances, in that order; all other hyper-parameters are left at the
        library defaults.
    """
    constructors = (xgb.XGBClassifier, LGBMClassifier, RandomForestClassifier)
    return [make_model(random_state=1) for make_model in constructors]

In [93]:
# Evaluate every model on 500 distinct random feature subsets and collect one
# result row per (feature subset, model) pair.
# NOTE(review): the `random` module is not seeded here, so the drawn subsets
# differ between runs.
sample_list = []  # feature subsets that have already been evaluated

results_dict = get_dic_acc()

for sample_idx in range(0, 500):
    # Redraw until the subset is one that has not been evaluated before, so
    # that every iteration works on a distinct feature subset.
    features_sample_list = get_feature_sample()
    while features_sample_list in sample_list:
        features_sample_list = get_feature_sample()
    sample_list.append(features_sample_list)

    # Fresh, unfitted models for every subset.
    models = get_models_for_fit()
    train_set = x_train.iloc[:, features_sample_list]

    for model in models:
        mean, std, timed = fit_model_cv(model, train_set, target)

        results_dict['accuracy'].append(mean)
        results_dict['std'].append(std)
        results_dict['time'].append(timed)
        results_dict['features'].append(features_sample_list)
        # Only the textual repr of the estimator is stored, not the object.
        results_dict['model'].append(str(model))

results_df = pd.DataFrame(results_dict)

    

In [94]:
# Display the collected results: one row per (feature subset, model) pair.
results_df

Unnamed: 0,accuracy,std,time,features,model
0,0.790699,0.013486,92.759706,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1,0.788571,0.015089,6.990095,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","LGBMClassifier(boosting_type='gbdt', class_wei..."
2,0.789938,0.012442,25.098975,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14,...","RandomForestClassifier(bootstrap=True, ccp_alp..."
3,0.792696,0.012842,69.955751,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","XGBClassifier(base_score=0.5, booster='gbtree'..."
4,0.791172,0.013941,6.225833,"[0, 1, 3, 4, 7, 8, 9, 10, 12, 13, 14, 15, 17, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
...,...,...,...,...,...
1495,0.791224,0.011108,5.547646,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
1496,0.791041,0.011719,21.192583,"[0, 1, 2, 3, 4, 5, 7, 10, 11, 12, 13, 14, 15, ...","RandomForestClassifier(bootstrap=True, ccp_alp..."
1497,0.786390,0.013611,63.288443,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","XGBClassifier(base_score=0.5, booster='gbtree'..."
1498,0.784839,0.012647,6.059586,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15...","LGBMClassifier(boosting_type='gbdt', class_wei..."


In [97]:
#results_df.to_csv('data/tree_emsamble_default.csv', index=False)
# Show the 10 result rows with the lowest mean accuracy, in ascending order.
results_df.nsmallest(10, 'accuracy')

Unnamed: 0,accuracy,std,time,features,model
289,0.777169,0.012949,5.860996,"[0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 1...","LGBMClassifier(boosting_type='gbdt', class_wei..."
290,0.778825,0.013488,22.711047,"[0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 1...","RandomForestClassifier(bootstrap=True, ccp_alp..."
603,0.779586,0.013869,64.235552,"[0, 1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, ...","XGBClassifier(base_score=0.5, booster='gbtree'..."
288,0.780321,0.011129,64.321241,"[0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 12, 13, 14, 1...","XGBClassifier(base_score=0.5, booster='gbtree'..."
604,0.780559,0.01413,6.237778,"[0, 1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, ...","LGBMClassifier(boosting_type='gbdt', class_wei..."
283,0.780795,0.011262,5.940878,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15...","LGBMClassifier(boosting_type='gbdt', class_wei..."
284,0.781689,0.011033,22.038625,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15...","RandomForestClassifier(bootstrap=True, ccp_alp..."
592,0.781819,0.013665,5.835602,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","LGBMClassifier(boosting_type='gbdt', class_wei..."
605,0.782082,0.012943,21.513456,"[0, 1, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, ...","RandomForestClassifier(bootstrap=True, ccp_alp..."
1167,0.782424,0.011986,63.81632,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14...","XGBClassifier(base_score=0.5, booster='gbtree'..."
