In [449]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, GridSearchCV
import numpy as np
import time
import pandas as pd
from geopy.geocoders import Nominatim
import math
from matplotlib import pyplot
import re



In [450]:
tweets = pd.read_csv("data/train_pre_processing.csv")

# Pruebas solo con variables numéricas y booleanas

In [451]:
x_features, target = tweets.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],tweets.iloc[:,-1]

In [452]:
x_train, x_test, y_train, y_test = train_test_split(x_features, target, test_size=0.2, random_state=123)

# Pruebas con valores por default del LGBMClassifier

In [453]:
start = time.time()
light_model = LGBMClassifier(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)
n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()


print('Accuracy: %.3f (std :%.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end-start)/60))


Accuracy: 0.693 (std :0.017). Time: 0.18


# Pruebas descartando una columna (todas las combinaciones)

In [17]:
x_train_columns = {}

for x in range(0,30):
    column_list = []
    for y in range(0,30):
        if y != x:
            column_list.append(y)
    x_train_columns[x] = column_list

def get_dic_acc():
    results = {}
    results['accuracy'] = []
    results['std'] = []
    results['time'] = []
    
    return results

In [7]:
no_column = get_dic_acc()
index_no_column = []


for x in range(0,30):
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_train_2 = x_train.iloc[:,x_train_columns[x]]
    n_scores = cross_val_score(light_model, x_train_2, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(x_train.columns[x])
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

pd.DataFrame(no_column, index=index_no_column).nlargest(10,'accuracy')     


Unnamed: 0,accuracy,std,time
total_4_ormore_words,0.693974,0.016061,0.136329
subjectivity_text,0.693826,0.016843,0.126967
total_5_ormore_words,0.693695,0.016156,0.135257
total_6_orless_words,0.69358,0.016874,0.142286
total_7_ormore_words,0.693399,0.015039,0.134584
total_8_words,0.69335,0.016938,0.135639
total_5_words,0.693202,0.015872,0.135478
total_7_words,0.692742,0.015241,0.1377
total_words,0.692693,0.016329,0.136584
total_7_orless_words,0.692562,0.015567,0.138968


realizo una prueba eliminando aquellas columnas que no disminuyeron el resultado de Accuracy con tres decimales 

In [8]:
x_train_2 = x_train.drop(columns=['total_4_ormore_words','subjectivity_text','total_5_ormore_words','total_6_orless_words','total_7_ormore_words','total_8_words','total_5_words']) 

start = time.time()

light_model = LGBMClassifier(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)

n_scores = cross_val_score(light_model, x_train_2, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()

print('ACC: %.3f (desv:  %.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end - start)/60))

ACC: 0.693 (desv:  0.016). Time: 0.12


# Pruebas variando parámetros según https://machinelearningmastery.com/configure-gradient-boosting-algorithm/

# Pruebas variando n_estimators

In [454]:
n_estimators_best = 0
n_estimators_acc = 0
n_estimators_std = 0

n_estimators_dic = get_dic_acc()
index_n_estimators = []

for x in range(100,501,40):
    start = time.time()
    light_model = LGBMClassifier(random_state=1, n_estimators = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_n_estimators.append(x)
    n_estimators_dic['accuracy'].append(mean)
    n_estimators_dic['std'].append(np.std(std))
    n_estimators_dic['time'].append((end - start)/60)
    
    if (mean > n_estimators_acc) or (mean == n_estimators_acc and std < n_estimators_std): 
        n_estimators_best = x
        n_estimators_acc = mean
        n_estimators_std = std
 
    
print("Best n_estimators: %d" % (n_estimators_best))


Best n_estimators: 100


In [455]:
n_estimators_df = pd.DataFrame(n_estimators_dic, index=index_n_estimators).nlargest(20,'accuracy')
n_estimators_df

Unnamed: 0,accuracy,std,time
100,0.693235,0.0,0.137145
140,0.692151,0.0,0.203674
180,0.690049,0.0,0.282271
220,0.688982,0.0,0.279113
260,0.686404,0.0,0.376512
300,0.686256,0.0,0.357027
340,0.685107,0.0,0.421842
420,0.684959,0.0,0.513827
380,0.684401,0.0,0.58792
460,0.683563,0.0,0.581347


# Pruebas variando learning_rate

In [456]:
learning_rate_best = 0
learning_rate_acc = 0
learning_rate_std = 0

learning_rate_dic = get_dic_acc()
index_learning_rate = []

for x in range(1,11,1):
    start = time.time()
    x = x/100
    light_model = LGBMClassifier(random_state=1, n_estimators = n_estimators_best, learning_rate = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_learning_rate.append(x)
    learning_rate_dic['accuracy'].append(mean)
    learning_rate_dic['std'].append(np.std(std))
    learning_rate_dic['time'].append((end - start)/60)
    
    if (mean > learning_rate_acc) or (mean == learning_rate_acc and std < learning_rate_std): 
        learning_rate_best = x
        learning_rate_acc = mean
        learning_rate_std = std
 
    
print("Best learning_rate: %.3f" % (learning_rate_best))


Best learning_rate: 0.060


In [457]:
learning_rate_df = pd.DataFrame(learning_rate_dic, index=index_learning_rate).nlargest(20,'accuracy')
learning_rate_df

Unnamed: 0,accuracy,std,time
0.06,0.695796,0.0,0.164012
0.05,0.695501,0.0,0.154972
0.04,0.695074,0.0,0.164469
0.09,0.694663,0.0,0.143728
0.07,0.694581,0.0,0.15852
0.08,0.69353,0.0,0.146434
0.03,0.6933,0.0,0.163415
0.1,0.693235,0.0,0.144655
0.02,0.690049,0.0,0.17569
0.01,0.681346,0.0,0.171309


# Pruebas variando subsample


In [458]:
subsample_best = 0
subsample_acc = 0
subsample_std = 0

subsample_dic = get_dic_acc()
index_subsample = []

for x in range(1,11,1):
    start = time.time()
    x = x/10
    light_model = LGBMClassifier(random_state=1, n_estimators = n_estimators_best, learning_rate = learning_rate_best,subsample = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_subsample.append(x)
    subsample_dic['accuracy'].append(mean)
    subsample_dic['std'].append(np.std(std))
    subsample_dic['time'].append((end - start)/60)
    
    if (mean > subsample_acc) or (mean == subsample_acc and std < subsample_std): 
        subsample_best = x
        subsample_acc = mean
        subsample_std = std
 
    
print("Best subsample: %.2f" % (subsample_best))



Best subsample: 0.10


In [459]:
subsample_df = pd.DataFrame(subsample_dic, index=index_subsample).nlargest(20,'accuracy')
subsample_df


Unnamed: 0,accuracy,std,time
0.1,0.695796,0.0,0.152515
0.2,0.695796,0.0,0.155383
0.3,0.695796,0.0,0.163912
0.4,0.695796,0.0,0.149996
0.5,0.695796,0.0,0.15834
0.6,0.695796,0.0,0.149866
0.7,0.695796,0.0,0.154021
0.8,0.695796,0.0,0.155414
0.9,0.695796,0.0,0.16193
1.0,0.695796,0.0,0.150489


# Pruebas variando num_leaves


In [460]:
num_leaves_best = 0
num_leaves_acc = 0
num_leaves_std = 0

num_leaves_dic = get_dic_acc()
index_num_leaves = []

for x in range(21,42,2):
    start = time.time()
    light_model = LGBMClassifier(random_state=1, n_estimators = n_estimators_best, learning_rate = learning_rate_best,subsample = subsample_best, num_leaves = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_num_leaves.append(x)
    num_leaves_dic['accuracy'].append(mean)
    num_leaves_dic['std'].append(np.std(std))
    num_leaves_dic['time'].append((end - start)/60)
    
    if (mean > num_leaves_acc) or (mean == num_leaves_acc and std < num_leaves_std): 
        num_leaves_best = x
        num_leaves_acc = mean
        num_leaves_std = std
 
    
print("Best num_leaves: %d" % (num_leaves_best))



Best num_leaves: 41


In [461]:
num_leaves_df = pd.DataFrame(num_leaves_dic, index=index_num_leaves).nlargest(20,'accuracy')
num_leaves_df

Unnamed: 0,accuracy,std,time
41,0.696371,0.0,0.173512
37,0.696043,0.0,0.180911
35,0.695961,0.0,0.167752
31,0.695796,0.0,0.172101
39,0.695468,0.0,0.182489
33,0.695402,0.0,0.196915
23,0.695337,0.0,0.14781
25,0.694745,0.0,0.137749
27,0.694204,0.0,0.178376
29,0.694089,0.0,0.146089


# Pruebas variando max_depth


In [462]:
max_depth_best = 0
max_depth_acc = 0
max_depth_std = 0

max_depth_dic = get_dic_acc()
index_max_depth = []

for x in range(4,41,4):
    start = time.time()
    light_model = LGBMClassifier(random_state=1, n_estimators = n_estimators_best, learning_rate = learning_rate_best,subsample = subsample_best, num_leaves = num_leaves_best, max_depth = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_max_depth.append(x)
    max_depth_dic['accuracy'].append(mean)
    max_depth_dic['std'].append(np.std(std))
    max_depth_dic['time'].append((end - start)/60)
    
    if (mean > max_depth_acc) or (mean == max_depth_acc and std < max_depth_std): 
        max_depth_best = x
        max_depth_acc = mean
        max_depth_std = std
    
print("Best max_depth: %d" % (max_depth_best))



Best max_depth: 28


In [463]:
max_depth_df = pd.DataFrame(max_depth_dic, index=index_max_depth).nlargest(20,'accuracy') 
max_depth_df

Unnamed: 0,accuracy,std,time
28,0.696371,0.0,0.171964
32,0.696371,0.0,0.185959
36,0.696371,0.0,0.176535
40,0.696371,0.0,0.178573
24,0.69624,0.0,0.172617
20,0.695813,0.0,0.176105
16,0.695813,0.0,0.174271
12,0.695222,0.0,0.181733
8,0.694647,0.0,0.167256
4,0.687964,0.0,0.101823


# Pruebas variando min_split_gain


In [464]:
min_split_gain_best = 0
min_split_gain_acc = 0
min_split_gain_std = 0

min_split_gain_dic = get_dic_acc()
index_min_split_gain = []

for x in range(0,10,1):
    start = time.time()
    x = x/10
    light_model = LGBMClassifier(random_state=1, n_estimators = n_estimators_best, learning_rate = learning_rate_best,subsample = subsample_best, num_leaves = num_leaves_best, max_depth = max_depth_best,min_split_gain = x)
    cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

    n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    mean = np.mean(n_scores)
    std = np.std(n_scores)
    end = time.time()

    index_min_split_gain.append(x)
    min_split_gain_dic['accuracy'].append(mean)
    min_split_gain_dic['std'].append(np.std(std))
    min_split_gain_dic['time'].append((end - start)/60)
    
    if (mean > min_split_gain_acc) or (mean == min_split_gain_acc and std < min_split_gain_std): 
        min_split_gain_best = x
        min_split_gain_acc = mean
        min_split_gain_std = std
    
print("Best min_split_gain: %.2f" % (min_split_gain_best))



Best min_split_gain: 0.20


In [465]:
min_split_gain_df = pd.DataFrame(min_split_gain_dic, index=index_min_split_gain).nlargest(20,'accuracy') 
min_split_gain_df


Unnamed: 0,accuracy,std,time
0.2,0.696749,0.0,0.172912
0.0,0.696371,0.0,0.172689
0.1,0.696059,0.0,0.174648
0.3,0.695468,0.0,0.177527
0.6,0.695369,0.0,0.149954
0.8,0.694975,0.0,0.131465
0.7,0.69491,0.0,0.139592
0.4,0.694647,0.0,0.164154
0.5,0.693941,0.0,0.162242
0.9,0.693481,0.0,0.127322


# # Grid searh usando valores cercanos a los mejores parámetros encontrados anteriormente 

In [21]:
light_model = LGBMClassifier(random_state = 1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

n_estimators = n_estimators_df.nlargest(4,'accuracy').index.tolist() 
learning_rate = learning_rate_df.nlargest(4,'accuracy').index.tolist() 
subsample = subsample_df.nlargest(3,'accuracy').index.tolist()
num_leaves = num_leaves_df.nlargest(3,'accuracy').index.tolist()
max_depth = max_depth_df.nlargest(3,'accuracy').index.tolist()
min_split_gain_leaf = min_split_gain_df.nlargest(3,'accuracy').index.tolist()


grid = {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain_leaf': min_split_gain_leaf,
               'num_leaves': num_leaves,
               'subsample': subsample}
start = time.time()
grid_serch_CV = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
grid_serch_CV.fit(x_train, y_train)
end = time.time()

In [22]:
pd.DataFrame(grid_serch_CV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_split_gain_leaf,param_n_estimators,param_num_leaves,param_subsample,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
0,0.215527,0.042211,0.008707,0.009469,0.06,12,0,100,41,0.1,...,0.691297,0.694581,0.686371,0.706076,0.679803,0.683087,0.707718,0.695041,0.016181,19
1,0.209157,0.023953,0.007744,0.001197,0.06,12,0,100,41,0.2,...,0.691297,0.694581,0.686371,0.706076,0.679803,0.683087,0.707718,0.695041,0.016181,19
2,0.208894,0.031604,0.007822,0.001401,0.06,12,0,100,41,0.3,...,0.691297,0.694581,0.686371,0.706076,0.679803,0.683087,0.707718,0.695041,0.016181,19
3,0.171804,0.020180,0.006941,0.000722,0.06,12,0,100,27,0.1,...,0.673235,0.691297,0.694581,0.702791,0.676519,0.681445,0.717570,0.692069,0.016180,955
4,0.176943,0.028129,0.007167,0.001478,0.06,12,0,100,27,0.2,...,0.673235,0.691297,0.694581,0.702791,0.676519,0.681445,0.717570,0.692069,0.016180,955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,0.373623,0.097637,0.011372,0.006045,0.05,28,0.6,220,27,0.2,...,0.681445,0.688013,0.702791,0.689655,0.691297,0.681445,0.707718,0.693432,0.016881,388
1292,0.366068,0.060428,0.010908,0.001260,0.05,28,0.6,220,27,0.3,...,0.681445,0.688013,0.702791,0.689655,0.691297,0.681445,0.707718,0.693432,0.016881,388
1293,0.442704,0.082364,0.011890,0.001797,0.05,28,0.6,220,37,0.1,...,0.684729,0.699507,0.704433,0.696223,0.684729,0.696223,0.712644,0.694745,0.015732,46
1294,0.450730,0.094299,0.011934,0.002567,0.05,28,0.6,220,37,0.2,...,0.684729,0.699507,0.704433,0.696223,0.684729,0.696223,0.712644,0.694745,0.015732,46


In [23]:
geolocator = Nominatim(user_agent="orga_datos")
def getGeoData(x):
    if pd.isna(x):
        return pd.NA
    try:
        l = geolocator.geocode(x, timeout=20)
    except:
        return pd.NA
    
    if l == None:
        return pd.NA
    return (l.address, l.latitude, l.longitude)

address = tweets.location.transform(lambda x: getGeoData(x))

In [24]:
def get_column_with_cv_mean_encoding(column_name):
    data = x_train.join(y_train)
    total_true = data.groupby(column_name).target.transform(sum).fillna(0)
    dic_total = data.groupby(column_name)[column_name].count().to_dict()
    total = data[column_name].transform(lambda x: dic_total[x])
    total = total.transform(lambda x: x + 1 if x == 1 else x)
    
    return (total_true - data.target) / (total - 1)


In [25]:
x_train['latitud'] = address.transform(lambda x: 0 if pd.isna(x) else x[1]).astype('float64')
x_train['longitud'] = address.transform(lambda x:  0 if pd.isna(x) else x[2]).astype('float64')

x_train['country'] = address.transform(lambda x: 'unknown' if pd.isna(x) else x[0].split(",")[len(x[0].split(","))-1])
x_train['city'] = address.transform(lambda x: 'unknown' if pd.isna(x) else ('unknown' if len(x[0].split(",")) < 2 else x[0].split(",")[len(x[0].split(","))-2]))
x_train['keyword_grouped'] = tweets['keyword_grouped'].fillna('unknown')

x_train['country_cv_mean'] = get_column_with_cv_mean_encoding('country')
x_train['city_cv_mean'] = get_column_with_cv_mean_encoding('city')
x_train['keyword_cv_mean'] = get_column_with_cv_mean_encoding('keyword_grouped')

country_cv_mean_dict = x_train.groupby('country').country_cv_mean.mean().to_dict()
city_cv_mean_dict = x_train.groupby('city').city_cv_mean.mean().to_dict()
keyword_cv_mean_dict = x_train.groupby('keyword_grouped').keyword_cv_mean.mean().to_dict()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [26]:
x_train = x_train.select_dtypes(include=['float64','int64','bool'])
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6090 entries, 6445 to 3582
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   total_words           6090 non-null   int64  
 1   len_text              6090 non-null   int64  
 2   total_upper_chars     6090 non-null   int64  
 3   total_numbers_chars   6090 non-null   int64  
 4   total_special_chars   6090 non-null   int64  
 5   total_common_chars    6090 non-null   int64  
 6   contain_question      6090 non-null   bool   
 7   contain_link          6090 non-null   bool   
 8   contain_hashtag       6090 non-null   bool   
 9   contain_upper_words   6090 non-null   bool   
 10  total_3_words         6090 non-null   int64  
 11  total_4_words         6090 non-null   int64  
 12  total_5_words         6090 non-null   int64  
 13  total_6_words         6090 non-null   int64  
 14  total_7_words         6090 non-null   int64  
 15  total_8_words     

# Pruebas incluyendo nuevos features con hiperparámetros por default 

In [27]:
start = time.time()

light_model = LGBMClassifier(random_state = 1)
cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)

n_scores = cross_val_score(light_model, x_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()

print('ACC: %.3f (desv:  %.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end - start)/60))

ACC: 0.983 (desv:  0.005). Time: 0.44


In [28]:
x_test['latitud'] = address.transform(lambda x: 0 if pd.isna(x) else x[1]).astype('float64')
x_test['longitud'] = address.transform(lambda x:  0 if pd.isna(x) else x[2]).astype('float64')

x_test['country'] = address.transform(lambda x: 'unknown' if pd.isna(x) else x[0].split(",")[len(x[0].split(","))-1])
x_test['city'] = address.transform(lambda x: 'unknown' if pd.isna(x) else ('unknown' if len(x[0].split(",")) < 2 else x[0].split(",")[len(x[0].split(","))-2]))
x_test['keyword_grouped'] = tweets['keyword_grouped'].fillna('unknown')

x_test['country_cv_mean'] = x_test.country.transform(lambda x: country_cv_mean_dict[x] if x in country_cv_mean_dict else country_cv_mean_dict['unknown'])
x_test['city_cv_mean'] = x_test.city.transform(lambda x: city_cv_mean_dict[x] if x in city_cv_mean_dict else city_cv_mean_dict['unknown'])
x_test['keyword_cv_mean'] = x_test.keyword_grouped.transform(lambda x: keyword_cv_mean_dict[x] if x in keyword_cv_mean_dict else keyword_cv_mean_dict['unknown'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

In [29]:
x_test = x_test.select_dtypes(include=['float64','int64','bool'])
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds = light_model.predict(x_test)
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.586343


pruebas sacando features relacionados a location

In [30]:
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train.drop(columns=['city_cv_mean','country_cv_mean','longitud','latitud']), y_train)
preds = light_model.predict(x_test.drop(columns=['city_cv_mean','country_cv_mean','longitud','latitud']))
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.676953


# Pruebas con diferentes hiperparámetros (usando los mejores anteriores)

In [31]:
light_model = LGBMClassifier(random_state = 1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

n_estimators = n_estimators_df.nlargest(4,'accuracy').index.tolist() 
learning_rate = learning_rate_df.nlargest(4,'accuracy').index.tolist() 
subsample = subsample_df.nlargest(3,'accuracy').index.tolist()
num_leaves = num_leaves_df.nlargest(3,'accuracy').index.tolist()
max_depth = max_depth_df.nlargest(3,'accuracy').index.tolist()
min_split_gain_leaf = min_split_gain_df.nlargest(3,'accuracy').index.tolist()


grid = {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain_leaf': min_split_gain_leaf,
               'num_leaves': num_leaves,
               'subsample': subsample}
start = time.time()
grid_serch_CV = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
grid_serch_CV.fit(x_train, y_train)
end = time.time()

In [33]:
pd.DataFrame(grid_serch_CV.cv_results_).nlargest(10,'mean_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_split_gain_leaf,param_n_estimators,param_num_leaves,param_subsample,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
786,0.551519,0.057377,0.011268,0.001505,0.08,20,0.0,220,27,0.1,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
787,0.548723,0.073697,0.011463,0.001613,0.08,20,0.0,220,27,0.2,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
788,0.538568,0.038767,0.011131,0.000873,0.08,20,0.0,220,27,0.3,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
822,0.545359,0.050202,0.011132,0.000984,0.08,20,0.1,220,27,0.1,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
823,0.544629,0.087835,0.011456,0.001509,0.08,20,0.1,220,27,0.2,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
824,0.537407,0.040648,0.011247,0.001563,0.08,20,0.1,220,27,0.3,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
858,0.534416,0.03756,0.011758,0.002347,0.08,20,0.6,220,27,0.1,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
859,0.546641,0.056547,0.01136,0.002424,0.08,20,0.6,220,27,0.2,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
860,0.537269,0.036335,0.011371,0.001882,0.08,20,0.6,220,27,0.3,...,0.98358,0.980296,0.990148,0.988506,0.985222,0.990148,0.985222,0.985911,0.004867,1
894,0.535775,0.038065,0.011401,0.002241,0.08,28,0.0,220,27,0.1,...,0.980296,0.980296,0.98358,0.988506,0.985222,0.990148,0.985222,0.985911,0.0049,1


# Pruebas con otros features (sin los derivados de location anteriores y sin los cv mean encoding)

In [507]:
tweets_2 = pd.read_csv("data/train_pre_processing_2.csv")

In [508]:
x_features_2, target_2 = tweets_2.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],tweets_2.iloc[:,-1]

In [509]:
x_2_train, x_2_test, y_2_train, y_2_test = train_test_split(x_features_2, target_2, test_size=0.2, random_state=123)

In [510]:
x_2_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6090 entries, 6445 to 3582
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   text_contain_keyword               6090 non-null   bool   
 1   total_words                        6090 non-null   int64  
 2   len_text                           6090 non-null   int64  
 3   total_upper_chars                  6090 non-null   int64  
 4   total_numbers_chars                6090 non-null   int64  
 5   total_special_chars                6090 non-null   int64  
 6   total_common_chars                 6090 non-null   int64  
 7   contain_question                   6090 non-null   bool   
 8   contain_link                       6090 non-null   bool   
 9   contain_hashtag                    6090 non-null   bool   
 10  contain_upper_words                6090 non-null   bool   
 11  total_3_words                      6090 non-null   in

pruebas con valores por default

In [511]:
start = time.time()
light_model = LGBMClassifier(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)
n_scores = cross_val_score(light_model, x_2_train, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()


print('Accuracy: %.3f (std :%.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end-start)/60))

Accuracy: 0.712 (std :0.016). Time: 0.25


# Grid search con los mejores parámetros anteriores

In [None]:
light_model = LGBMClassifier(random_state = 1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

n_estimators = n_estimators_df.nlargest(4,'accuracy').index.tolist() 
learning_rate = learning_rate_df.nlargest(4,'accuracy').index.tolist() 
subsample = subsample_df.nlargest(3,'accuracy').index.tolist()
num_leaves = num_leaves_df.nlargest(3,'accuracy').index.tolist()
max_depth = max_depth_df.nlargest(3,'accuracy').index.tolist()
min_split_gain_leaf = min_split_gain_df.nlargest(3,'accuracy').index.tolist()


grid = {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain_leaf': min_split_gain_leaf,
               'num_leaves': num_leaves,
               'subsample': subsample}
start = time.time()
grid_serch_CV_3 = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
grid_serch_CV_3.fit(x_2_train, y_2_train)
end = time.time()

In [None]:
print((end-start)/3600)
pd.DataFrame(grid_serch_CV_3.cv_results_).nlargest(10,'mean_test_score')

# Pruebas eliminando features

Eliminando una sola columna

In [28]:
x_train_columns = {}

for x in range(0,len(x_2_train.columns)):
    column_list = []
    for y in range(0,len(x_2_train.columns)):
        if y != x:
            column_list.append(y)
    x_train_columns[x] = column_list

In [29]:
no_column = get_dic_acc()
index_no_column = []


for x in range(0,len(x_2_train.columns)):
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_2_train_2 = x_2_train.iloc[:,x_train_columns[x]]
    n_scores = cross_val_score(light_model, x_2_train_2, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(x_2_train.columns[x])
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

one_column_deleted = pd.DataFrame(no_column, index=index_no_column) 


In [30]:
one_column_deleted.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
total_7_words,0.731199,0.015945,0.175553
total_6_ormore_words,0.730837,0.014928,0.176326
total_5_orless_words,0.730772,0.015703,0.1699
text_best_similarity_location,0.730657,0.015656,0.176485
total_4_ormore_words,0.730542,0.015892,0.174773
len_location_cero_default,0.730427,0.014328,0.178933
total_upper_chars,0.730263,0.0154,0.180205
total_6_orless_words,0.730131,0.017886,0.182965
contain_hashtag,0.73,0.014951,0.185678
total_3_ormore_words,0.729918,0.016253,0.185775


Eliminando dos columnas

In [32]:
def get_all_two_columns(column_list):
    two_columns_list = []
    for x in range(0,len(column_list)):
        for y in range(1,len(column_list)-x):
            columns = []
            columns.append(column_list[x])
            columns.append(column_list[x+y])
            two_columns_list.append(columns)
    return two_columns_list
            

In [33]:
no_column = get_dic_acc()
index_no_column = []

columns_to_delete_list = get_all_two_columns(range(0,len(x_2_train.columns)))

for x in columns_to_delete_list:
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_2_train_2 = x_2_train.drop(columns= x_2_train.iloc[:,x].columns.tolist())
    n_scores = cross_val_score(light_model, x_2_train_2, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(", ".join(x_2_train.iloc[:,x].columns.tolist()))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

two_column_deleted = pd.DataFrame(no_column, index=index_no_column) 

In [69]:
two_column_deleted.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"total_6_orless_words, subjectivity_text",0.732562,0.016187,0.158562
"subjectivity_text, text_best_similarity_location",0.73225,0.016499,0.156382
"text_similarity_location, text_best_similarity_location",0.732036,0.015768,0.166033
"total_common_chars, total_4_words",0.731823,0.017137,0.16527
"total_4_ormore_words, subjectivity_text",0.731724,0.015787,0.158264
"total_7_words, text_similarity_location",0.731675,0.016149,0.166719
"total_4_orless_words, len_location_mean_default",0.731675,0.016707,0.167279
"total_7_words, total_8_words",0.731626,0.016029,0.169751
"total_7_words, subjectivity_text",0.731609,0.015812,0.159254
"total_4_words, total_8_words",0.731576,0.016034,0.170151


Se toman 10 de los features en cuya ausencia en la prueba  anterior no empeoró los resultados y se hacen pruebas eliminando todas las combinaciones posibles de estos 10 features

In [74]:
def subconjuntos(c):
    if len(c) == 0:
        return [[]]
    r = subconjuntos(c[:-1])
    return r + [s + [c[-1]] for s in r]

bad_columns = subconjuntos(['total_6_orless_words', 'subjectivity_text', 'text_best_similarity_location','text_similarity_location','total_common_chars', 'total_4_words','total_4_ormore_words','total_7_words','total_4_orless_words', 'len_location_mean_default'])
bad_columns.remove([])


In [75]:
no_column = get_dic_acc()
index_no_column = []

for x in bad_columns:
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_2_train_2 = x_2_train.drop(columns= x)
    n_scores = cross_val_score(light_model, x_2_train_2, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

all_bad_column_deleted = pd.DataFrame(no_column, index=index_no_column) 


In [76]:
all_bad_column_deleted.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_words, total_4_ormore_words, total_7_words, total_4_orless_words, len_location_mean_default",0.735222,0.015931,0.150535
"subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_ormore_words, total_7_words, total_4_orless_words, len_location_mean_default",0.735172,0.014615,0.144412
"subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_words, total_7_words, total_4_orless_words, len_location_mean_default",0.73509,0.014833,0.143564
"total_6_orless_words, subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_words, total_7_words, total_4_orless_words, len_location_mean_default",0.734762,0.015493,0.149217
"subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_words, total_4_ormore_words, total_7_words, len_location_mean_default",0.734663,0.016667,0.145873
"total_6_orless_words, subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_words, total_4_ormore_words, total_7_words, total_4_orless_words, len_location_mean_default",0.734647,0.015384,0.138616
"total_6_orless_words, subjectivity_text, text_best_similarity_location, text_similarity_location, total_common_chars, total_4_words, total_7_words, total_4_orless_words, len_location_mean_default",0.734614,0.015629,0.141391
"subjectivity_text, text_best_similarity_location, text_similarity_location, total_4_ormore_words, total_7_words, len_location_mean_default",0.734516,0.016164,0.147075
"subjectivity_text, text_similarity_location, total_4_words, total_4_ormore_words, total_7_words, len_location_mean_default",0.734401,0.01573,0.147504
"total_6_orless_words, subjectivity_text, text_best_similarity_location, total_4_words, total_4_ormore_words, total_7_words, len_location_mean_default",0.734351,0.016236,0.146812


# Eliminación de features que añadieron mayor cantidad de ruido

In [512]:
x_2_train.drop(columns=['subjectivity_text', 'text_best_similarity_location', 'text_similarity_location', 'total_4_words', 'total_4_ormore_words', 'total_7_words', 'total_4_orless_words', 'len_location_mean_default'],inplace = True)
x_2_test.drop( columns=['subjectivity_text', 'text_best_similarity_location', 'text_similarity_location', 'total_4_words', 'total_4_ormore_words', 'total_7_words', 'total_4_orless_words', 'len_location_mean_default'],inplace = True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Siguiendo la linea del TP1, sabemos que hay palabras más frecuentes en tweets verdaderos. No es posible usar las encontradas en el TP1 debido a que estos features tienen conocimiento del target y corresponden al mismo set que se dividió train y test set, por lo tanto generaremos unos nuevos a partir del train set. 

# Generación de features a partir de las palabras más frecuentes

In [540]:
words_dict = {}
for x in x_2_train.join(tweets_2.loc[:,['text','target']]).loc[:,['text','target']].iterrows():
    for word in re.split(' |\'|\*|\n|:|#|@|-|\?|\.|,|[|]|!|¡',x[1]['text']):
        word = word.lower()
        if len(word) < 4:
            continue
        if not word in words_dict:
            words_dict[word] = [0,0]
        if x[1]['target'] == 1:
            words_dict[word][0] = words_dict[word][0] + 1
        else:
            words_dict[word][1] = words_dict[word][1] + 1

            

In [541]:
words_df = pd.DataFrame(words_dict,index=['total_target_true','total_target_false']).transpose()
words_df = words_df.loc[(words_df.total_target_true + words_df.total_target_false) > 10]

In [539]:
words_100_true = []
words_100_false = []
words_90_true = []
words_90_false = []
words_85_true = []
words_85_false = []
words_80_true = []
words_80_false = []
words_75_true = []
words_75_false = []
words_70_true = []
words_70_false = []


for word in words_df.iterrows():
    false = word[1]['total_target_false']
    true = word[1]['total_target_true']
    
    if true == 0:
        words_100_false.append(word[0])
        
    if false == 0:
        words_100_true.append(word[0])
        
    if true / (true + false) >= 0.9:
        words_90_true.append(word[0])

    if false / (true + false) >= 0.9:
        words_90_false.append(word[0])
        
    if true / (true + false) >= 0.85:
        words_85_true.append(word[0])

    if false / (true + false) >= 0.85:
        words_85_false.append(word[0])
        
    if true / (true + false) >= 0.8:
        words_80_true.append(word[0])

    if false / (true + false) >= 0.8:
        words_80_false.append(word[0])

    if true / (true + false) >= 0.75:
        words_75_true.append(word[0])

    if false / (true + false) >= 0.75:
        words_75_false.append(word[0])
        
    if true / (true + false) >= 0.7:
        words_70_true.append(word[0])

    if false / (true + false) >= 0.7:
        words_70_false.append(word[0])

        


        

173

In [516]:
def text_contain_word_list(s,l):
    for word in l:
        if word.lower() in s.lower():
            return True
    return False

In [517]:
    
x_2_train['contain_words_100_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_100_true))
x_2_train['contain_words_100_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_100_false))
x_2_train['contain_words_90_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_90_true))
x_2_train['contain_words_90_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_90_false))
x_2_train['contain_words_85_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_85_true))
x_2_train['contain_words_85_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_85_false))
x_2_train['contain_words_80_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_80_true))
x_2_train['contain_words_80_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_80_false))
x_2_train['contain_words_75_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_75_true))
x_2_train['contain_words_75_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_75_false))
x_2_train['contain_words_70_true'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_70_true))
x_2_train['contain_words_70_false'] = x_2_train.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_70_false))
                                                                                     
                                                                                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [518]:
start = time.time()
light_model = LGBMClassifier(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)
n_scores = cross_val_score(light_model, x_2_train, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()


print('Accuracy: %.3f (std :%.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end-start)/60))

Accuracy: 0.790 (std :0.016). Time: 0.20


In [519]:
    
x_2_test['contain_words_100_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_100_true))
x_2_test['contain_words_100_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_100_false))
x_2_test['contain_words_90_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_90_true))
x_2_test['contain_words_90_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_90_false)) 
x_2_test['contain_words_85_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_85_true))
x_2_test['contain_words_85_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_85_false))
x_2_test['contain_words_80_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_80_true))
x_2_test['contain_words_80_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_80_false))
x_2_test['contain_words_75_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_75_true))
x_2_test['contain_words_75_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_75_false))
x_2_test['contain_words_70_true'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_70_true))
x_2_test['contain_words_70_false'] = x_2_test.join(tweets_2.loc[:,['text']]).text.transform(lambda x: text_contain_word_list(x,words_70_false))                                                                                     
                                                                                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [520]:
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_2_train, y_2_train)
preds = light_model.predict(x_2_test)
acc = accuracy_score(preds,y_2_test)
print("ACC: %f" % (acc))

ACC: 0.769534


# Se aplica otra ronda de features que pudieran estar causando ruido

In [443]:
no_column = get_dic_acc()
index_no_column = []

columns_to_delete_list = get_all_two_columns(range(0,len(x_2_train.columns)))

for x in columns_to_delete_list:
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_2_train_2 = x_2_train.drop(columns= x_2_train.iloc[:,x].columns.tolist())
    n_scores = cross_val_score(light_model, x_2_train_2, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(", ".join(x_2_train.iloc[:,x].columns.tolist()))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

two_column_deleted_2 = pd.DataFrame(no_column, index=index_no_column)

In [444]:
two_column_deleted_2.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"len_text, contain_words_75_false",0.792348,0.015111,0.207089
"total_8_words, contain_words_85_true",0.791888,0.014553,0.159134
"total_8_words, total_7_ormore_words",0.791806,0.015795,0.160728
"contain_words_100_false, contain_words_85_true",0.791511,0.015161,0.160883
"total_common_chars, total_7_orless_words",0.791494,0.015728,0.160678
"total_8_ormore_words, contain_words_100_true",0.791478,0.014966,0.159695
"contain_words_100_false, contain_words_75_false",0.791429,0.014353,0.160631
"total_upper_chars, text_contain_keyword_similarity",0.791396,0.015688,0.178859
"total_special_chars, text_contain_keyword_similarity",0.791379,0.015052,0.163718
"total_7_ormore_words, total_7_orless_words",0.791379,0.013986,0.163535


In [445]:
bad_columns = subconjuntos(['len_text', 'contain_words_75_false','total_8_words', 'contain_words_85_true', 'total_7_ormore_words','contain_words_100_false','total_common_chars', 'total_7_orless_words','total_8_ormore_words', 'contain_words_100_true'])
bad_columns.remove([])

no_column = get_dic_acc()
index_no_column = []

for x in bad_columns:
    start = time.time()

    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    x_2_train_2 = x_2_train.drop(columns= x)
    n_scores = cross_val_score(light_model, x_2_train_2, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)

all_bad_column_deleted_2 = pd.DataFrame(no_column, index=index_no_column) 

In [446]:
all_bad_column_deleted_2.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"len_text, contain_words_75_false, total_8_words, total_common_chars, total_8_ormore_words",0.792594,0.01583,0.148443
"len_text, contain_words_75_false, total_7_ormore_words, total_common_chars, total_7_orless_words",0.792365,0.014527,0.149968
"len_text, contain_words_75_false",0.792348,0.015111,0.177048
"len_text, contain_words_75_false, total_7_ormore_words, contain_words_100_false, total_common_chars, total_7_orless_words",0.792332,0.014605,0.147492
"contain_words_85_true, total_7_ormore_words, total_common_chars, total_8_ormore_words",0.792315,0.01469,0.154439
"len_text, contain_words_75_false, contain_words_85_true, total_7_ormore_words, total_common_chars, total_7_orless_words",0.792184,0.013999,0.144476
"len_text, contain_words_75_false, contain_words_85_true, contain_words_100_false, total_common_chars, total_8_ormore_words",0.792167,0.015,0.147712
"contain_words_75_false, total_7_ormore_words, contain_words_100_false, total_7_orless_words, total_8_ormore_words, contain_words_100_true",0.792167,0.015742,0.152341
"len_text, contain_words_75_false, total_8_words, contain_words_85_true, total_7_ormore_words, total_common_chars, total_7_orless_words",0.792151,0.015052,0.142735
"len_text, total_8_words, total_7_ormore_words, total_common_chars",0.792151,0.014828,0.150232


Se eliminan las columnas más ruidosas

In [521]:
x_2_train.drop(columns=['len_text', 'contain_words_75_false', 'total_7_ormore_words', 'contain_words_100_false', 'total_common_chars', 'total_7_orless_words'],inplace = True)
x_2_test.drop( columns=['len_text', 'contain_words_75_false', 'total_7_ormore_words', 'contain_words_100_false', 'total_common_chars', 'total_7_orless_words'],inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


# Gridsearch con mejores parámetros y mejores features

In [466]:
light_model = LGBMClassifier(random_state = 1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)

n_estimators = n_estimators_df.nlargest(4,'accuracy').index.tolist() 
learning_rate = learning_rate_df.nlargest(4,'accuracy').index.tolist() 
subsample = subsample_df.nlargest(3,'accuracy').index.tolist()
num_leaves = num_leaves_df.nlargest(3,'accuracy').index.tolist()
max_depth = max_depth_df.nlargest(3,'accuracy').index.tolist()
min_split_gain_leaf = min_split_gain_df.nlargest(3,'accuracy').index.tolist()


grid = {
               'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'max_depth': max_depth,
               'min_split_gain_leaf': min_split_gain_leaf,
               'num_leaves': num_leaves,
               'subsample': subsample}
start = time.time()
grid_serch_CV_4 = GridSearchCV(estimator = light_model, param_grid = grid, cv = cv, n_jobs = 2, scoring = 'accuracy')
grid_serch_CV_4.fit(x_2_train, y_2_train)
end = time.time()

In [528]:
pd.DataFrame(grid_serch_CV_4.cv_results_).nlargest(10,'mean_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_min_split_gain_leaf,param_n_estimators,param_num_leaves,param_subsample,...,split93_test_score,split94_test_score,split95_test_score,split96_test_score,split97_test_score,split98_test_score,split99_test_score,mean_test_score,std_test_score,rank_test_score
330,0.231927,0.021062,0.007535,0.000652,0.05,28,0.2,100,35,0.1,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
331,0.232341,0.022094,0.007506,0.000624,0.05,28,0.2,100,35,0.2,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
332,0.288215,0.082198,0.009127,0.004239,0.05,28,0.2,100,35,0.3,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
366,0.244358,0.049801,0.007718,0.001353,0.05,28,0.0,100,35,0.1,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
367,0.235847,0.021975,0.007598,0.00088,0.05,28,0.0,100,35,0.2,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
368,0.23352,0.02378,0.007645,0.000677,0.05,28,0.0,100,35,0.3,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
402,0.233315,0.024634,0.007691,0.001228,0.05,28,0.1,100,35,0.1,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
403,0.232805,0.02223,0.007553,0.000708,0.05,28,0.1,100,35,0.2,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
404,0.23488,0.032439,0.007711,0.000982,0.05,28,0.1,100,35,0.3,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1
438,0.235621,0.027564,0.007701,0.000729,0.05,32,0.2,100,35,0.1,...,0.807882,0.794745,0.802956,0.809524,0.779967,0.778325,0.83087,0.794647,0.015525,1


# Pruebas haciendo operaciones aritmeticas con cada dos columnas (todas las combinaciones)

In [583]:
pair_number_columns = get_all_two_columns(x_2_train.select_dtypes(include=['float64','int64']).columns)

Suma de cada par de columnas

In [584]:
no_column = get_dic_acc()
index_no_column = []

for x in pair_number_columns:
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    x_train_plus = x_2_train.copy()
    x_train_plus['plus_column'] = x_train_plus[x[0]] + x_train_plus[x[1]]
    
    n_scores = cross_val_score(light_model, x_train_plus, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_plus = pd.DataFrame(no_column, index=index_no_column)


In [585]:
results_plus.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"total_special_chars, text_best_similarity_keyword",0.793054,0.015392,0.16546
"total_3_orless_words, total_6_orless_words",0.792972,0.013598,0.163645
"total_5_ormore_words, text_best_similarity_keyword",0.79289,0.013796,0.161147
"total_3_ormore_words, text_best_similarity_keyword",0.792824,0.014866,0.174094
"total_special_chars, total_3_ormore_words",0.79266,0.014075,0.156964
"total_special_chars, total_6_ormore_words",0.79266,0.014999,0.161902
"total_5_words, ratio_short_big_words",0.792578,0.016346,0.169203
"total_8_ormore_words, polarity_text",0.792512,0.015875,0.163771
"total_8_ormore_words, text_best_similarity_keyword",0.792447,0.015726,0.168683
"total_3_words, total_3_ormore_words",0.79243,0.015044,0.165958


Resta de columnas

In [586]:
no_column = get_dic_acc()
index_no_column = []

for x in pair_number_columns:
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    x_train_minus = x_2_train.copy()
    x_train_minus['minus_column_1'] = x_train_minus[x[0]] - x_train_minus[x[1]]
    x_train_minus['minus_column_2'] = x_train_minus[x[1]] - x_train_minus[x[0]]

    n_scores = cross_val_score(light_model, x_train_minus, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_minus = pd.DataFrame(no_column, index=index_no_column)


In [587]:
results_minus.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"total_numbers_chars, total_special_chars",0.792578,0.015321,0.15971
"total_8_words, text_best_similarity_keyword",0.792496,0.014767,0.244192
"total_3_ormore_words, text_best_similarity_keyword",0.792479,0.015662,0.188715
"text_similarity_keyword, ratio_short_big_words",0.79243,0.014368,0.184998
"total_numbers_chars, stopword_count",0.792397,0.014124,0.16709
"total_words, total_8_ormore_words",0.792348,0.014741,0.165955
"total_8_words, total_8_orless_words",0.792348,0.014741,0.171366
"total_special_chars, total_3_words",0.792348,0.015492,0.221376
"total_3_orless_words, total_6_orless_words",0.792315,0.016238,0.168562
"total_upper_chars, total_8_orless_words",0.792266,0.015102,0.16476


Multiplicación de columnas

In [588]:
no_column = get_dic_acc()
index_no_column = []

for x in pair_number_columns:
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    x_train_multi = x_2_train.copy()
    x_train_multi['multi_column'] = x_train_multi[x[0]] * x_train_multi[x[1]]
    
    n_scores = cross_val_score(light_model, x_train_multi, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_multi = pd.DataFrame(no_column, index=index_no_column)


In [589]:
results_multi.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"total_special_chars, ratio_short_big_words",0.792709,0.014989,0.252796
"total_words_location_cero_default, ratio_short_big_words",0.79266,0.014851,0.190612
"stopword_count, text_similarity_keyword",0.792594,0.013323,0.193418
"total_6_words, total_6_ormore_words",0.792578,0.01432,0.187071
"total_numbers_chars, len_location_cero_default",0.792545,0.014523,0.176167
"total_3_orless_words, total_6_orless_words",0.792512,0.014187,0.200776
"total_words, polarity_text",0.792496,0.014928,0.213749
"total_8_orless_words, polarity_text",0.792463,0.014541,0.205782
"total_5_ormore_words, polarity_text",0.792463,0.014418,0.187454
"total_6_ormore_words, polarity_text",0.792447,0.013427,0.187649


Division entre columnas

In [590]:
no_column = get_dic_acc()
index_no_column = []

for x in pair_number_columns:
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    x_train_div = x_2_train.copy()
    x_train_div[x[1]] = x_train_div[x[1]].replace(0,1)
    x_train_div['div_column_1'] = x_train_div[x[0]] / x_train_div[x[1]]
    
    x_train_div = x_2_train.copy()
    x_train_div[x[0]] = x_train_div[x[0]].replace(0,1)
    x_train_div['div_column_2'] = x_train_div[x[1]] / x_train_div[x[0]]

    n_scores = cross_val_score(light_model, x_train_div, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_div = pd.DataFrame(no_column, index=index_no_column)

In [591]:
results_div.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"total_3_ormore_words, total_5_ormore_words",0.793826,0.015274,0.244083
"total_numbers_chars, text_best_similarity_keyword",0.792841,0.014767,0.200186
"total_3_words, total_3_ormore_words",0.792726,0.013544,0.171884
"total_numbers_chars, total_5_orless_words",0.792677,0.01454,0.168573
"total_special_chars, total_5_ormore_words",0.792594,0.01583,0.180655
"len_location_cero_default, text_similarity_keyword",0.792447,0.014824,0.209836
"total_8_orless_words, unique_word_count",0.792397,0.015476,0.183842
"total_3_ormore_words, total_8_orless_words",0.792381,0.014547,0.197747
"total_6_ormore_words, total_words_location_cero_default",0.792365,0.014603,0.199793
"total_numbers_chars, total_8_orless_words",0.792332,0.014951,0.176564


# Pruebas haciendo operaciones boleanas con cada dos columnas (todas las combinaciones)

In [592]:
pair_bool_columns = get_all_two_columns(x_2_train.select_dtypes(include=['bool']).columns)

In [598]:
no_column = get_dic_acc()
index_no_column = []

for x in pair_bool_columns:
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    x_train_bool = x_2_train.copy()
    x_train_bool['and'] = x_train_bool.apply(lambda y: y[x[0]] and y[x[1]],axis=1)
    x_train_bool['or'] = x_train_bool.apply(lambda y: y[x[0]] or y[x[1]],axis=1)
    x_train_bool['xor'] = x_train_bool.apply(lambda y: ((not y[x[0]]) and y[x[1]]) or (y[x[0]] and (not y[x[1]])),axis=1)
    
    n_scores = cross_val_score(light_model, x_train_bool, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(", ".join(x))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_bool = pd.DataFrame(no_column, index=index_no_column)

In [599]:
results_bool.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
"contain_upper_words, text_contain_keyword_similarity",0.792677,0.013925,0.170565
"contain_link, text_contain_word_location",0.792644,0.014578,0.176465
"contain_question, contain_words_100_true",0.792381,0.014781,0.180199
"contain_words_90_false, contain_words_75_true",0.792266,0.013714,0.170793
"text_contain_keyword_similarity, contain_words_100_true",0.792184,0.014861,0.168809
"contain_link, contain_words_90_false",0.792102,0.014546,0.187109
"text_contain_keyword, contain_link",0.792053,0.014067,0.178642
"contain_words_85_false, contain_words_80_true",0.79197,0.014479,0.169068
"contain_upper_words, contain_words_70_true",0.791954,0.01495,0.166927
"contain_words_85_false, contain_words_80_false",0.791954,0.013321,0.170306


# Se añaden los mejores resultados de las operaciones anteriores

In [601]:
#sumas
x_2_train['total_special_chars_plus_text_best_similarity_keyword'] = x_2_train.total_special_chars + x_2_train.text_best_similarity_keyword
x_2_train['total_3_orless_words_plus_total_6_orless_words'] = x_2_train.total_3_orless_words + x_2_train.total_6_orless_words
x_2_train['total_5_ormore_words_plus_text_best_similarity_keyword'] = x_2_train.total_5_ormore_words + x_2_train.text_best_similarity_keyword

#restas
x_2_train['total_numbers_chars_minus_total_special_chars'] = x_2_train.total_numbers_chars - x_2_train.total_special_chars
x_2_train['total_special_chars_minus_total_numbers_chars'] = x_2_train.total_special_chars - x_2_train.total_numbers_chars
x_2_train['total_8_words_minus_text_best_similarity_keyword'] = x_2_train.total_8_words - x_2_train.text_best_similarity_keyword
x_2_train['text_best_similarity_keyword_minus_total_8_words'] = x_2_train.text_best_similarity_keyword - x_2_train.total_8_words

#multiplicación
x_2_train['total_special_chars_multi_ratio_short_big_words'] = x_2_train.total_special_chars * x_2_train.ratio_short_big_words
x_2_train['total_words_location_cero_default_multi_ratio_short_big_words'] = x_2_train.total_words_location_cero_default * x_2_train.ratio_short_big_words
x_2_train['stopword_count_multi_text_similarity_keyword'] = x_2_train.stopword_count * x_2_train.text_similarity_keyword

#división

x_2_train['total_3_ormore_words_div_total_5_ormore_words'] = x_2_train.total_3_ormore_words / x_2_train.total_5_ormore_words.replace(0,1)
x_2_train['total_5_ormore_words_div_total_3_ormore_words'] = x_2_train.total_5_ormore_words / x_2_train.total_3_ormore_words.replace(0,1)
x_2_train['total_numbers_chars_div_text_best_similarity_keyword'] = x_2_train.total_numbers_chars / x_2_train.text_best_similarity_keyword.replace(0,1)
x_2_train['text_best_similarity_keyword_div_total_numbers_chars'] = x_2_train.text_best_similarity_keyword / x_2_train.total_numbers_chars.replace(0,1)

#operaciones boleanas
x_2_train['contain_upper_words_and_text_contain_keyword_similarity'] = x_2_train.apply(lambda y: y.contain_upper_words and y.text_contain_keyword_similarity,axis=1)
x_2_train['contain_upper_words_or_text_contain_keyword_similarity'] = x_2_train.apply(lambda y: y.contain_upper_words or y.text_contain_keyword_similarity,axis=1)
x_2_train['contain_upper_words_xor_text_contain_keyword_similarity'] = x_2_train.apply(lambda y: ((not y.contain_upper_words) and y.text_contain_keyword_similarity) or (y.contain_upper_words and (not y.text_contain_keyword_similarity)),axis=1)
x_2_train['contain_link_and_text_contain_word_location'] = x_2_train.apply(lambda y: y.contain_link and y.text_contain_word_location,axis=1)
x_2_train['contain_link_or_text_contain_word_location'] = x_2_train.apply(lambda y: y.contain_link or y.text_contain_word_location,axis=1)
x_2_train['contain_link_xor_text_contain_word_location'] = x_2_train.apply(lambda y: ((not y.contain_link) and y.text_contain_word_location) or (y.contain_link and (not y.text_contain_word_location)),axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

realizamos unas pruebas

In [602]:
start = time.time()
light_model = LGBMClassifier(random_state=1)
cv = RepeatedStratifiedKFold(n_splits=10, random_state=1)
n_scores = cross_val_score(light_model, x_2_train, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
end = time.time()


print('Accuracy: %.3f (std :%.3f). Time: %.2f' % (np.mean(n_scores), np.std(n_scores),(end-start)/60))

Accuracy: 0.794 (std :0.016). Time: 0.42


# One hot encoding con con listas de palabras más comunes en tweets verdaderos y falsos

In [625]:
no_column = get_dic_acc()
index_no_column = []

word_list_list = [words_100_true, words_100_false, words_90_true, words_90_false, words_85_true, words_85_false, words_80_true, words_80_false, words_75_true, words_75_false, words_70_true, words_70_false]

for word_list in word_list_list:
    
    x_train_one_hot_enc = x_2_train.copy()

    for word in word_list:
        is_Ascii = True
        for c in word:
            if ord(c) > 127 or ord(c) < 0:
                is_Ascii = False
                break
        if not is_Ascii:
            continue
            
        x_train_one_hot_enc[word+'_OHE'] = x_train_one_hot_enc.join(tweets.loc[:,'text']).text.transform(lambda y: word.lower() in y.lower())
    
    start = time.time()
    light_model = LGBMClassifier(random_state=1)
    cv = RepeatedStratifiedKFold(n_splits=10,random_state=1)
    
    n_scores = cross_val_score(light_model, x_train_one_hot_enc, y_2_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    end = time.time()
    
    index_no_column.append(word_list_list.index(word_list))
    no_column['accuracy'].append(np.mean(n_scores))
    no_column['std'].append(np.std(n_scores))
    no_column['time'].append((end - start)/60)


results_OHE = pd.DataFrame(no_column, index=index_no_column)

In [626]:
results_OHE.nlargest(10,'accuracy')

Unnamed: 0,accuracy,std,time
10,0.797126,0.015773,0.359531
5,0.795928,0.015288,0.285071
1,0.795764,0.015665,0.296392
3,0.795632,0.014825,0.351458
11,0.795304,0.016501,0.410486
8,0.794959,0.01538,0.325504
9,0.794943,0.015599,0.355581
7,0.794877,0.015206,0.335005
2,0.794647,0.015886,0.304725
0,0.794499,0.015916,0.374553
