In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

In [60]:
# Start the results table from the ids of the raw test file; every model
# below adds its predictions to it as a new column.
test = pd.read_csv("data/test.csv")
ensemble_submit = test.loc[:, ['id']].copy()

## LightGBM

In [61]:
# LightGBM vote 1: model fitted on the "pre_processing_true" files.
train = pd.read_csv("data/train_pre_processing_true.csv")
test = pd.read_csv("data/test_pre_processing_true.csv")

# Features are the float64/int64/bool columns. On the train side the last of
# those columns is left out and the label is the last column of the raw frame.
# NOTE(review): presumably that last column is the target -- confirm against the CSV.
numeric_kinds = ['float64', 'int64', 'bool']
y_train = train.iloc[:, -1]
x_train = train.select_dtypes(include=numeric_kinds).iloc[:, :-1]
x_test = test.drop('id', axis=1).select_dtypes(include=numeric_kinds)

light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_1 = light_model.predict(x_test)
ensemble_submit['target_lgbm_pred1'] = preds_1

In [62]:
# LightGBM vote 2: model fitted on the "pre_processing_false" files.
train = pd.read_csv("data/train_pre_processing_false.csv")
test = pd.read_csv("data/test_pre_processing_false.csv")

# Features are the float64/int64/bool columns. On the train side the last of
# those columns is left out and the label is the last column of the raw frame.
# NOTE(review): presumably that last column is the target -- confirm against the CSV.
numeric_kinds = ['float64', 'int64', 'bool']
y_train = train.iloc[:, -1]
x_train = train.select_dtypes(include=numeric_kinds).iloc[:, :-1]
x_test = test.drop('id', axis=1).select_dtypes(include=numeric_kinds)

# Replace every non-alphanumeric character of each column name with "_",
# using the same rule for the train and the test features.
for frame in (x_train, x_test):
    frame.columns = [
        "".join(ch if ch.isalnum() else "_" for ch in str(label))
        for label in frame.columns
    ]

light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_2 = light_model.predict(x_test)
ensemble_submit['target_lgbm_pred2'] = preds_2

In [63]:
# LightGBM vote 3: model fitted on the "pre_processing_true_false" files.
train = pd.read_csv("data/train_pre_processing_true_false.csv")
test = pd.read_csv("data/test_pre_processing_true_false.csv")

# Features are the float64/int64/bool columns. On the train side the last of
# those columns is left out and the label is the last column of the raw frame.
# NOTE(review): presumably that last column is the target -- confirm against the CSV.
numeric_kinds = ['float64', 'int64', 'bool']
y_train = train.iloc[:, -1]
x_train = train.select_dtypes(include=numeric_kinds).iloc[:, :-1]
x_test = test.drop('id', axis=1).select_dtypes(include=numeric_kinds)

# Replace every non-alphanumeric character of each column name with "_",
# using the same rule for the train and the test features.
for frame in (x_train, x_test):
    frame.columns = [
        "".join(ch if ch.isalnum() else "_" for ch in str(label))
        for label in frame.columns
    ]

light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_3 = light_model.predict(x_test)
ensemble_submit['target_lgbm_pred3'] = preds_3

In [64]:
# Put the three LightGBM prediction vectors side by side, one column per run.
preds_total = pd.DataFrame(dict(zip(['uno', 'dos', 'tres'], [preds_1, preds_2, preds_3])))

In [65]:
# Majority vote of the three LightGBM runs: 1 when at least two of the three
# predictions are 1, else 0. Computed column-wise instead of with a per-row
# Python lambda chaining `and`/`or`.
# Assumes the predictions are 0/1 labels; for those inputs the result equals
# (uno and dos) or (tres and dos) or (uno and tres).
ensemble_submit['target_lgbm'] = (
    preds_total[['uno', 'dos', 'tres']].sum(axis=1) >= 2
).astype('int64')

## XGBoost

In [66]:
# Load the "nlp_5000" feature files used by XGBoost and discard the row id,
# which is not used as a feature.
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv').drop(columns=['id'])
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv').drop(columns=['id'])

In [67]:
# Columns removed from both frames before the XGBoost features are built.
noise_cols = [
    'location',
    'len_location_cero_default',
    'total_words_location_cero_default',
    'total_words_location_mean_default',
    'text',
]
train = train.drop(columns=noise_cols)
test = test.drop(columns=noise_cols)

In [68]:
def prepare_df_for_fit(df):
    """Return a copy of ``df`` whose string columns are label-encoded.

    The columns ``keyword``, ``text_clean`` and ``keyword_grouped`` are cast to
    ``str``, replaced by the integer codes of a ``LabelEncoder`` and moved to
    the end of the frame. A ``target`` column, when present, is removed from
    the result. ``df`` itself is left untouched.

    The encoder is fitted on the values of ``df`` only, so the codes returned
    for two different frames are not guaranteed to agree.
    """
    columns_str = ['keyword', 'text_clean', 'keyword_grouped']

    # A single encoder object, re-fitted on each string column in turn.
    label_encoder = LabelEncoder()
    encoded_part = df[columns_str].astype('str').apply(label_encoder.fit_transform)

    # Everything that is not label-encoded keeps its position; codes go last.
    remaining_part = df.drop(columns_str, axis=1)
    features = pd.concat([remaining_part, encoded_part], axis=1)

    if 'target' not in features.columns:
        return features
    return features.drop(labels=['target'], axis=1)

In [71]:
# Encode train and test in ONE call. prepare_df_for_fit fits its label
# encoders on whatever frame it receives, so two separate calls would give the
# same keyword a different integer code in train_X and in test_X, and the model
# would then be applied to features that do not mean what it was trained on.
# The `keys` level lets the two parts be split again with their original index.
stacked = pd.concat([train.drop(columns=['target']), test], keys=['train', 'test'])
stacked_X = prepare_df_for_fit(stacked)
train_X = stacked_X.loc['train']
test_X = stacked_X.loc['test']
train_Y = train['target']

In [72]:
# XGBoost classifier for the target column.
# * objective: "reg:linear" is a deprecated squared-error *regression*
#   objective; a classifier should optimise "binary:logistic".
#   NOTE(review): predictions will differ from runs made with the old
#   objective, and this assumes a 0/1 target -- confirm.
# * seed: the original passed both random_state=10 and its alias seed=123.
#   Only one can take effect, so a single random_state is kept.
#   NOTE(review): 123 is presumably the value that was effective before
#   (the explicit `seed`) -- confirm for the installed xgboost version.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=123,
    colsample_bytree=0.5,
    gamma=0.1,
    learning_rate=0.06,
    max_depth=5,
    min_child_weight=2,
    n_estimators=300,
    reg_alpha=0.1,
    subsample=0.9,
)
xgb_model.fit(train_X, train_Y)
preds = xgb_model.predict(test_X)
ensemble_submit['target_xgb'] = preds



## Result

In [73]:
# Sanity check of the assembled prediction table: column names, dtypes and
# non-null counts of every model's output.
ensemble_submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   id                 3263 non-null   int64
 1   target_lgbm_pred1  3263 non-null   int64
 2   target_lgbm_pred2  3263 non-null   int64
 3   target_lgbm_pred3  3263 non-null   int64
 4   target_lgbm        3263 non-null   int64
 5   target_xgb         3263 non-null   int64
dtypes: int64(6)
memory usage: 153.1 KB


In [74]:
# Names of all prediction columns (everything except the row id). Only the
# disabled all-columns vote count below uses this list in the visible cells.
column_list = [name for name in ensemble_submit.columns if name != "id"]
# ensemble_submit["sum"] = ensemble_submit[column_list].sum(axis=1)

# Per-row sum of the two final votes: the LightGBM majority vote and XGBoost.
ensemble_submit["general_sum"] = ensemble_submit.loc[:, ['target_lgbm', 'target_xgb']].sum(axis=1)
ensemble_submit.head(5)

Unnamed: 0,id,target_lgbm_pred1,target_lgbm_pred2,target_lgbm_pred3,target_lgbm,target_xgb,sum,general_sum
0,0,1,1,1,1,0,4,1
1,2,1,1,1,1,1,5,2
2,3,1,1,1,1,1,5,2
3,9,1,1,1,1,1,5,2
4,11,1,1,1,1,1,5,2


In [75]:
# Final label: 1 when general_sum reaches 2, otherwise 0.
ensemble_submit["target"] = (ensemble_submit["general_sum"] >= 2).astype("int64")

In [76]:
# Keep only the row id and the final label, then write the table to CSV
# without the DataFrame index.
ensemble_submit = ensemble_submit.loc[:, ['id', "target"]]
ensemble_submit.to_csv(path_or_buf='submit_xgb_lgbm_diff_dataset.csv', index=False)