In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Para correrlo en colab
#from google.colab import files
#import io
#fin

from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

import time
start = time.time()

In [2]:
test = pd.read_csv("data/test.csv")
ensemble_submit = test['id'].to_frame()

## CNN

In [3]:
from   keras                        import initializers, regularizers, constraints, callbacks, optimizers
from   keras.layers                 import Conv1D, Embedding, GlobalMaxPooling1D, concatenate, Input, Dense,Dropout
from   keras.models                 import Sequential, Model
from   keras.preprocessing.sequence import pad_sequences
from   keras.preprocessing.text     import Tokenizer
from   sklearn.model_selection      import train_test_split
from   gensim.models.keyedvectors   import KeyedVectors

### Preparacion de datos

In [4]:
train_cnn = pd.read_csv('data/train_cnn.csv')
test_cnn = pd.read_csv('data/test_cnn.csv')

In [5]:
test_cnn.text_clean.fillna(" ", inplace=True)
test_it=test_cnn[["id","text_clean"]]
total_words = train_cnn[["id","text_clean"]]
total_words=pd.concat([total_words,test_it], sort = 'False')

In [6]:
numWords=20000
tokenizer = Tokenizer(num_words = numWords)
tokenizer.fit_on_texts(total_words.text_clean)
sequences_train  = tokenizer.texts_to_sequences(train_cnn['text_clean'])
sequences_test  =  tokenizer.texts_to_sequences(test_cnn['text_clean'])
word_index       = tokenizer.word_index

In [7]:
max_length       = 300
x_train          = pad_sequences(sequences_train, maxlen = max_length)
x_test           = pad_sequences(sequences_test, maxlen = max_length)
x_features_train = train_cnn[['len_text', 'count_word_uniques',"text_contain_keyword_norm","contain_question_norm","contain_hashtag_norm"]]
x_features_test  = test_cnn[['len_text', 'count_word_uniques',"text_contain_keyword_norm","contain_question_norm","contain_hashtag_norm"]]
y_train          = train_cnn["target"].values

In [8]:
word_vectors = KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary = True)

In [9]:
embeddind_dimension = 300
vocabulary_size = min(len(word_index) + 1, numWords)
embedding_matrix = np.zeros((vocabulary_size, embeddind_dimension))

In [10]:
for word, i in word_index.items():
    if i < numWords:
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), embeddind_dimension)

del(word_vectors)

In [11]:
e = Embedding(len(embedding_matrix), 200, input_length=300)

In [12]:
Xtrain1, Xval1, ytrain1, yval1 = train_test_split(x_train, y_train,
                                              train_size = 0.95, random_state = 233)

Xfeaturestrain1, Xfeaturesval1, ytrain1, yval1 = train_test_split(x_features_train, y_train,
                                                            train_size = 0.95, random_state = 233)

### Uso de modelos

In [13]:
model = Sequential()
model.add(e)
model.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.25))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 200)          4000000   
_________________________________________________________________
conv1d (Conv1D)              (None, 299, 100)          40100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 256)               25856     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 4,066,213
Trainable params: 4,066,213
Non-trainable params: 0
______________________________________________

In [14]:
model.fit(Xtrain1, ytrain1,
          validation_data=(Xval1, yval1), 
          epochs=5, 
          batch_size=300,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x217627550>

In [15]:
y_pred1 = model.predict_classes(x_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [16]:
model3 = Sequential()
model3.add(e)
model3.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model3.add(GlobalMaxPooling1D())
model3.add(Dense(200, activation='relu'))
model3.add(Dropout(0.25))
model3.add(Dense(200, activation='relu'))
model3.add(Dense(100, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model3.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 200)          4000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 299, 100)          40100     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 200)               20200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_4 (Dense)              (None, 100)              

In [17]:
model3.fit(x = [Xtrain1, Xfeaturestrain1], y = ytrain1, epochs=5,
          validation_data = ([Xval1, Xfeaturesval1], yval1), verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2176a8210>

In [18]:
y_pred3 = model3.predict_classes([x_test, x_features_test])

In [19]:
model4 = Sequential()
model4.add(e)
model4.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model4.add(GlobalMaxPooling1D())
model4.add(Dense(200, activation='relu'))
model4.add(Dropout(0.25))
model4.add(Dense(200, activation='relu'))
model4.add(Dense(100, activation='relu'))
model4.add(Dense(50, activation='relu'))
model4.add(Dropout(0.25))
model4.add(Dense(1, activation='sigmoid'))
model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 200)          4000000   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 299, 100)          40100     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 200)               20200     
_________________________________________________________________
dropout_2 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_8 (Dense)              (None, 100)              

In [20]:
model4.fit(x = [Xtrain1, Xfeaturestrain1], y = ytrain1, epochs=5,
          validation_data = ([Xval1, Xfeaturesval1], yval1), verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x11d8d3890>

In [21]:
y_pred4 = model4.predict_classes([x_test, x_features_test])

In [22]:
submitFinal = test['id'].to_frame()
submitFinal["pred1"] = y_pred1
submitFinal["pred2"] = y_pred3
submitFinal["pred3"] = y_pred4
submitFinal["suma"] = submitFinal.pred1+submitFinal.pred2+submitFinal.pred3
ensemble_submit["target_cnn"] = submitFinal.suma.apply(lambda x: 1 if x>2 else 0)

## Light GBM

In [23]:
train = pd.read_csv("data/train_pre_processing_true.csv")
test = pd.read_csv("data/test_pre_processing_true.csv")
x_train, y_train = train.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],train.iloc[:,-1]
x_test = test.drop(columns=['id']).select_dtypes(include=['float64','int64','bool'])
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_1 = light_model.predict(x_test)

In [24]:
train = pd.read_csv("data/train_pre_processing_false.csv")
test = pd.read_csv("data/test_pre_processing_false.csv")
x_train, y_train = train.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],train.iloc[:,-1]
x_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_train.columns]
x_test = test.drop(columns=['id']).select_dtypes(include=['float64','int64','bool'])
x_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_test.columns]
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_2 = light_model.predict(x_test)

In [25]:
train = pd.read_csv("data/train_pre_processing_true_false.csv")
test = pd.read_csv("data/test_pre_processing_true_false.csv")
x_train, y_train = train.select_dtypes(include=['float64','int64','bool']).iloc[:,:-1],train.iloc[:,-1]
x_train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_train.columns]
x_test = test.drop(columns=['id']).select_dtypes(include=['float64','int64','bool'])
x_test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in x_test.columns]
light_model = LGBMClassifier(random_state=1)
light_model.fit(x_train, y_train)
preds_3 = light_model.predict(x_test)

In [26]:
preds_total = pd.DataFrame({'uno':preds_1,'dos':preds_2,'tres':preds_3})

In [27]:
ensemble_submit['target_lgbm'] = preds_total.apply(lambda x: (x.uno and x.dos) or (x.tres and x.dos) or (x.uno and x.tres),axis=1)

## XGBoost

In [28]:
train = pd.read_csv('data/train_pre_processing_nlp_5000.csv')
test = pd.read_csv('data/test_pre_processing_nlp_5000.csv')
train.drop(labels=['id'], axis=1, inplace=True)
test.drop(labels=['id'], axis=1, inplace=True)

In [29]:
noise_cols = ['location','len_location_cero_default', 
             'total_words_location_cero_default',
             'total_words_location_mean_default', 
              'text']
train.drop(labels=noise_cols, axis=1, inplace=True)
test.drop(labels=noise_cols, axis=1, inplace=True)

In [30]:
def prepare_df_for_fit(df):
    columns_str = ['keyword', 'text_clean', 'keyword_grouped']
    
    # Encode with LabelEncoder
    encoded_cols = df[columns_str]
    encoded_cols = encoded_cols.astype('str')
    encoded_cols = encoded_cols.apply(LabelEncoder().fit_transform)
    encoded_drop = df.drop(columns_str, axis = 1)
    encoded_df = pd.concat([encoded_drop, encoded_cols], axis = 1)
    # Drop Target column
    if 'target' in encoded_df.columns:
        encoded_df.drop(axis=1, labels=['target'], inplace=True)

    return encoded_df

In [31]:
train_X = prepare_df_for_fit(train)
test_X = prepare_df_for_fit(test) 
train_Y = train['target']

In [32]:
xgb_model = xgb.XGBClassifier(objective="reg:linear", random_state=10, colsample_bytree = 0.5, 
                             gamma = 0.1, learning_rate = 0.06, max_depth = 5, min_child_weight = 2, 
                             n_estimators = 300, reg_alpha = 0.1, seed = 123, subsample = 0.9)
xgb_model.fit(train_X, train_Y)
preds = xgb_model.predict(test_X)
ensemble_submit['target_xgb'] = preds



## Result

In [33]:
ensemble_submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   id           3263 non-null   int64
 1   target_cnn   3263 non-null   int64
 2   target_lgbm  3263 non-null   int64
 3   target_xgb   3263 non-null   int64
dtypes: int64(4)
memory usage: 102.1 KB


In [34]:
ensemble_submit["target"] = ensemble_submit.apply(lambda x: (x.target_lgbm and x.target_xgb) or (x.target_lgbm and x.target_cnn) or (x.target_xgb and x.target_cnn),axis=1)

In [35]:
ensemble_submit = ensemble_submit[['id',"target"]]
ensemble_submit.to_csv('submit_xgb_lgbm_cnn_diff_dataset.csv', index=False)

In [36]:
end = time.time()
print(f"Total run time: {round(end - start, 2)}s")

Total run time: 1150.11s
