In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# metrics
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

#xgb
import xgboost as xgb

from pathlib import Path

In [2]:
tweets = pd.read_csv("data/train_pre_processing.csv")
test = pd.read_csv("data/test_pre_processing.csv")
tweets = tweets.fillna('0')
test = test.fillna('0')

print(test.info())

submit = test['id'].to_frame()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 38 columns):
id                      3263 non-null int64
keyword                 3263 non-null object
location                3263 non-null object
text                    3263 non-null object
keyword_grouped         3263 non-null object
text_contain_keyword    3263 non-null object
total_words             3263 non-null int64
len_text                3263 non-null int64
total_upper_chars       3263 non-null int64
total_numbers_chars     3263 non-null int64
total_special_chars     3263 non-null int64
total_common_chars      3263 non-null int64
contain_question        3263 non-null bool
contain_link            3263 non-null bool
contain_hashtag         3263 non-null bool
contain_upper_words     3263 non-null bool
total_3_words           3263 non-null int64
total_4_words           3263 non-null int64
total_5_words           3263 non-null int64
total_6_words           3263 non-null int64
total_7_word

In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 38 columns):
keyword                 7613 non-null object
location                7613 non-null object
text                    7613 non-null object
keyword_grouped         7613 non-null object
text_contain_keyword    7613 non-null object
total_words             7613 non-null int64
len_text                7613 non-null int64
total_upper_chars       7613 non-null int64
total_numbers_chars     7613 non-null int64
total_special_chars     7613 non-null int64
total_common_chars      7613 non-null int64
contain_question        7613 non-null bool
contain_link            7613 non-null bool
contain_hashtag         7613 non-null bool
contain_upper_words     7613 non-null bool
total_3_words           7613 non-null int64
total_4_words           7613 non-null int64
total_5_words           7613 non-null int64
total_6_words           7613 non-null int64
total_7_words           7613 non-null int64
total_8_word

## Encode: LabelEncoder

In [4]:
encode_columns = ['keyword', 'location', 'text', 'keyword_grouped', 'text_contain_keyword']

encode_tweets = tweets[encode_columns]
encode_tweets = encode_tweets.astype('str')
encode_tweets = encode_tweets.apply(LabelEncoder().fit_transform)
tweets_encode_drop = tweets.drop(encode_columns, axis = 1)
tweets_encode = pd.concat([tweets_encode_drop, encode_tweets], axis = 1)
tweets_encode.drop(axis=1, labels=['target'], inplace=True)

Y = tweets.iloc[:,-1]

In [5]:
seed = 7
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(tweets_encode, Y, test_size=0.20, random_state=42)

In [6]:
xgb_model = xgb.XGBClassifier(objective="reg:linear", random_state=10, learning_rate=0.05, 
                             max_depth= 3, n_estimators=350, min_child_weight=5, gamma=0.3, subsample=0.8, 
                             colsample_bytree=0.5, reg_alpha=0.1, seed=123)

xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.3, learning_rate=0.05,
       max_delta_step=0, max_depth=3, min_child_weight=5, missing=None,
       n_estimators=350, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=10, reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1,
       seed=123, silent=True, subsample=0.8)

In [7]:
preds = xgb_model.predict(X_test)

In [8]:
roc_auc_score(y_test, preds)

0.7250011459277255

In [9]:
acc = accuracy_score(preds,y_test)
print("ACC: %f" % (acc))

ACC: 0.736047


In [10]:
# 0.7276632241822484

## Predecir Kaggle test set

In [11]:
encode_test = test[encode_columns]
encode_test = encode_test.astype('str')
encode_test = encode_test.apply(LabelEncoder().fit_transform)
encode_test_drop = test.drop(encode_columns, axis = 1)
test_encode = pd.concat([encode_test_drop, encode_test], axis = 1)

In [12]:
ret = xgb_model.predict(test_encode.drop(axis=1, labels=['id']))

In [13]:
print(ret)

[0 0 1 ... 1 1 1]


In [14]:
submit['target'] = pd.DataFrame(ret)
submit

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
5,12,0
6,21,1
7,22,0
8,27,0
9,29,0


In [15]:
Path("result").mkdir(parents=True, exist_ok=True)
submit.to_csv('result/submit.csv', index=False)