In [1]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# XGBoost

* **learning_rate:** tasa de aprendizaje
* **max_depth:** máxima profundidad de cada árbol
* **subsample:** porcentaje de muestras usadas para cada árbol (valor muy bajo, posible underfitting)
* **colsample_bytree:** porcentaje de features usadas para cada árbol (valores muy alto, posible overfitting)
* **n_estimators:** cantidad de árboles a construir.
* **objective:** función de error a utilizar (algunas: reg:squarederror para regresión, reg:logistic o binary:logistic para clasificación)

Parámetros de regularización:

* **gamma:** umbral para hacer split basado en la reducción de error de hacer el nuevo split.
* **alpha:** regularización para los pesos de las hojas. Un valor más alto genera una mayor regularización.
* **lambda:** similar alpha pero para la sintonia fina.

### Dataset del TP2
* Hacemos una transformacion basica solo para probar el XGBoost

In [132]:
train = pd.read_csv('../train.csv')

train['keyword'] = train['keyword'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train['location'] = train['location'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train['text'] = train['text'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train = train[['keyword', 'location', 'text', 'target']]

### Divimos el dataSet en Train y Test

In [133]:
X, y = train.iloc[:,:-1],train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

### Instanciamos XGBoost

In [134]:
xg_reg = xgb.XGBClassifier(eval_metric='auc',n_jobs=4,silent=False)

### Entrenamos

In [135]:
xg_reg.fit(X_train,y_train)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, silent=False, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Predecimos

In [136]:
preds = xg_reg.predict(X_test)

### Calculamos el Error con 'accuracy' como medicion

In [137]:
preds = [round(value) for value in preds]
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 62.84%


### XGBoost Transformacion1 - Agregando feature 'text_contains_keyword'

In [144]:
train = pd.read_csv('../train.csv')

#cantidad rows 7613
def is_in_text(keyword, text):
    is_in = 1
    arrKeyword = keyword.split("_")
    for key in arrKeyword:
        if not(key.upper() in text.upper() and key != ''):
            is_in = 0
            break
    
    return is_in


train['keyword'] = train['keyword'].apply(lambda x: x.replace('%20', '_') if (isinstance(x, str)) else '')
train['text_contains_keyword'] = train.apply(lambda x: is_in_text(x.keyword, x.text), axis=1)
train['keyword'] = train['keyword'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train['text'] = train['text'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)

train = train[['keyword', 'text_contains_keyword', 'text', 'target']]
train

Unnamed: 0,keyword,text_contains_keyword,text,target
0,0,0,69,1
1,0,0,38,1
2,0,0,133,1
3,0,0,65,1
4,0,0,88,1
...,...,...,...,...
7608,0,0,83,1
7609,0,0,125,1
7610,0,0,65,1
7611,0,0,137,1


In [146]:
X, y = train.iloc[:,:-1],train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

xg_regT = xgb.XGBClassifier(eval_metric='auc',n_jobs=4, silent=False)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
preds = [round(value) for value in preds]
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 63.10%


### XGBoost Transformacion2 - Agregando feature 'mean_keyword'
* Cantidad de veces que se repite la keyword / la cantidad total de tweets

In [154]:
train = pd.read_csv('../train.csv')

#cantidad rows 7613
def is_in_text(keyword, text):
    is_in = 1
    arrKeyword = keyword.split("_")
    for key in arrKeyword:
        if not(key.upper() in text.upper() and key != ''):
            is_in = 0
            break
    
    return is_in


train['keyword'] = train['keyword'].apply(lambda x: x.replace('%20', '_') if (isinstance(x, str)) else '')

keywords = train['keyword'].value_counts().to_frame().reset_index()
keywords.columns =['keyword', 'mean_keyword']
train = pd.merge(train, keywords, how='left', on='keyword')

train['mean_keyword'] = train['mean_keyword'].apply(lambda x: x/len(train))
train['text_contains_keyword'] = train.apply(lambda x: is_in_text(x.keyword, x.text), axis=1)
train['keyword'] = train['keyword'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train['text'] = train['text'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)

train = train[['keyword', 'text_contains_keyword', 'mean_keyword', 'text', 'target']]
train

Unnamed: 0,keyword,text_contains_keyword,mean_keyword,text,target
0,0,0,0.008013,69,1
1,0,0,0.008013,38,1
2,0,0,0.008013,133,1
3,0,0,0.008013,65,1
4,0,0,0.008013,88,1
...,...,...,...,...,...
7608,0,0,0.008013,83,1
7609,0,0,0.008013,125,1
7610,0,0,0.008013,65,1
7611,0,0,0.008013,137,1


In [155]:
X, y = train.iloc[:,:-1],train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

xg_regT = xgb.XGBClassifier(eval_metric='auc',n_jobs=4, silent=False)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
preds = [round(value) for value in preds]
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 66.12%


### XGBoost Transformacion3 - Agregando feature 'mean_text_contains_keyword'
* Cantidad de veces que si aparece la keyword en el texto/ la cantidad total que aparece la keyword

In [226]:
train = pd.read_csv('../train.csv')

#cantidad rows 7613
def is_in_text(keyword, text):
    is_in = 1
    arrKeyword = keyword.split("_")
    for key in arrKeyword:
        if not(key.upper() in text.upper() and key != ''):
            is_in = 0
            break
    
    return is_in


train['keyword'] = train['keyword'].apply(lambda x: x.replace('%20', '_') if (isinstance(x, str)) else '')

keywords = train['keyword'].value_counts().to_frame().reset_index()
keywords.columns =['keyword', 'mean_keyword']
train = pd.merge(train, keywords, how='left', on='keyword')

train['mean_keyword'] = train['mean_keyword'].apply(lambda x: x/len(train))
train['text_contains_keyword'] = train.apply(lambda x: is_in_text(x.keyword, x.text), axis=1)

groupbyKeyword = train.groupby(['keyword'])['text_contains_keyword']
text_contains_keyword_sum = groupbyKeyword.sum().reset_index(name='value')
text_contains_keyword_count = groupbyKeyword.count().reset_index(name='value')
mean_text_contains_keyword = pd.DataFrame({'keyword': text_contains_keyword_sum['keyword'], 'mean_text_contains_keyword': text_contains_keyword_sum['value']/text_contains_keyword_count['value']})
train = pd.merge(train, mean_text_contains_keyword, how='left', on='keyword')

train['keyword'] = train['keyword'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)
train['text'] = train['text'].transform(lambda x: len(x) if (isinstance(x, str)) else 0)

train = train[['keyword', 'text_contains_keyword', 'mean_keyword', 'mean_text_contains_keyword', 'text', 'target']]
train

Unnamed: 0,keyword,text_contains_keyword,mean_keyword,mean_text_contains_keyword,text,target
0,0,0,0.008013,0.0,69,1
1,0,0,0.008013,0.0,38,1
2,0,0,0.008013,0.0,133,1
3,0,0,0.008013,0.0,65,1
4,0,0,0.008013,0.0,88,1
...,...,...,...,...,...,...
7608,0,0,0.008013,0.0,83,1
7609,0,0,0.008013,0.0,125,1
7610,0,0,0.008013,0.0,65,1
7611,0,0,0.008013,0.0,137,1


In [227]:
X, y = train.iloc[:,:-1],train.iloc[:,-1]
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

xg_regT = xgb.XGBClassifier(eval_metric='auc',n_jobs=4, silent=False)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
preds = [round(value) for value in preds]
# evaluate predictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 71.04%
