# Random forest & Boosted trees + location check hipotesis

In this notebook I'm going to verify the hypotesis that detailed (cleaned) locations i.e. country/state have correlation with target classification

In [2]:
import re
import string
import numpy as np
import pandas as pd
import scipy as sp
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing
import xgboost as xg
import validators as vld

In [15]:
df_train = pd.read_csv('./train_enriched.csv')
df_train.fillna({'keyword':"", 'country': "", 'state': "", 'city': "", 'location_phrase': ""}, inplace=True)
df_train.shape

(7613, 20)

In [16]:
df_train.sample(n=5)

Unnamed: 0,id,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_length,upper_text_factor,tags_count,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,target
340,340,armageddon,0.119048,Kent,,,,0,Lee does comedy: ÛÏ@LeeJasper: Working class ...,lee comedy ûïleejasper working class tory pre...,91,0.131868,1,0.054945,1,0,12,0.25,0.75,0
2060,2060,dead,0.233333,Afghanistan,Afghanistan,,,0,17 dead as Afghanistan aircraft crashes: An Af...,dead afghanistan aircraft crash afghan militar...,117,0.051282,0,0.076923,0,1,20,0.3,0.65,1
1741,1741,collision,0.74359,Sacramento,United States,California,Sacramento,0,North Sac Elkhorn Blvd / Walerga Rd **Trfc Col...,north sac elkhorn blvd walerga rd trfc collis...,77,0.155844,0,0.142857,0,1,11,0.0,0.909091,1
1051,1051,body%20bags,0.02439,"California, USA",United States,California,,0,Womens Handbags Cross Body Geometric Pattern S...,woman handbag cross body geometric pattern sat...,124,0.169355,0,0.08871,0,2,15,0.0,0.866667,0
4703,4703,landslide,0.424242,The Circle of Life,,,,0,So when you're caught in a landslide\nI'll be ...,caught landslide ill rain give sunshine ill,95,0.042105,0,0.031579,0,0,24,0.625,0.375,0


## Features selection

In [53]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    'urls_count',
    'stop_words_factor',
    'clean_tokens_factor',
    'positive_factor'
]

vc_text = feature_extraction.text.TfidfVectorizer(max_features=2000)

transformer = compose.ColumnTransformer(transformers=[
    ('text_vector', vc_text, 'clean_text'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

X_train = transformer.fit_transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']
# X_train.columns = X_train.columns.astype(str)

X_train.shape, Y_train.shape

X_test = transformer.transform(pd.read_csv('./test_enriched.csv', index_col='id'))
print('X_test shape', X_test.shape)


X_train shape (7613, 2172)
X_test shape (3263, 2172)


In [18]:
vc_text.get_feature_names_out()

NotFittedError: Vocabulary not fitted or provided

## Model selection

### Random forest

In [25]:
params_rf = {
    'n_estimators': [100, 200],
    'max_depth': [100, None]
}
model_rf = ensemble.RandomForestClassifier(n_jobs=2)
grid_rf = model_selection.GridSearchCV(model_rf, params_rf, cv=5, n_jobs=2, scoring='f1', verbose=3)
grid_rf.fit(X_train, Y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 2/5] END ...max_depth=100, n_estimators=100;, score=0.610 total time=   1.4s
[CV 1/5] END ...max_depth=100, n_estimators=100;, score=0.668 total time=   1.4s
[CV 3/5] END ...max_depth=100, n_estimators=100;, score=0.679 total time=   1.3s
[CV 4/5] END ...max_depth=100, n_estimators=100;, score=0.664 total time=   1.4s
[CV 5/5] END ...max_depth=100, n_estimators=100;, score=0.743 total time=   1.8s
[CV 1/5] END ...max_depth=100, n_estimators=200;, score=0.664 total time=   6.2s
[CV 2/5] END ...max_depth=100, n_estimators=200;, score=0.609 total time=   8.7s
[CV 3/5] END ...max_depth=100, n_estimators=200;, score=0.685 total time=   8.9s
[CV 4/5] END ...max_depth=100, n_estimators=200;, score=0.666 total time=   9.9s
[CV 5/5] END ...max_depth=100, n_estimators=200;, score=0.734 total time=  10.0s
[CV 1/5] END ..max_depth=None, n_estimators=100;, score=0.654 total time=   5.5s
[CV 2/5] END ..max_depth=None, n_estimators=100;,

In [26]:
grid_rf.best_params_, grid_rf.best_score_

({'max_depth': 100, 'n_estimators': 100}, 0.6728790564309495)

In [27]:
np.sum(grid_rf.best_estimator_.feature_importances_ > 0.001)

142

In [28]:
Y_predict = grid_rf.best_estimator_.predict(X_train)
df_wrong_predictions = pd.DataFrame({'target': Y_train.to_numpy(), 'predict': Y_predict, 'keyword': df_train['keyword'], 'location': df_train['location'], 'text': df_train['text']}).query('target != predict')

In [29]:
with pd.option_context('display.max_colwidth', 200):
    print(df_wrong_predictions.query('target==1').sample(n=10))

      target  predict     keyword                    location  \
7390       1        0   windstorm                         NaN   
2965       1        0    drowning                         NaN   
2040       1        0      danger            Atlanta Georgia    
5984       1        0     screams  xiumin's nonexistent solos   
7396       1        0   windstorm                 Webster, TX   
2686       1        0  detonation                         NaN   
4182       1        0      hazard                 Alameda, CA   
3924       1        0       flood                    New York   
5943       1        0    screamed    livin in a plastic world   
4232       1        0   hazardous                         NaN   

                                                                                                                                             text  
7390                                                                      @blakeshelton DON'T be a FART ??in a WINDSTORM.FOLLOW ME ALREA

#### Generate output

In [54]:
Y_test_predict = grid_rf.best_estimator_.predict(X_test)

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv('./b-forest-2-submission.csv', index=False)

### Boosted trees

In [57]:
params_xgb = {
    'learning_rate': [0.2, 0.3, 0.4],
    'n_estimators': [100, 130],
    'max_depth': [50, 70]
}
model_xgb = xg.XGBClassifier()
grid_xgb = model_selection.GridSearchCV(model_xgb, params_xgb, cv=5, n_jobs=-1, scoring='f1', verbose=3)
grid_xgb.fit(X_train, Y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 4/5] END learning_rate=0.2, max_depth=50, n_estimators=100;, score=0.579 total time=  36.5s
[CV 3/5] END learning_rate=0.2, max_depth=50, n_estimators=100;, score=0.625 total time=  39.6s
[CV 1/5] END learning_rate=0.2, max_depth=50, n_estimators=100;, score=0.671 total time=  40.1s
[CV 5/5] END learning_rate=0.2, max_depth=50, n_estimators=100;, score=0.716 total time=  40.4s
[CV 2/5] END learning_rate=0.2, max_depth=50, n_estimators=100;, score=0.518 total time=  40.7s
[CV 2/5] END learning_rate=0.2, max_depth=50, n_estimators=130;, score=0.504 total time=  43.0s
[CV 1/5] END learning_rate=0.2, max_depth=50, n_estimators=130;, score=0.675 total time=  49.0s
[CV 3/5] END learning_rate=0.2, max_depth=50, n_estimators=130;, score=0.618 total time=  49.3s
[CV 4/5] END learning_rate=0.2, max_depth=50, n_estimators=130;, score=0.574 total time=  47.1s
[CV 5/5] END learning_rate=0.2, max_depth=50, n_estimators=130;, score=0.71

In [58]:
results = pd.DataFrame(grid_xgb.cv_results_).sort_values('rank_test_score')[0:5].T
results

Unnamed: 0,4,0,1,5,8
mean_fit_time,35.286061,39.326933,46.869601,43.259378,32.593399
std_fit_time,0.815371,1.520437,2.234828,1.039958,1.429727
mean_score_time,0.112969,0.117994,0.187901,0.157442,0.1134
std_score_time,0.007692,0.007352,0.045118,0.006508,0.006727
param_learning_rate,0.3,0.2,0.2,0.3,0.4
param_max_depth,50,50,50,50,50
param_n_estimators,100,100,130,130,100
params,"{'learning_rate': 0.3, 'max_depth': 50, 'n_est...","{'learning_rate': 0.2, 'max_depth': 50, 'n_est...","{'learning_rate': 0.2, 'max_depth': 50, 'n_est...","{'learning_rate': 0.3, 'max_depth': 50, 'n_est...","{'learning_rate': 0.4, 'max_depth': 50, 'n_est..."
split0_test_score,0.67069,0.670659,0.674658,0.662609,0.670679
split1_test_score,0.521441,0.518135,0.503951,0.503067,0.496014


In [59]:
grid_xgb.best_params_, grid_xgb.best_score_

({'learning_rate': 0.3, 'max_depth': 50, 'n_estimators': 100},
 0.6229705173357942)