# Random forest & Boosted trees + location check hipotesis

In this notebook I'm going to verify the hypotesis that detailed (cleaned) locations i.e. country/state have correlation with target classification

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import feature_extraction, ensemble, model_selection, pipeline, compose, preprocessing
from sklearn.experimental import enable_halving_search_cv
import xgboost as xg


In [3]:
df_train = pd.read_csv('./train_enriched.csv')
df_train.fillna({'keyword':"", 'country': "", 'state': "", 'city': "", 'location_phrase': ""}, inplace=True)
df_train.shape

(7613, 20)

In [4]:
df_train.sample(n=5)

Unnamed: 0,id,keyword,positive_factor,location,country,state,city,missing_location,text,clean_text,text_length,upper_text_factor,tags_count,punct_factor,ann_count,urls_count,tokens_count,stop_words_factor,clean_tokens_factor,target
1018,1018,body%20bags,0.02439,"Austin, Texas",United States,Texas,Austin,0,@FoxNews @JenGriffinFNC When you call to repor...,foxnews jengriffinfnc call report dangerous ac...,88,0.090909,0,0.034091,2,0,17,0.294118,0.705882,1
6343,6343,structural%20failure,0.657143,Indonesia,Indonesia,,,0,Investigators have said a Virgin Galactic spac...,investigator said virgin galactic spaceship cr...,119,0.07563,0,0.07563,0,1,18,0.333333,0.611111,1
5428,5428,police,0.567568,"Kansas City, Mo.",United States,Missouri,Kansas City,0,Police: Gunman reported dead at Nashville area...,police gunman reported dead nashville area the...,116,0.077586,0,0.086207,0,1,21,0.333333,0.619048,1
2991,2991,dust%20storm,0.666667,Idaho,,Idaho,,0,@NWSPocatello BG-16: So far brunt of storm jus...,nwspocatello bg16 far brunt storm north grayed...,109,0.091743,0,0.082569,1,0,26,0.269231,0.730769,0
693,693,blazing,0.029412,Lima-Peru,,,,0,Oh my heart racing And my temperature is blazi...,oh heart racing temperature blazing roof video...,81,0.111111,1,0.012346,0,0,15,0.333333,0.666667,0


## Features selection

In [5]:
categorical_features = [
    'country',
    'state',
]
numerical_features = [
    'text_length', 
    'urls_count',
    'stop_words_factor',
    'clean_tokens_factor',
    'positive_factor'
]

vc_text = feature_extraction.text.TfidfVectorizer(max_features=2000)

transformer = compose.ColumnTransformer(transformers=[
    ('text_vector', vc_text, 'clean_text'),
    ('one_hot', preprocessing.OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('numerical', preprocessing.StandardScaler(), numerical_features)
], remainder='drop')

X_train = transformer.fit_transform(df_train)
print('X_train shape', X_train.shape)

Y_train = df_train['target']
# X_train.columns = X_train.columns.astype(str)

X_train.shape, Y_train.shape

X_test = transformer.transform(pd.read_csv('./test_enriched.csv', index_col='id'))
print('X_test shape', X_test.shape)


X_train shape (7613, 2172)
X_test shape (3263, 2172)


In [6]:
vc_text.get_feature_names_out()

NotFittedError: Vocabulary not fitted or provided

## Model selection

### Random forest

In [7]:
params_rf = {
    'n_estimators': [100, 200],
    'max_depth': [100, 200],
    'ccp_alpha': np.geomspace(0.005, 0.00001, 10)
}
model_rf = ensemble.RandomForestClassifier(n_jobs=-1)


In [8]:
halving_rf = model_selection.HalvingGridSearchCV(estimator=model_rf, param_grid=params_rf, cv=5, n_jobs=2, scoring='f1', verbose=3)
halving_rf.fit(X_train, Y_train)
print('Best params', halving_rf.best_params_, halving_rf.best_score_)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 281
max_resources_: 7613
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 40
n_resources: 281
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[CV 1/5] END ccp_alpha=0.005, max_depth=100, n_estimators=100;, score=(train=0.948, test=0.214) total time=   0.4s
[CV 2/5] END ccp_alpha=0.005, max_depth=100, n_estimators=100;, score=(train=0.925, test=0.529) total time=   0.4s
[CV 4/5] END ccp_alpha=0.005, max_depth=100, n_estimators=100;, score=(train=0.968, test=0.667) total time=   0.4s
[CV 3/5] END ccp_alpha=0.005, max_depth=100, n_estimators=100;, score=(train=0.962, test=0.600) total time=   0.4s
[CV 5/5] END ccp_alpha=0.005, max_depth=100, n_estimators=100;, score=(train=0.980, test=0.679) total time=   0.3s
[CV 1/5] END ccp_alpha=0.005, max_depth=100, n_estimators=200;, score=(train=0.960, test=0.214) total time=   0.7s
[CV 2/5] END ccp_alpha=0.005, max_depth=100, n_estimators=200;, score=(train=0.951, test=0.438) total time=   0.7s
[CV 3/5] END ccp_alpha=0.005, max_depth=100, n_estimators=200;, score=(train=0.973, test=0.526) total time=   0.7s
[CV 4/5] END ccp_alpha=0.005, max_depth=100, n_estimators=200;, score=(train=0.9

In [11]:
Y_predict = halving_rf.best_estimator_.predict(X_train)
df_wrong_predictions = pd.DataFrame({'target': Y_train.to_numpy(), 'predict': Y_predict, 'keyword': df_train['keyword'], 'location': df_train['location'], 'text': df_train['text']}).query('target != predict')


In [13]:
with pd.option_context('display.max_colwidth', 200):
    print(df_wrong_predictions.query('target==1').sample(n=10))

      target  predict         keyword                   location  \
4171       1        0            harm                         va   
3134       1        0       emergency                        NaN   
548        1        0          battle                        NaN   
2554       1        0         destroy            Jerseyville, IL   
6831       1        0         trapped                Puerto Rico   
2342       1        0      demolition                        NaN   
988        1        0  body%20bagging  ÌÏT: 39.982988,-75.261624   
4770       1        0       lightning                        NaN   
7091       1        0        upheaval                Connecticut   
4311       1        0        hellfire           Denver, Colorado   

                                                                                                                                                 text  
4171                     @malistkiss Sunnis continue to believe they are more righteous and they co

#### Generate output

In [14]:
Y_test_predict = halving_rf.best_estimator_.predict(X_test)

df_example = pd.read_csv('./sample_submission.csv')
df_example['target'] = Y_test_predict

df_example.to_csv('./b-forest-3-submission.csv', index=False)

### Boosted trees

In [15]:
params_xgb = {
    'learning_rate': np.geomspace(0.01, 0.3, 5),
    'n_estimators': [100, 200],
    'max_depth': [50, 70]
}
model_xgb = xg.XGBClassifier()
xgb_selection = model_selection.HalvingGridSearchCV(model_xgb, params_xgb, cv=5, n_jobs=-1, scoring='f1', verbose=3)
xgb_selection.fit(X_train, Y_train)
print('XGB best model', xgb_selection.best_params_, xgb_selection.best_score_)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 845
max_resources_: 7613
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 20
n_resources: 845
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 4/5] END learning_rate=0.01, max_depth=50, n_estimators=100;, score=(train=0.908, test=0.619) total time=   0.5s
[CV 2/5] END learning_rate=0.01, max_depth=50, n_estimators=100;, score=(train=0.912, test=0.579) total time=   0.6s
[CV 1/5] END learning_rate=0.01, max_depth=50, n_estimators=100;, score=(train=0.897, test=0.574) total time=   0.6s
[CV 5/5] END learning_rate=0.01, max_depth=50, n_estimators=100;, score=(train=0.883, test=0.730) total time=   0.7s
[CV 3/5] END learning_rate=0.01, max_depth=50, n_estimators=100;, score=(train=0.901, test=0.615) total time=   0.7s
[CV 1/5] END learning_rate=0.01, max_depth=50, n_estimators=200;, score=(train=0.930, test=0.589) total time=   1.1s
[CV 1/5] END learning_rate=0.01, max

In [58]:
results = pd.DataFrame(xgb_selection.cv_results_).sort_values('rank_test_score')[0:5].T
results

Unnamed: 0,4,0,1,5,8
mean_fit_time,35.286061,39.326933,46.869601,43.259378,32.593399
std_fit_time,0.815371,1.520437,2.234828,1.039958,1.429727
mean_score_time,0.112969,0.117994,0.187901,0.157442,0.1134
std_score_time,0.007692,0.007352,0.045118,0.006508,0.006727
param_learning_rate,0.3,0.2,0.2,0.3,0.4
param_max_depth,50,50,50,50,50
param_n_estimators,100,100,130,130,100
params,"{'learning_rate': 0.3, 'max_depth': 50, 'n_est...","{'learning_rate': 0.2, 'max_depth': 50, 'n_est...","{'learning_rate': 0.2, 'max_depth': 50, 'n_est...","{'learning_rate': 0.3, 'max_depth': 50, 'n_est...","{'learning_rate': 0.4, 'max_depth': 50, 'n_est..."
split0_test_score,0.67069,0.670659,0.674658,0.662609,0.670679
split1_test_score,0.521441,0.518135,0.503951,0.503067,0.496014


({'learning_rate': 0.3, 'max_depth': 50, 'n_estimators': 100},
 0.6229705173357942)