<h1><b>[VIII - XGBOOST_13]</b></h1>

XGBoost, with :

- learning_rate=0.01,  
- colsample_bytree = 0.3,
- subsample = 1.0,
- n_estimators=2550, 
- min_child_weight=29,
- max_depth=13, 
- gamma=0.003177450944562012,
- reg_lambda=0.3736917401344113, #L2 
- reg_alpha=0.01345834681286108, #L1

The predictions are saved in `data/XGBOOST_13.csv` and the class probabilities in `data/PROBS_XGBOOST_13.npy`.

Note that the original `data/PROBS_XGBOOST_13.npy` is already available.

_________________

# 1) Initialisation

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split

In [3]:
# Read the prepared training and test tables from the data/ directory.
train_df, test_df = map(
    pd.read_csv,
    ("data/train_df_FINAL.csv", "data/test_df_FINAL.csv"),
)

We drop the training rows whose index lies between 195453 and 197302 (inclusive), which contain anomalous data.

In [4]:
# Keep every row whose index label lies outside the closed range 195453..197302.
row_labels = train_df.index
rows_to_keep = (row_labels < 195453) | (row_labels > 197302)
train_df = train_df.loc[rows_to_keep]

We replace NaN values in the training set with the most frequent value (mode) of the same column among the rows of the same `change_type` class.

In [5]:
# Class-conditional mode imputation of the training table: every missing value is
# replaced by the most frequent value of its column among rows of the same change_type.
# NOTE(review): this uses the label to fill training features, while the test set is
# imputed differently later on — confirm this asymmetry is intended.
change_types = train_df["change_type"].unique()
for change_type in change_types:
    # The class mask does not depend on the column, so compute it once per class.
    class_mask = train_df["change_type"] == change_type
    for column in train_df.columns:
        if column == "geometry":
            continue
        missing_mask = class_mask & train_df[column].isnull()
        if not missing_mask.any():
            # Nothing to fill for this (class, column) pair.
            continue
        class_modes = train_df.loc[class_mask, column].mode()
        if class_modes.empty:
            # Every value of this column is missing within the class, so there is no
            # mode to impute from; leave the NaNs in place instead of raising.
            continue
        train_df.loc[missing_mask, column] = class_modes.iloc[0]

In [7]:
# Print, on a single line, the quoted and comma-separated name of every column
# whose dtype is not object, datetime64[ns] or geometry.
for column, typ in train_df.dtypes.items():
    if typ not in ['object', 'datetime64[ns]', 'geometry']:
        print(f"'{column}', ", end="")

'change_type', 'index', 'urban_type_N,A', 'urban_type_UrbanSlum', 'urban_type_Rural', 'urban_type_DenseUrban', 'urban_type_Industrial', 'urban_type_SparseUrban', 'urban_type_Dense Urban,Industrial', 'urban_type_Dense Urban,Urban Slum', 'urban_type_Sparse Urban,Industrial', 'urban_type_Sparse Urban,Urban Slum', 'geography_type_N,A', 'geography_type_Desert', 'geography_type_Farms', 'geography_type_DenseForest', 'geography_type_Hills', 'geography_type_River', 'geography_type_GrassLand', 'geography_type_Snow', 'geography_type_Lakes', 'geography_type_BarrenLand', 'geography_type_Coastal', 'geography_type_SparseForest', 'geography_type_Sparse Forest,Grass Land', 'geography_type_Sparse Forest,Farms', 'geography_type_Sparse Forest,Dense Forest,Grass Land', 'geography_type_Barren Land,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest', 'geography_type_Sparse Forest,Grass Land,Lakes', 'geography_type_Sparse Forest,Farms,Lakes', 'geography_type_River,Sparse Forest,Grass Land', 'geography

In [8]:
# Names of the columns selected as model inputs, grouped by family (one family per line).
# The same list is used to slice both train_df and test_df, so its order defines the
# column order of the feature matrices.
# Lines that are commented out list columns that are currently excluded from the model.
feature_names=[ 'urban_type_N,A', 'urban_type_UrbanSlum', 'urban_type_Rural', 'urban_type_DenseUrban', 'urban_type_Industrial', 'urban_type_SparseUrban', 
                #'urban_type_Dense Urban,Industrial', 'urban_type_Dense Urban,Urban Slum', 'urban_type_Sparse Urban,Industrial', 'urban_type_Sparse Urban,Urban Slum', 
               
                'geography_type_N,A', 'geography_type_Desert', 'geography_type_Farms', 'geography_type_DenseForest', 'geography_type_Hills', 'geography_type_River', 'geography_type_GrassLand', 'geography_type_Snow', 'geography_type_Lakes', 'geography_type_BarrenLand', 'geography_type_Coastal', 'geography_type_SparseForest', 
                #'geography_type_Sparse Forest,Grass Land', 'geography_type_Sparse Forest,Farms', 'geography_type_Sparse Forest,Dense Forest,Grass Land', 'geography_type_Barren Land,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest', 'geography_type_Sparse Forest,Grass Land,Lakes', 'geography_type_Sparse Forest,Farms,Lakes', 'geography_type_River,Sparse Forest,Grass Land', 'geography_type_River,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest,Grass Land,Lakes', 'geography_type_Barren Land,Sparse Forest,Grass Land', 'geography_type_Dense Forest,Grass Land', 'geography_type_River,Sparse Forest,Farms', 
               
                #'geourban_geography_type_Desert_urban_type_Industrial', 'geourban_geography_type_Farms_urban_type_Rural', 'geourban_geography_type_Farms_urban_type_DenseUrban', 'geourban_geography_type_Farms_urban_type_Industrial', 'geourban_geography_type_Farms_urban_type_SparseUrban', 'geourban_geography_type_DenseForest_urban_type_UrbanSlum', 'geourban_geography_type_DenseForest_urban_type_Rural', 'geourban_geography_type_DenseForest_urban_type_DenseUrban', 'geourban_geography_type_DenseForest_urban_type_Industrial', 'geourban_geography_type_DenseForest_urban_type_SparseUrban', 'geourban_geography_type_River_urban_type_Rural', 'geourban_geography_type_River_urban_type_DenseUrban', 'geourban_geography_type_River_urban_type_Industrial', 'geourban_geography_type_River_urban_type_SparseUrban', 'geourban_geography_type_GrassLand_urban_type_Rural', 'geourban_geography_type_GrassLand_urban_type_DenseUrban', 'geourban_geography_type_GrassLand_urban_type_Industrial', 'geourban_geography_type_GrassLand_urban_type_SparseUrban', 'geourban_geography_type_Lakes_urban_type_Rural', 'geourban_geography_type_Lakes_urban_type_DenseUrban', 'geourban_geography_type_Lakes_urban_type_Industrial', 'geourban_geography_type_Lakes_urban_type_SparseUrban', 'geourban_geography_type_BarrenLand_urban_type_Rural', 'geourban_geography_type_BarrenLand_urban_type_DenseUrban', 'geourban_geography_type_BarrenLand_urban_type_Industrial', 'geourban_geography_type_BarrenLand_urban_type_SparseUrban', 'geourban_geography_type_Coastal_urban_type_SparseUrban', 'geourban_geography_type_SparseForest_urban_type_UrbanSlum', 'geourban_geography_type_SparseForest_urban_type_Rural', 'geourban_geography_type_SparseForest_urban_type_DenseUrban', 'geourban_geography_type_SparseForest_urban_type_Industrial', 'geourban_geography_type_SparseForest_urban_type_SparseUrban', 
               
                'img_red_mean_date0', 'img_red_mean_date1', 'img_red_mean_date2', 'img_red_mean_date3', 'img_red_mean_date4', 
                'img_green_mean_date0', 'img_green_mean_date1', 'img_green_mean_date2', 'img_green_mean_date3', 'img_green_mean_date4', 
                'img_blue_mean_date0', 'img_blue_mean_date1', 'img_blue_mean_date2', 'img_blue_mean_date3', 'img_blue_mean_date4', 
               
                'img_red_std_date0', 'img_red_std_date1', 'img_red_std_date2', 'img_red_std_date3', 'img_red_std_date4', 
                'img_green_std_date0', 'img_green_std_date1', 'img_green_std_date2', 'img_green_std_date3', 'img_green_std_date4', 
                'img_blue_std_date0', 'img_blue_std_date1', 'img_blue_std_date2', 'img_blue_std_date3', 'img_blue_std_date4', 
               
                'change_status_date0_Construction Done', 'change_status_date0_Construction Midway', 'change_status_date0_Construction Started', 'change_status_date0_Excavation', 'change_status_date0_Greenland', 'change_status_date0_Land Cleared', 'change_status_date0_Materials Dumped', 'change_status_date0_Materials Introduced', 'change_status_date0_Operational', 'change_status_date0_Prior Construction', 
                'change_status_date1_Construction Done', 'change_status_date1_Construction Midway', 'change_status_date1_Construction Started', 'change_status_date1_Excavation', 'change_status_date1_Greenland', 'change_status_date1_Land Cleared', 'change_status_date1_Materials Dumped', 'change_status_date1_Materials Introduced', 'change_status_date1_Operational', 'change_status_date1_Prior Construction', 
                'change_status_date2_Construction Done', 'change_status_date2_Construction Midway', 'change_status_date2_Construction Started', 'change_status_date2_Excavation', 'change_status_date2_Greenland', 'change_status_date2_Land Cleared', 'change_status_date2_Materials Dumped', 'change_status_date2_Materials Introduced', 'change_status_date2_Operational', 'change_status_date2_Prior Construction', 
                'change_status_date3_Construction Done', 'change_status_date3_Construction Midway', 'change_status_date3_Construction Started', 'change_status_date3_Excavation', 'change_status_date3_Greenland', 'change_status_date3_Land Cleared', 'change_status_date3_Materials Dumped', 'change_status_date3_Materials Introduced', 'change_status_date3_Operational', 'change_status_date3_Prior Construction', 
                'change_status_date4_Construction Done', 'change_status_date4_Construction Midway', 'change_status_date4_Construction Started', 'change_status_date4_Excavation', 'change_status_date4_Greenland', 'change_status_date4_Land Cleared', 'change_status_date4_Materials Dumped', 'change_status_date4_Materials Introduced', 'change_status_date4_Operational', 'change_status_date4_Prior Construction', 
               
                'change_status_Prior Construction', 'change_status_Greenland', 'change_status_Land Cleared', 'change_status_Excavation', 'change_status_Materials Dumped', 'change_status_Materials Introduced', 'change_status_Construction Started', 'change_status_Construction Midway', 'change_status_Construction Done', 'change_status_Operational', 
                'change_status_date0_encoded', 'change_status_date1_encoded', 'change_status_date2_encoded', 'change_status_date3_encoded', 'change_status_date4_encoded', 
                'change_status_date_4-0', 'change_status_date_1-0', 'change_status_date_2-1', 'change_status_date_3-2', 'change_status_date_4-3', 
               
                'img_mean_date0', 'img_std_date0', 'img_mean_date1', 'img_std_date1', 'img_mean_date2', 'img_std_date2', 'img_mean_date3', 'img_std_date3', 'img_mean_date4', 'img_std_date4', 
                'img_mean_date_4-0', 'img_mean_date_1-0', 'img_mean_date_2-1', 'img_mean_date_3-2', 'img_mean_date_4-3', 'img_std_date_4-0', 'img_std_date_1-0', 'img_std_date_2-1', 'img_std_date_3-2', 'img_std_date_4-3', 
               
                'img_red_mean_date_4-0', 'img_red_mean_date_1-0', 'img_red_mean_date_2-1', 'img_red_mean_date_3-2', 'img_red_mean_date_4-3', 'img_green_mean_date_4-0', 'img_green_mean_date_1-0', 'img_green_mean_date_2-1', 'img_green_mean_date_3-2', 'img_green_mean_date_4-3', 'img_blue_mean_date_4-0', 'img_blue_mean_date_1-0', 'img_blue_mean_date_2-1', 'img_blue_mean_date_3-2', 'img_blue_mean_date_4-3', 'img_red_std_date_4-0', 'img_red_std_date_1-0', 'img_red_std_date_2-1', 'img_red_std_date_3-2', 'img_red_std_date_4-3', 'img_green_std_date_4-0', 'img_green_std_date_1-0', 'img_green_std_date_2-1', 'img_green_std_date_3-2', 'img_green_std_date_4-3', 'img_blue_std_date_4-0', 'img_blue_std_date_1-0', 'img_blue_std_date_2-1', 'img_blue_std_date_3-2', 'img_blue_std_date_4-3', 
                
                'date_1-0', 'date_2-1', 'date_3-2', 'date_4-3', 'date_4-0', 'date_0-0min', 
                'img_mean_date_4-0/Date', 'img_mean_date_1-0/Date', 'img_mean_date_2-1/Date', 'img_mean_date_3-2/Date', 'img_mean_date_4-3/Date', 'img_std_date_4-0/Date', 'img_std_date_1-0/Date', 'img_std_date_2-1/Date', 'img_std_date_3-2/Date', 'img_std_date_4-3/Date', 'img_red_mean_date_4-0/Date', 'img_red_mean_date_1-0/Date', 'img_red_mean_date_2-1/Date', 'img_red_mean_date_3-2/Date', 'img_red_mean_date_4-3/Date', 'img_green_mean_date_4-0/Date', 'img_green_mean_date_1-0/Date', 'img_green_mean_date_2-1/Date', 'img_green_mean_date_3-2/Date', 'img_green_mean_date_4-3/Date', 'img_blue_mean_date_4-0/Date', 'img_blue_mean_date_1-0/Date', 'img_blue_mean_date_2-1/Date', 'img_blue_mean_date_3-2/Date', 'img_blue_mean_date_4-3/Date', 'img_red_std_date_4-0/Date', 'img_red_std_date_1-0/Date', 'img_red_std_date_2-1/Date', 'img_red_std_date_3-2/Date', 'img_red_std_date_4-3/Date', 'img_green_std_date_4-0/Date', 'img_green_std_date_1-0/Date', 'img_green_std_date_2-1/Date', 'img_green_std_date_3-2/Date', 'img_green_std_date_4-3/Date', 'img_blue_std_date_4-0/Date', 'img_blue_std_date_1-0/Date', 'img_blue_std_date_2-1/Date', 'img_blue_std_date_3-2/Date', 'img_blue_std_date_4-3/Date', 
                'change_status_date_4-0/Date', 'change_status_date_1-0/Date', 'change_status_date_2-1/Date', 'change_status_date_3-2/Date', 'change_status_date_4-3/Date', 
               
                'geometry_area', 'geometry_perimeter', 'geometry_inscribed_circle_radius', 'geometry_compactness', 'geometry_convexity', 'geometry_vertices', 'geometry_bboxwidth', 'geometry_bboxheight', 'geometry_bboxratio', 'geometry_bboxarea', 'geometry_bboxperimeter', 
                #'geometry_aspectratio', 
                'geometry_diameter', 'geometry_minumum_bounding_circle', 'geometry_minimum_rotated_rectangle', 
               
                'neighb_density', 'neighb_nb', 'neighb_similar', 
                'neighb_maxdiff_change_status_date0_0.001', 'neighb_maxdiff_change_status_date1_0.001', 'neighb_maxdiff_change_status_date2_0.001', 'neighb_maxdiff_change_status_date3_0.001', 'neighb_maxdiff_change_status_date4_0.001', 
                'neighb_maxdiff_change_status_date0_0.003', 'neighb_maxdiff_change_status_date1_0.003', 'neighb_maxdiff_change_status_date2_0.003', 'neighb_maxdiff_change_status_date3_0.003', 'neighb_maxdiff_change_status_date4_0.003', 
                'neighb_maxdiff_meancolor_0', 'neighb_maxdiff_meancolor_1', 'neighb_maxdiff_meancolor_2', 'neighb_maxdiff_meancolor_3', 'neighb_maxdiff_meancolor_4', 
                'neighb_maxdiff_stdcolor_0', 'neighb_maxdiff_stdcolor_1', 'neighb_maxdiff_stdcolor_2', 'neighb_maxdiff_stdcolor_3', 'neighb_maxdiff_stdcolor_4', 
               
                'neighb_meandiff_changestatus_0', 'neighb_meandiff_changestatus_1', 'neighb_meandiff_changestatus_2', 'neighb_meandiff_changestatus_3', 'neighb_meandiff_changestatus_4', 
                'neighb_meandiff_meancolor_0', 'neighb_meandiff_meancolor_1', 'neighb_meandiff_meancolor_2', 'neighb_meandiff_meancolor_3', 'neighb_meandiff_meancolor_4', 
                'neighb_meandiff_stdcolor_0', 'neighb_meandiff_stdcolor_1', 'neighb_meandiff_stdcolor_2', 'neighb_meandiff_stdcolor_3', 'neighb_meandiff_stdcolor_4'
            ]

In [9]:
# Convert the selected feature columns and the target column to NumPy arrays.
Y = train_df.loc[:, 'change_type'].to_numpy()
X = train_df.loc[:, feature_names].to_numpy()
X_test = test_df.loc[:, feature_names].to_numpy()

In [10]:
# Sanity check: shapes of the training features, training labels and test features.
print(*(array.shape for array in (X, Y, X_test)))

(294296, 271) (294296,) (120526, 271)


We impute NaN values in the test data with the per-feature median, computed over the training and test features combined.

In [11]:

from sklearn.impute import SimpleImputer

# Median imputation of the test features. Despite its name, `imp_mean` uses the
# 'median' strategy. Its statistics are fitted on the training and test rows stacked
# together, and only X_test is transformed.
# NOTE(review): fitting on test rows lets test statistics influence preprocessing;
# confirm this is intended.
imp_mean = SimpleImputer(strategy='median', missing_values=np.nan).fit(
    np.concatenate((X, X_test), axis=0)
)
X_test = imp_mean.transform(X_test)


Train/validation split: 20% of the training data is held out for validation, stratified by class.

In [12]:
# Hold out 20% of the rows for validation; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=65, stratify=Y)
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, **split_options)

In [13]:
# Class distribution of the training split: the distinct labels and the number of rows for each.
np.unique(Y_train, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([ 25141,  11424, 117708,  79988,   1058,    117]))

# 2) Oversample

In [14]:
# Disabled resampling experiment, kept as an inert string literal (none of it executes):
# random under-sampling of class 2 down to 70000 rows, followed by SMOTENC over-sampling
# of classes 4 and 5 up to 10000 rows each, treating columns with fewer than 300
# distinct values as categorical. Being the last expression of the cell, the string is
# simply echoed as the cell output.
"""
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42, sampling_strategy={2:70000})
X_under, Y_under = rus.fit_resample(X_train, Y_train)
np.unique(Y_under, return_counts=True)

## TEST FINALL
#X_under, Y_under = X, Y


categorical_columns = []
for iColumn in range(len(X_train[0])):
    if len(np.unique(X_train[:,iColumn])) < 300:
        #print(feature_names[iColumn], len(np.unique(X_train[:,iColumn])))
        categorical_columns.append(iColumn)
#categorical_columns

from imblearn.over_sampling import SMOTE, SMOTENC

#sm = SMOTE(random_state=42, sampling_strategy={5:30000})
sm = SMOTENC(random_state=42, sampling_strategy={4:10000, 5:10000}, categorical_features=categorical_columns)
X_over, Y_over = sm.fit_resample(X_under, Y_under)
#X_over, Y_over = sm.fit_resample(X, Y)

#X_over, Y_over = X_under, Y_under

#from imblearn.over_sampling import RandomOverSampler
#ros = RandomOverSampler(random_state=0, sampling_strategy={5:50000})

#X_over, Y_over = ros.fit_resample(X_under, Y_under)

#pd.DataFrame(X_under[Y_under == 5][119:]).head(n=5)
#pd.DataFrame(X_over[Y_over == 5][119:]).head(n=5)

"""

'\nfrom imblearn.under_sampling import RandomUnderSampler\n\nrus = RandomUnderSampler(random_state=42, sampling_strategy={2:70000})\nX_under, Y_under = rus.fit_resample(X_train, Y_train)\nnp.unique(Y_under, return_counts=True)\n\n## TEST FINALL\n#X_under, Y_under = X, Y\n\n\ncategorical_columns = []\nfor iColumn in range(len(X_train[0])):\n    if len(np.unique(X_train[:,iColumn])) < 300:\n        #print(feature_names[iColumn], len(np.unique(X_train[:,iColumn])))\n        categorical_columns.append(iColumn)\n#categorical_columns\n\nfrom imblearn.over_sampling import SMOTE, SMOTENC\n\n#sm = SMOTE(random_state=42, sampling_strategy={5:30000})\nsm = SMOTENC(random_state=42, sampling_strategy={4:10000, 5:10000}, categorical_features=categorical_columns)\nX_over, Y_over = sm.fit_resample(X_under, Y_under)\n#X_over, Y_over = sm.fit_resample(X, Y)\n\n#X_over, Y_over = X_under, Y_under\n\n#from imblearn.over_sampling import RandomOverSampler\n#ros = RandomOverSampler(random_state=0, sampling_st

In [15]:
# No resampling is applied: the "over-sampled" names simply refer to the training split.
X_over = X_train
Y_over = Y_train

In [16]:
# Class distribution of the labels bound to Y_over: the distinct labels and the number of rows for each.
np.unique(Y_over, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([ 25141,  11424, 117708,  79988,   1058,    117]))

In [17]:
# Disabled experiment, kept as an inert string literal (none of it executes):
# per-sample weights from compute_sample_weight with class_weight="balanced".
"""
classes_weights = compute_sample_weight(class_weight="balanced", y=Y_over)
np.unique(classes_weights)
"""

'\nclasses_weights = compute_sample_weight(class_weight="balanced", y=Y_over)\nnp.unique(classes_weights)\n'

# 3) Training

In [18]:
def printScores(y_true, y_predict):
    """Print the micro, macro and weighted F1 scores, then the confusion matrix.

    Args:
        y_true: reference labels, passed as-is to the metric functions.
        y_predict: predicted labels for the same samples.

    Returns:
        None. The four results are written to standard output.
    """
    report_rows = (
        ("F1 score micro", 'micro'),
        ("F1 score macro", 'macro'),
        ("F1 score weighted", 'weighted'),
    )
    # Every metric is computed before the first line is printed.
    scored_rows = [
        (label, f1_score(y_true, y_predict, average=averaging))
        for label, averaging in report_rows
    ]
    matrix = confusion_matrix(y_true, y_predict)

    for label, value in scored_rows:
        print(label, value)
    print(matrix)

In [19]:
# Six-class XGBoost classifier configured with the hyper-parameters listed at the top of
# the notebook.
# NOTE(review): both `seed=42` and `random_state=2020` are passed; which one takes
# effect depends on the installed xgboost version — confirm and keep only one.
# NOTE(review): `silent` and `tree_method='gpu_hist'` are deprecated in recent xgboost
# releases — check against the installed version.
model = xgb.XGBClassifier(silent=False, 
                      num_class=6,
                      objective='multi:softmax', 
                      seed=42,
                      scale_pos_weight=1,
                      learning_rate=0.01,  
                      colsample_bytree = 0.3,
                      subsample = 1.0,
                      n_estimators=2550, 
                      #reg_alpha = 0.3,
                      min_child_weight=29,
                      max_depth=13, 
                      gamma=0.003177450944562012,
                      n_jobs=-1,
                      reg_lambda=0.3736917401344113, # L2 regularization term on weights
                      reg_alpha=0.01345834681286108, # L1 regularization term on weights
                      tree_method='gpu_hist',
                      random_state=2020
                    )

In [20]:
# The string literal below is a disabled alternative (none of it executes): fitting on
# X_over / Y_over with an evaluation set and early stopping after 20 rounds.
"""
eval_set = [(X_over, Y_over), (X_valid, Y_valid)]
eval_metric = ["merror", "auc"]
#model.fit(X_over, Y_over, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=20, verbose=True)#,sample_weight=compute_sample_weight(class_weight="balanced", y=Y_over))

model.fit(X_over, Y_over, eval_set=eval_set, early_stopping_rounds=20, verbose=True)#,sample_weight=compute_sample_weight(class_weight="balanced", y=Y_over))
"""
# Active training call: fits on the full arrays X, Y, i.e. on the rows of the
# training split AND the rows held out as X_valid / Y_valid.
model.fit(X, Y, verbose=True)

Plotting curves

In [21]:
# Disabled plotting code, kept as an inert string literal (none of it executes):
# train/test AUC curves read from model.evals_result().
# NOTE(review): the active fit call passes no eval_set, so this would presumably need
# the eval_set-based fit to be re-enabled first — confirm before reviving.
"""
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['auc'], label='Train')
ax.plot(x_axis, results['validation_1']['auc'], label='Test')
ax.legend()
plt.ylabel('AUC')
plt.title('XGBoost AUC')
plt.show()
"""

"\nresults = model.evals_result()\nepochs = len(results['validation_0']['merror'])\nx_axis = range(0, epochs)\n\nfig, ax = plt.subplots()\nax.plot(x_axis, results['validation_0']['auc'], label='Train')\nax.plot(x_axis, results['validation_1']['auc'], label='Test')\nax.legend()\nplt.ylabel('AUC')\nplt.title('XGBoost AUC')\nplt.show()\n"

In [22]:
# Persist the fitted model to "model.json" (path relative to the working directory).
model.save_model("model.json")

Training score

In [23]:
# Disabled code, kept as an inert string literal (none of it executes):
# scores of the model on the arrays it was meant to be trained on (X_over / Y_over).
"""
pred = model.predict(X_over)
printScores(Y_over, pred)
"""

'\npred = model.predict(X_over)\nprintScores(Y_over, pred)\n'

Validation score

In [24]:

# Score the model on the validation split.
# NOTE(review): the active training cell fits on the full X, Y, which contain these
# validation rows, so the figures printed here are not a held-out estimate.
pred = model.predict(X_valid)
printScores(Y_valid, pred)


F1 score micro 0.9444274549779137
F1 score macro 0.7463281089790873
F1 score weighted 0.9440369316273594
[[ 6180     3    41    61     0     0]
 [   10  2695    56    95     0     0]
 [  615    26 28048   738     0     0]
 [  243    17  1213 18524     0     0]
 [    2     1    31    89   142     0]
 [    4     2     8    16     0     0]]


Feature importance

In [29]:
# Disabled plotting code, kept as an inert string literal (none of it executes):
# feature-importance chart of the booster, labelled with feature_names.
"""
from xgboost import plot_importance

model.get_booster().feature_names = feature_names

fig, ax = plt.subplots(1,1,figsize=(25,35))
plot_importance(model.get_booster(), ax=ax)
plt.show()
"""

'\nfrom xgboost import plot_importance\n\nmodel.get_booster().feature_names = feature_names\n\nfig, ax = plt.subplots(1,1,figsize=(25,35))\nplot_importance(model.get_booster(), ax=ax)\nplt.show()\n'

# 4) Optuna

In [25]:
import optuna

In [26]:
# Disabled Optuna objective, kept as an inert string literal (none of it executes).
# It samples lambda, alpha, gamma, colsample_bytree, subsample and min_child_weight,
# fits an XGBClassifier on the training split with early stopping on the validation
# split, and returns the weighted F1 score on the validation split.
"""
def objective(trial):
    
    param = {
        'num_class':6,
        'objective':'multi:softmax', 
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,0.9,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.05]),
        'n_estimators': 1500,
        'max_depth': trial.suggest_categorical('max_depth', [13]),
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),
    }
    model = xgb.XGBClassifier(**param)
    #xgb.XGBRegressor(**param)  
    
    model.fit(X_train,Y_train,eval_set=[(X_train, Y_train), (X_valid, Y_valid)],early_stopping_rounds=20,verbose=False)
    
    preds = model.predict(X_valid)
    
    score = f1_score(Y_valid, preds, average='weighted')
    
    return score"""

"\ndef objective(trial):\n    \n    param = {\n        'num_class':6,\n        'objective':'multi:softmax', \n        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process\n        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),\n        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),\n        'gamma': trial.suggest_loguniform('gamma', 1e-3, 10.0),\n        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),\n        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,0.9,1.0]),\n        'learning_rate': trial.suggest_categorical('learning_rate', [0.05]),\n        'n_estimators': 1500,\n        'max_depth': trial.suggest_categorical('max_depth', [13]),\n        'random_state': trial.suggest_categorical('random_state', [2020]),\n        'min_child_weight': trial.suggest_int('min_child_weight', 1, 30),\n    }\n    model = xgb.XGBClass

In [27]:
# Disabled Optuna run, kept as an inert string literal (none of it executes):
# a 90-trial study maximizing `objective`, then printing the best parameters.
"""
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=90)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)"""

"\nstudy = optuna.create_study(direction='maximize')\nstudy.optimize(objective, n_trials=90)\nprint('Number of finished trials:', len(study.trials))\nprint('Best trial:', study.best_trial.params)"

In [28]:
#study.trials_dataframe()

# 5) Saving results

In [31]:

# Predict a class for every test row and write the submission file: an 'Id' column
# holding the row position, followed by the predicted 'change_type'.
pred_y = model.predict(X_test)
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv("data/XGBOOST_13.csv", index_label='Id', index=True)


In [32]:

# Per-class probabilities for every test row, stored as a NumPy array file
# (np.save appends the ".npy" extension to the given path).
pred_probs = model.predict_proba(X_test)
np.save("data/PROBS_XGBOOST_13", pred_probs)