<h1><b>[XIV - XGBOOST_15]</b></h1>

XGBoost, with:

- learning_rate=0.08,  
- colsample_bytree = 0.7,
- subsample = 1.0,
- n_estimators=350, 
- max_depth=15, 
- gamma=7,
- n_jobs=-1,

The predictions will be saved in `data/XGBOOST_15.csv` and the class probabilities in `data/PROBS_XGBOOST_15.npy`.

Note that the original `data/PROBS_XGBOOST_15.npy` is already available.

___________________________

In [18]:
import geopandas as gpd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split

In [19]:
# Read the pre-built feature tables for the labelled (train) and unlabelled
# (test) samples. Paths are relative to the notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer="data/train_df_FINAL.csv")
test_df = pd.read_csv(filepath_or_buffer="data/test_df_FINAL.csv")

In [20]:
# Exclude the rows whose index label lies in the closed range 195453..197302.
# NOTE(review): the reason for excluding this range is not stated in this
# notebook — document it or derive the range programmatically.
excluded_rows = (train_df.index >= 195453) & (train_df.index <= 197302)
train_df = train_df[~excluded_rows]

In [21]:
# Fill missing training values with the most frequent value of the same column
# among the rows that share the same `change_type` label.
# The `geometry` column is never imputed.
change_types = train_df["change_type"].unique()

# Only columns that actually contain missing values can need filling; skipping
# the others avoids computing a mode per class for every column.
columns_to_fill = [
    column
    for column in train_df.columns
    if column != "geometry" and train_df[column].isnull().any()
]

for change_type in change_types:
    # Hoisted out of the column loop: the class membership does not depend on
    # the column being processed.
    class_mask = train_df["change_type"] == change_type
    for column in columns_to_fill:
        missing_mask = class_mask & train_df[column].isnull()
        if not missing_mask.any():
            continue
        class_mode = train_df.loc[class_mask, column].mode()
        if class_mode.empty:
            # Every value of this column is missing for this class, so there is
            # no mode to use; leave the NaN in place instead of raising on [0].
            continue
        train_df.loc[missing_mask, column] = class_mode.iloc[0]

In [22]:
# Names of the columns used as model inputs. The same list selects the feature
# matrix from both train_df and test_df, so every name must exist in both CSVs.
# The shape printed further down reports 306 feature columns.
# Group descriptions below are read off the column names only — TODO confirm
# against the feature-engineering step that produced the CSV files.
# First group: urban_type_* / geography_type_* / geourban_* columns (presumably
# categorical indicator columns and their combinations).
feature_names=[ 'urban_type_SparseUrban', 'urban_type_N,A', 'urban_type_Rural', 'urban_type_Industrial', 'urban_type_DenseUrban', 'urban_type_UrbanSlum',
                'urban_type_Dense Urban,Urban Slum', 'urban_type_Dense Urban,Industrial', 'urban_type_Sparse Urban,Industrial', 'urban_type_Sparse Urban,Urban Slum', 
                'geography_type_Farms', 'geography_type_GrassLand', 'geography_type_N,A', 'geography_type_Snow', 'geography_type_SparseForest', 'geography_type_River', 'geography_type_Coastal', 'geography_type_Lakes', 'geography_type_DenseForest', 'geography_type_Desert', 'geography_type_Hills', 'geography_type_BarrenLand', 
                'geography_type_Sparse Forest,Grass Land', 'geography_type_Sparse Forest,Farms', 'geography_type_Sparse Forest,Dense Forest,Grass Land', 'geography_type_Barren Land,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest', 'geography_type_Sparse Forest,Grass Land,Lakes', 'geography_type_Sparse Forest,Farms,Lakes', 'geography_type_River,Sparse Forest,Grass Land', 'geography_type_River,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest,Grass Land,Lakes', 'geography_type_Barren Land,Sparse Forest,Grass Land', 'geography_type_Dense Forest,Grass Land', 'geography_type_River,Sparse Forest,Farms', 
                'geourban_geography_type_Farms_urban_type_SparseUrban', 'geourban_geography_type_Farms_urban_type_Rural', 'geourban_geography_type_Farms_urban_type_Industrial', 'geourban_geography_type_Farms_urban_type_DenseUrban', 'geourban_geography_type_GrassLand_urban_type_SparseUrban', 'geourban_geography_type_GrassLand_urban_type_Rural', 'geourban_geography_type_GrassLand_urban_type_Industrial', 'geourban_geography_type_GrassLand_urban_type_DenseUrban', 'geourban_geography_type_SparseForest_urban_type_SparseUrban', 'geourban_geography_type_SparseForest_urban_type_Rural', 'geourban_geography_type_SparseForest_urban_type_Industrial', 'geourban_geography_type_SparseForest_urban_type_DenseUrban', 'geourban_geography_type_SparseForest_urban_type_UrbanSlum', 'geourban_geography_type_River_urban_type_SparseUrban', 'geourban_geography_type_River_urban_type_Rural', 'geourban_geography_type_River_urban_type_Industrial', 'geourban_geography_type_River_urban_type_DenseUrban', 'geourban_geography_type_Coastal_urban_type_SparseUrban', 'geourban_geography_type_Lakes_urban_type_SparseUrban', 'geourban_geography_type_Lakes_urban_type_Rural', 'geourban_geography_type_Lakes_urban_type_Industrial', 'geourban_geography_type_Lakes_urban_type_DenseUrban', 'geourban_geography_type_DenseForest_urban_type_SparseUrban', 'geourban_geography_type_DenseForest_urban_type_Rural', 'geourban_geography_type_DenseForest_urban_type_Industrial', 'geourban_geography_type_DenseForest_urban_type_DenseUrban', 'geourban_geography_type_DenseForest_urban_type_UrbanSlum', 'geourban_geography_type_Desert_urban_type_Industrial', 'geourban_geography_type_BarrenLand_urban_type_SparseUrban', 'geourban_geography_type_BarrenLand_urban_type_Rural', 'geourban_geography_type_BarrenLand_urban_type_Industrial', 'geourban_geography_type_BarrenLand_urban_type_DenseUrban', 
               
                # img_<channel>_mean_date0..4 columns, one per colour channel and date.
                'img_red_mean_date0', 'img_red_mean_date1', 'img_red_mean_date2', 'img_red_mean_date3', 'img_red_mean_date4', 
                'img_green_mean_date0', 'img_green_mean_date1', 'img_green_mean_date2', 'img_green_mean_date3', 'img_green_mean_date4',
                'img_blue_mean_date0', 'img_blue_mean_date1', 'img_blue_mean_date2', 'img_blue_mean_date3', 'img_blue_mean_date4', 
               
                # img_<channel>_std_date0..4 columns, one per colour channel and date.
                'img_red_std_date0', 'img_red_std_date1', 'img_red_std_date2', 'img_red_std_date3', 'img_red_std_date4', 
                'img_green_std_date0', 'img_green_std_date1', 'img_green_std_date2', 'img_green_std_date3', 'img_green_std_date4', 
                'img_blue_std_date0', 'img_blue_std_date1', 'img_blue_std_date2', 'img_blue_std_date3', 'img_blue_std_date4', 
               
                # Channel-independent img_mean / img_std per date, then columns with
                # suffixes such as `_4-0` (presumably date4 minus date0 — TODO confirm).
                'img_mean_date0', 'img_std_date0', 'img_mean_date1', 'img_std_date1', 'img_mean_date2', 'img_std_date2', 'img_mean_date3', 'img_std_date3', 'img_mean_date4', 'img_std_date4', 
                'img_mean_date_4-0', 'img_mean_date_1-0', 'img_mean_date_2-1', 'img_mean_date_3-2', 'img_mean_date_4-3', 'img_std_date_4-0', 'img_std_date_1-0', 'img_std_date_2-1', 'img_std_date_3-2', 'img_std_date_4-3', 
                'img_red_mean_date_4-0', 'img_red_mean_date_1-0', 'img_red_mean_date_2-1', 'img_red_mean_date_3-2', 'img_red_mean_date_4-3', 'img_green_mean_date_4-0', 'img_green_mean_date_1-0', 'img_green_mean_date_2-1', 'img_green_mean_date_3-2', 'img_green_mean_date_4-3', 'img_blue_mean_date_4-0', 'img_blue_mean_date_1-0', 'img_blue_mean_date_2-1', 'img_blue_mean_date_3-2', 'img_blue_mean_date_4-3', 'img_red_std_date_4-0', 'img_red_std_date_1-0', 'img_red_std_date_2-1', 'img_red_std_date_3-2', 'img_red_std_date_4-3', 'img_green_std_date_4-0', 'img_green_std_date_1-0', 'img_green_std_date_2-1', 'img_green_std_date_3-2', 'img_green_std_date_4-3', 'img_blue_std_date_4-0', 'img_blue_std_date_1-0', 'img_blue_std_date_2-1', 'img_blue_std_date_3-2', 'img_blue_std_date_4-3', 
               
                # change_status_date<k>_<status> columns for dates 0..4.
                'change_status_date0_Construction Done', 'change_status_date0_Construction Midway', 'change_status_date0_Construction Started', 'change_status_date0_Excavation', 'change_status_date0_Greenland', 'change_status_date0_Land Cleared', 'change_status_date0_Materials Dumped', 'change_status_date0_Materials Introduced', 'change_status_date0_Operational', 'change_status_date0_Prior Construction', 
                'change_status_date1_Construction Done', 'change_status_date1_Construction Midway', 'change_status_date1_Construction Started', 'change_status_date1_Excavation', 'change_status_date1_Greenland', 'change_status_date1_Land Cleared', 'change_status_date1_Materials Dumped', 'change_status_date1_Materials Introduced', 'change_status_date1_Operational', 'change_status_date1_Prior Construction', 
                'change_status_date2_Construction Done', 'change_status_date2_Construction Midway', 'change_status_date2_Construction Started', 'change_status_date2_Excavation', 'change_status_date2_Greenland', 'change_status_date2_Land Cleared', 'change_status_date2_Materials Dumped', 'change_status_date2_Materials Introduced', 'change_status_date2_Operational', 'change_status_date2_Prior Construction', 
                'change_status_date3_Construction Done', 'change_status_date3_Construction Midway', 'change_status_date3_Construction Started', 'change_status_date3_Excavation', 'change_status_date3_Greenland', 'change_status_date3_Land Cleared', 'change_status_date3_Materials Dumped', 'change_status_date3_Materials Introduced', 'change_status_date3_Operational', 'change_status_date3_Prior Construction', 
                'change_status_date4_Construction Done', 'change_status_date4_Construction Midway', 'change_status_date4_Construction Started', 'change_status_date4_Excavation', 'change_status_date4_Greenland', 'change_status_date4_Land Cleared', 'change_status_date4_Materials Dumped', 'change_status_date4_Materials Introduced', 'change_status_date4_Operational', 'change_status_date4_Prior Construction', 
               
                # Date-independent change_status_<status> columns, per-date `_encoded`
                # columns and their `_i-j` counterparts.
                'change_status_Greenland', 'change_status_Land Cleared', 'change_status_Excavation', 'change_status_Construction Started', 'change_status_Construction Midway', 'change_status_Materials Dumped', 'change_status_Materials Introduced', 'change_status_Operational', 'change_status_Construction Done', 'change_status_Prior Construction', 
                'change_status_date0_encoded', 'change_status_date1_encoded', 'change_status_date2_encoded', 'change_status_date3_encoded', 'change_status_date4_encoded', 
                'change_status_date_4-0', 'change_status_date_1-0', 'change_status_date_2-1', 'change_status_date_3-2', 'change_status_date_4-3', 
               
               
                # date_i-j columns (presumably gaps between acquisition dates — TODO confirm).
                'date_1-0', 'date_2-1', 'date_3-2', 'date_4-3', 'date_4-0', 
                'date_0-0min', 
               
                # `.../Date` columns (presumably the `_i-j` columns above divided by the
                # matching date gap — TODO confirm).
                'img_mean_date_4-0/Date', 'img_mean_date_1-0/Date', 'img_mean_date_2-1/Date', 'img_mean_date_3-2/Date', 'img_mean_date_4-3/Date', 'img_std_date_4-0/Date', 'img_std_date_1-0/Date', 'img_std_date_2-1/Date', 'img_std_date_3-2/Date', 'img_std_date_4-3/Date', 'img_red_mean_date_4-0/Date', 'img_red_mean_date_1-0/Date', 'img_red_mean_date_2-1/Date', 'img_red_mean_date_3-2/Date', 'img_red_mean_date_4-3/Date', 'img_green_mean_date_4-0/Date', 'img_green_mean_date_1-0/Date', 'img_green_mean_date_2-1/Date', 'img_green_mean_date_3-2/Date', 'img_green_mean_date_4-3/Date', 'img_blue_mean_date_4-0/Date', 'img_blue_mean_date_1-0/Date', 'img_blue_mean_date_2-1/Date', 'img_blue_mean_date_3-2/Date', 'img_blue_mean_date_4-3/Date', 'img_red_std_date_4-0/Date', 'img_red_std_date_1-0/Date', 'img_red_std_date_2-1/Date', 'img_red_std_date_3-2/Date', 'img_red_std_date_4-3/Date', 'img_green_std_date_4-0/Date', 'img_green_std_date_1-0/Date', 'img_green_std_date_2-1/Date', 'img_green_std_date_3-2/Date', 'img_green_std_date_4-3/Date', 'img_blue_std_date_4-0/Date', 'img_blue_std_date_1-0/Date', 'img_blue_std_date_2-1/Date', 'img_blue_std_date_3-2/Date', 'img_blue_std_date_4-3/Date', 'change_status_date_4-0/Date', 'change_status_date_1-0/Date', 'change_status_date_2-1/Date', 'change_status_date_3-2/Date', 'change_status_date_4-3/Date', 
               
                # geometry_* columns.
                # NOTE: 'geometry_minumum_bounding_circle' is spelled this way on purpose
                # here: the name must match the CSV column exactly, so do not correct the
                # spelling without renaming the column upstream.
                'geometry_area', 
                'geometry_perimeter', 
                'geometry_inscribed_circle_radius', 'geometry_compactness', 'geometry_convexity', 
                'geometry_vertices',
                'geometry_bboxwidth', 
                'geometry_bboxheight', 
                'geometry_bboxratio', 
                'geometry_bboxarea', 'geometry_bboxperimeter', 'geometry_aspectratio', 'geometry_diameter', 'geometry_minumum_bounding_circle', 'geometry_minimum_rotated_rectangle', 
              
                # neighb_* columns (presumably statistics over neighbouring samples; the
                # 0.001 / 0.003 suffixes look like search radii — TODO confirm).
                'neighb_density', 
                'neighb_nb', 
                'neighb_similar',
                'neighb_maxdiff_change_status_date0_0.001', 'neighb_maxdiff_change_status_date1_0.001', 'neighb_maxdiff_change_status_date2_0.001', 'neighb_maxdiff_change_status_date3_0.001', 'neighb_maxdiff_change_status_date4_0.001',
                'neighb_maxdiff_change_status_date0_0.003', 'neighb_maxdiff_change_status_date1_0.003', 'neighb_maxdiff_change_status_date2_0.003', 'neighb_maxdiff_change_status_date3_0.003', 'neighb_maxdiff_change_status_date4_0.003', 
                'neighb_maxdiff_meancolor_0', 'neighb_maxdiff_meancolor_1', 'neighb_maxdiff_meancolor_2', 'neighb_maxdiff_meancolor_3', 'neighb_maxdiff_meancolor_4',
                'neighb_maxdiff_stdcolor_0', 'neighb_maxdiff_stdcolor_1', 'neighb_maxdiff_stdcolor_2', 'neighb_maxdiff_stdcolor_3', 'neighb_maxdiff_stdcolor_4'
]

In [23]:
# Materialise the NumPy inputs: the feature matrix and label vector for the
# training rows, and the feature matrix (same column order) for the test rows.
X = train_df.loc[:, feature_names].to_numpy()
Y = train_df.loc[:, 'change_type'].to_numpy()
X_test = test_df.loc[:, feature_names].to_numpy()

In [28]:
# Train on the complete labelled set: no hold-out split is carved out here,
# so X_train / Y_train are simply aliases of X / Y.
X_train, Y_train = X, Y

In [24]:
# Sanity check: train features, train labels and test features, in that order.
print(*(array.shape for array in (X, Y, X_test)))

(294296, 306) (294296,) (120526, 306)


# Training

In [25]:
def printScores(y_true, y_predict):
    """Print micro, macro and weighted F1 scores, then the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predict : array-like
        Predicted labels, aligned element-wise with ``y_true``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    # Compute every metric up front so nothing is printed if one of them fails.
    labelled_scores = [
        ("F1 score micro", f1_score(y_true, y_predict, average='micro')),
        ("F1 score macro", f1_score(y_true, y_predict, average='macro')),
        ("F1 score weighted", f1_score(y_true, y_predict, average='weighted')),
    ]
    matrix = confusion_matrix(y_true, y_predict)

    for label, score in labelled_scores:
        print(label, score)
    print(matrix)

In [29]:
# Gradient-boosted tree classifier with the hyper-parameters listed in the
# notebook header, fitted on the full training set.
model = xgb.XGBClassifier(
    # `silent` is deprecated in XGBoost; `verbosity` is its replacement
    # (1 = warnings only, the library default).
    verbosity=1,
    # `multi:softprob` trains the same trees as `multi:softmax` but exposes the
    # per-class probabilities, so both `predict` (class labels) and the
    # `predict_proba` call used later in this notebook are supported.
    objective='multi:softprob',
    num_class=6,
    n_estimators=350,
    max_depth=15,
    learning_rate=0.08,
    # Stated in the header; equal to the library default, made explicit here.
    subsample=1.0,
    colsample_bytree=0.7,
    gamma=7,
    # NaN entries in the feature matrices are treated as missing values.
    missing=np.nan,
    n_jobs=-1,
    random_state=25,
)
model.fit(X_train, Y_train)

# Saving results

In [30]:

# Predicted class label for every test row.
pred_y = model.predict(X_test)

# Write the submission file: the row position becomes the `Id` column and the
# predicted label goes into `change_type`.
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv("data/XGBOOST_15.csv", index=True, index_label='Id')


In [31]:

# Per-class probabilities for every test row, stored as a NumPy binary file.
# The path is given without an extension; per the notebook header the result
# ends up in `data/PROBS_XGBOOST_15.npy`.
pred_probs = model.predict_proba(X_test)
np.save(file="data/PROBS_XGBOOST_15", arr=pred_probs)
