<h1><b>[X - LightGBM]</b></h1>

The predictions are saved to `data/LIGHTGBM.csv` and the class probabilities to `data/PROBS_LIGHTGBM.npy`.

Note that the original `data/PROBS_LIGHTGBM.npy` is already available.

___________________________

In [1]:
import lightgbm as lgb

In [2]:
# Display the installed LightGBM version (shown as this cell's output).
lgb.__version__

'4.2.0'

In [3]:
import geopandas as gpd
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)  
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.model_selection import train_test_split

In [5]:
# Load the prepared training and test tables from the data/ directory.
# Order matters: the first path feeds train_df, the second feeds test_df.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train_df_FINAL.csv", "data/test_df_FINAL.csv")
)

In [6]:
# Drop the rows whose index label lies in the closed range 195453..197302.
# NOTE(review): the reason for excluding this range is not recorded in the notebook.
excluded_rows = (train_df.index >= 195453) & (train_df.index <= 197302)
train_df = train_df[~excluded_rows]

In [7]:
# Class-conditional mode imputation of the training table: for every change_type,
# fill the missing values of each column (except "geometry") with the most frequent
# value of that column among the rows of the same change_type.
# NOTE(review): this relies on the target label, so it cannot be reproduced on
# test_df; the test features are imputed separately further down.
# Rows whose change_type is itself missing cannot form a class and are skipped.
change_types = train_df["change_type"].dropna().unique()
for change_type in change_types:
    # Row mask of the current class, computed once instead of once per column.
    class_rows = train_df["change_type"] == change_type
    for column in train_df.columns:
        if column == "geometry":
            continue
        missing_rows = class_rows & train_df[column].isnull()
        if not missing_rows.any():
            # Nothing to fill for this (class, column) pair.
            continue
        class_modes = train_df.loc[class_rows, column].mode()
        if class_modes.empty:
            # Every value of this column is missing within the class, so there is
            # no mode to fill with. Leave the NaNs in place instead of crashing.
            continue
        train_df.loc[missing_rows, column] = class_modes.iloc[0]

In [8]:
# Names of the train_df / test_df columns used as model inputs.
# Entries that are commented out are excluded from the feature matrices.
# The groups below are organised by column-name prefix; what each column encodes
# is presumably defined in the feature-engineering step that produced the CSVs
# (TODO confirm there).
# urban_type_* columns (multi-label combinations are commented out).
feature_names=[ 'urban_type_N,A', 'urban_type_UrbanSlum', 'urban_type_Rural', 'urban_type_DenseUrban', 'urban_type_Industrial', 'urban_type_SparseUrban', 
                #'urban_type_Dense Urban,Industrial', 'urban_type_Dense Urban,Urban Slum', 'urban_type_Sparse Urban,Industrial', 'urban_type_Sparse Urban,Urban Slum', 
               
                # geography_type_* columns (multi-label combinations are commented out).
                'geography_type_N,A', 'geography_type_Desert', 'geography_type_Farms', 'geography_type_DenseForest', 'geography_type_Hills', 'geography_type_River', 'geography_type_GrassLand', 'geography_type_Snow', 'geography_type_Lakes', 'geography_type_BarrenLand', 'geography_type_Coastal', 'geography_type_SparseForest', 
                #'geography_type_Sparse Forest,Grass Land', 'geography_type_Sparse Forest,Farms', 'geography_type_Sparse Forest,Dense Forest,Grass Land', 'geography_type_Barren Land,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest', 'geography_type_Sparse Forest,Grass Land,Lakes', 'geography_type_Sparse Forest,Farms,Lakes', 'geography_type_River,Sparse Forest,Grass Land', 'geography_type_River,Sparse Forest', 'geography_type_Sparse Forest,Dense Forest,Grass Land,Lakes', 'geography_type_Barren Land,Sparse Forest,Grass Land', 'geography_type_Dense Forest,Grass Land', 'geography_type_River,Sparse Forest,Farms', 
               
                # geourban_* (geography_type x urban_type) columns: all excluded.
                #'geourban_geography_type_Desert_urban_type_Industrial', 'geourban_geography_type_Farms_urban_type_Rural', 'geourban_geography_type_Farms_urban_type_DenseUrban', 'geourban_geography_type_Farms_urban_type_Industrial', 'geourban_geography_type_Farms_urban_type_SparseUrban', 'geourban_geography_type_DenseForest_urban_type_UrbanSlum', 'geourban_geography_type_DenseForest_urban_type_Rural', 'geourban_geography_type_DenseForest_urban_type_DenseUrban', 'geourban_geography_type_DenseForest_urban_type_Industrial', 'geourban_geography_type_DenseForest_urban_type_SparseUrban', 'geourban_geography_type_River_urban_type_Rural', 'geourban_geography_type_River_urban_type_DenseUrban', 'geourban_geography_type_River_urban_type_Industrial', 'geourban_geography_type_River_urban_type_SparseUrban', 'geourban_geography_type_GrassLand_urban_type_Rural', 'geourban_geography_type_GrassLand_urban_type_DenseUrban', 'geourban_geography_type_GrassLand_urban_type_Industrial', 'geourban_geography_type_GrassLand_urban_type_SparseUrban', 'geourban_geography_type_Lakes_urban_type_Rural', 'geourban_geography_type_Lakes_urban_type_DenseUrban', 'geourban_geography_type_Lakes_urban_type_Industrial', 'geourban_geography_type_Lakes_urban_type_SparseUrban', 'geourban_geography_type_BarrenLand_urban_type_Rural', 'geourban_geography_type_BarrenLand_urban_type_DenseUrban', 'geourban_geography_type_BarrenLand_urban_type_Industrial', 'geourban_geography_type_BarrenLand_urban_type_SparseUrban', 'geourban_geography_type_Coastal_urban_type_SparseUrban', 'geourban_geography_type_SparseForest_urban_type_UrbanSlum', 'geourban_geography_type_SparseForest_urban_type_Rural', 'geourban_geography_type_SparseForest_urban_type_DenseUrban', 'geourban_geography_type_SparseForest_urban_type_Industrial', 'geourban_geography_type_SparseForest_urban_type_SparseUrban', 
               
                # Per-channel image mean columns for date0..date4.
                'img_red_mean_date0', 'img_red_mean_date1', 'img_red_mean_date2', 'img_red_mean_date3', 'img_red_mean_date4', 
                'img_green_mean_date0', 'img_green_mean_date1', 'img_green_mean_date2', 'img_green_mean_date3', 'img_green_mean_date4', 
                'img_blue_mean_date0', 'img_blue_mean_date1', 'img_blue_mean_date2', 'img_blue_mean_date3', 'img_blue_mean_date4', 
               
                # Per-channel image std columns for date0..date4.
                'img_red_std_date0', 'img_red_std_date1', 'img_red_std_date2', 'img_red_std_date3', 'img_red_std_date4', 
                'img_green_std_date0', 'img_green_std_date1', 'img_green_std_date2', 'img_green_std_date3', 'img_green_std_date4', 
                'img_blue_std_date0', 'img_blue_std_date1', 'img_blue_std_date2', 'img_blue_std_date3', 'img_blue_std_date4', 
               
                # change_status_date<k>_<status> columns, one row of names per date.
                'change_status_date0_Construction Done', 'change_status_date0_Construction Midway', 'change_status_date0_Construction Started', 'change_status_date0_Excavation', 'change_status_date0_Greenland', 'change_status_date0_Land Cleared', 'change_status_date0_Materials Dumped', 'change_status_date0_Materials Introduced', 'change_status_date0_Operational', 'change_status_date0_Prior Construction', 
                'change_status_date1_Construction Done', 'change_status_date1_Construction Midway', 'change_status_date1_Construction Started', 'change_status_date1_Excavation', 'change_status_date1_Greenland', 'change_status_date1_Land Cleared', 'change_status_date1_Materials Dumped', 'change_status_date1_Materials Introduced', 'change_status_date1_Operational', 'change_status_date1_Prior Construction', 
                'change_status_date2_Construction Done', 'change_status_date2_Construction Midway', 'change_status_date2_Construction Started', 'change_status_date2_Excavation', 'change_status_date2_Greenland', 'change_status_date2_Land Cleared', 'change_status_date2_Materials Dumped', 'change_status_date2_Materials Introduced', 'change_status_date2_Operational', 'change_status_date2_Prior Construction', 
                'change_status_date3_Construction Done', 'change_status_date3_Construction Midway', 'change_status_date3_Construction Started', 'change_status_date3_Excavation', 'change_status_date3_Greenland', 'change_status_date3_Land Cleared', 'change_status_date3_Materials Dumped', 'change_status_date3_Materials Introduced', 'change_status_date3_Operational', 'change_status_date3_Prior Construction', 
                'change_status_date4_Construction Done', 'change_status_date4_Construction Midway', 'change_status_date4_Construction Started', 'change_status_date4_Excavation', 'change_status_date4_Greenland', 'change_status_date4_Land Cleared', 'change_status_date4_Materials Dumped', 'change_status_date4_Materials Introduced', 'change_status_date4_Operational', 'change_status_date4_Prior Construction', 
               
                # Date-independent change_status_<status> columns, encoded statuses and
                # their "a-b" columns (presumably differences between dates a and b; TODO confirm).
                'change_status_Prior Construction', 'change_status_Greenland', 'change_status_Land Cleared', 'change_status_Excavation', 'change_status_Materials Dumped', 'change_status_Materials Introduced', 'change_status_Construction Started', 'change_status_Construction Midway', 'change_status_Construction Done', 'change_status_Operational', 
                'change_status_date0_encoded', 'change_status_date1_encoded', 'change_status_date2_encoded', 'change_status_date3_encoded', 'change_status_date4_encoded', 
                'change_status_date_4-0', 'change_status_date_1-0', 'change_status_date_2-1', 'change_status_date_3-2', 'change_status_date_4-3', 
               
                # Overall image mean/std columns per date and their "a-b" columns.
                'img_mean_date0', 'img_std_date0', 'img_mean_date1', 'img_std_date1', 'img_mean_date2', 'img_std_date2', 'img_mean_date3', 'img_std_date3', 'img_mean_date4', 'img_std_date4', 
                'img_mean_date_4-0', 'img_mean_date_1-0', 'img_mean_date_2-1', 'img_mean_date_3-2', 'img_mean_date_4-3', 'img_std_date_4-0', 'img_std_date_1-0', 'img_std_date_2-1', 'img_std_date_3-2', 'img_std_date_4-3', 
               
                # Per-channel "a-b" columns for image mean and std.
                'img_red_mean_date_4-0', 'img_red_mean_date_1-0', 'img_red_mean_date_2-1', 'img_red_mean_date_3-2', 'img_red_mean_date_4-3', 'img_green_mean_date_4-0', 'img_green_mean_date_1-0', 'img_green_mean_date_2-1', 'img_green_mean_date_3-2', 'img_green_mean_date_4-3', 'img_blue_mean_date_4-0', 'img_blue_mean_date_1-0', 'img_blue_mean_date_2-1', 'img_blue_mean_date_3-2', 'img_blue_mean_date_4-3', 'img_red_std_date_4-0', 'img_red_std_date_1-0', 'img_red_std_date_2-1', 'img_red_std_date_3-2', 'img_red_std_date_4-3', 'img_green_std_date_4-0', 'img_green_std_date_1-0', 'img_green_std_date_2-1', 'img_green_std_date_3-2', 'img_green_std_date_4-3', 'img_blue_std_date_4-0', 'img_blue_std_date_1-0', 'img_blue_std_date_2-1', 'img_blue_std_date_3-2', 'img_blue_std_date_4-3', 
                
                # date_a-b columns and the ".../Date" columns (presumably the "a-b" values
                # divided by the matching date gap; TODO confirm).
                'date_1-0', 'date_2-1', 'date_3-2', 'date_4-3', 'date_4-0',
                #'date_0-0min', 
                'img_mean_date_4-0/Date', 'img_mean_date_1-0/Date', 'img_mean_date_2-1/Date', 'img_mean_date_3-2/Date', 'img_mean_date_4-3/Date', 'img_std_date_4-0/Date', 'img_std_date_1-0/Date', 'img_std_date_2-1/Date', 'img_std_date_3-2/Date', 'img_std_date_4-3/Date', 'img_red_mean_date_4-0/Date', 'img_red_mean_date_1-0/Date', 'img_red_mean_date_2-1/Date', 'img_red_mean_date_3-2/Date', 'img_red_mean_date_4-3/Date', 'img_green_mean_date_4-0/Date', 'img_green_mean_date_1-0/Date', 'img_green_mean_date_2-1/Date', 'img_green_mean_date_3-2/Date', 'img_green_mean_date_4-3/Date', 'img_blue_mean_date_4-0/Date', 'img_blue_mean_date_1-0/Date', 'img_blue_mean_date_2-1/Date', 'img_blue_mean_date_3-2/Date', 'img_blue_mean_date_4-3/Date', 'img_red_std_date_4-0/Date', 'img_red_std_date_1-0/Date', 'img_red_std_date_2-1/Date', 'img_red_std_date_3-2/Date', 'img_red_std_date_4-3/Date', 'img_green_std_date_4-0/Date', 'img_green_std_date_1-0/Date', 'img_green_std_date_2-1/Date', 'img_green_std_date_3-2/Date', 'img_green_std_date_4-3/Date', 'img_blue_std_date_4-0/Date', 'img_blue_std_date_1-0/Date', 'img_blue_std_date_2-1/Date', 'img_blue_std_date_3-2/Date', 'img_blue_std_date_4-3/Date', 
                'change_status_date_4-0/Date', 'change_status_date_1-0/Date', 'change_status_date_2-1/Date', 'change_status_date_3-2/Date', 'change_status_date_4-3/Date', 
               
                # geometry_* columns.
                # NOTE(review): 'geometry_minumum_bounding_circle' is spelled this way on purpose
                # here; it presumably matches the CSV column header, so do not correct it
                # without renaming the column upstream.
                'geometry_area', 'geometry_perimeter', 'geometry_inscribed_circle_radius', 'geometry_compactness', 'geometry_convexity', 'geometry_vertices', 'geometry_bboxwidth', 'geometry_bboxheight', 'geometry_bboxratio', 'geometry_bboxarea', 'geometry_bboxperimeter', 
                #'geometry_aspectratio', 
                'geometry_diameter', 'geometry_minumum_bounding_circle', 'geometry_minimum_rotated_rectangle', 
               
                # neighb_* columns (max differences).
                'neighb_density', 'neighb_nb', 'neighb_similar', 
                'neighb_maxdiff_change_status_date0_0.001', 'neighb_maxdiff_change_status_date1_0.001', 'neighb_maxdiff_change_status_date2_0.001', 'neighb_maxdiff_change_status_date3_0.001', 'neighb_maxdiff_change_status_date4_0.001', 
                'neighb_maxdiff_change_status_date0_0.003', 'neighb_maxdiff_change_status_date1_0.003', 'neighb_maxdiff_change_status_date2_0.003', 'neighb_maxdiff_change_status_date3_0.003', 'neighb_maxdiff_change_status_date4_0.003', 
                'neighb_maxdiff_meancolor_0', 'neighb_maxdiff_meancolor_1', 'neighb_maxdiff_meancolor_2', 'neighb_maxdiff_meancolor_3', 'neighb_maxdiff_meancolor_4', 
                'neighb_maxdiff_stdcolor_0', 'neighb_maxdiff_stdcolor_1', 'neighb_maxdiff_stdcolor_2', 'neighb_maxdiff_stdcolor_3', 'neighb_maxdiff_stdcolor_4', 
               
                # neighb_* columns (mean differences).
                'neighb_meandiff_changestatus_0', 'neighb_meandiff_changestatus_1', 'neighb_meandiff_changestatus_2', 'neighb_meandiff_changestatus_3', 'neighb_meandiff_changestatus_4', 
                'neighb_meandiff_meancolor_0', 'neighb_meandiff_meancolor_1', 'neighb_meandiff_meancolor_2', 'neighb_meandiff_meancolor_3', 'neighb_meandiff_meancolor_4', 
                'neighb_meandiff_stdcolor_0', 'neighb_meandiff_stdcolor_1', 'neighb_meandiff_stdcolor_2', 'neighb_meandiff_stdcolor_3', 'neighb_meandiff_stdcolor_4'
            ]

In [9]:
# Convert the selected feature columns to plain NumPy arrays.
# X and X_test share the column order given by feature_names; Y holds the
# change_type label of every training row, aligned row-by-row with X.
X = train_df.loc[:, feature_names].to_numpy()
Y = train_df.loc[:, 'change_type'].to_numpy()
X_test = test_df.loc[:, feature_names].to_numpy()

In [10]:

from sklearn.impute import SimpleImputer

# Median-impute any NaN left in the feature matrices.
# The column medians are learned from the train and test rows together, and the
# same fitted imputer is then applied to BOTH matrices so they receive identical
# treatment and keep the same columns. Previously only X_test was transformed,
# so X could keep NaNs or end up with a different column count than X_test.
# NOTE: despite its name, imp_mean uses the median strategy; the name is kept
# because it is a notebook-level variable.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
imp_mean.fit(np.concatenate([X, X_test], axis=0))
X = imp_mean.transform(X)
X_test = imp_mean.transform(X_test)


In [11]:
# Sanity check: shapes of the train features, train labels and test features.
print(*(array.shape for array in (X, Y, X_test)))

(294296, 270) (294296,) (120526, 270)


In [12]:
# Hold out 20% of the labelled rows for validation. The split is stratified on Y
# and uses a fixed random_state so it is repeatable.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=65,
)

In [13]:
# Switch: when True, the samples labelled 5 are removed from the training split.
# X_valid / Y_valid are left untouched here.
remove5 = True
if remove5:
    keep_rows = Y_train != 5
    X_train, Y_train = X_train[keep_rows], Y_train[keep_rows]

In [14]:
def printScores(y_true, y_predict):
    """Print micro, macro and weighted F1 scores followed by the confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_predict : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The scores are only printed.
    """
    # Compute everything first so nothing is printed if a metric raises.
    f1_by_average = {
        average: f1_score(y_true, y_predict, average=average)
        for average in ("micro", "macro", "weighted")
    }
    matrix = confusion_matrix(y_true, y_predict)

    for average, value in f1_by_average.items():
        print("F1 score " + average, value)
    print(matrix)

In [15]:
# Number of classes handed to the classifier: 5 when class 5 is dropped, otherwise 6.
N = 5 if remove5 else 6

# Training

In [16]:
# Multiclass LightGBM classifier with N classes and num_leaves=256.
model = lgb.LGBMClassifier(objective="multiclass", num_class=N, num_leaves=256)

# Evaluation sets for the (currently disabled) hold-out fit below. When class 5
# was removed from the training split it is removed from the validation part too.
eval_set = [(X_train, Y_train), (X_valid, Y_valid)]
if remove5:
    eval_set = [(X_train, Y_train), (X_valid[Y_valid != 5], Y_valid[Y_valid != 5])]

#model.fit(X_train, Y_train, eval_set=eval_set)

# Final fit on all labelled rows. Rows labelled 5 are excluded only when remove5
# is set, so the training data always agrees with num_class=N. Before, class 5
# was dropped here unconditionally, even with remove5 = False and N = 6.
# NOTE(review): this fit includes the X_valid rows, so scores computed on
# X_valid with this model are not an unbiased hold-out estimate.
fit_rows = (Y != 5) if remove5 else np.ones(len(Y), dtype=bool)
model.fit(X[fit_rows], Y[fit_rows])

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.705114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 43838
[LightGBM] [Info] Number of data points in the train set: 294149, number of used features: 269
[LightGBM] [Info] Start training from score -2.236451
[LightGBM] [Info] Start training from score -3.025226
[LightGBM] [Info] Start training from score -0.692736
[LightGBM] [Info] Start training from score -1.079066
[LightGBM] [Info] Start training from score -5.404185


In [17]:
# Persist the underlying booster of the fitted classifier to disk.
# NOTE(review): the file name 'mode.txt' looks like a typo for 'model.txt'; it is
# left unchanged because other code may load the file under this name.
trained_booster = model.booster_
trained_booster.save_model('mode.txt')

<lightgbm.basic.Booster at 0x7dec4cc9ee30>

In [18]:

# Score the fitted model on the training split
# (rows labelled 5 were already removed from it when remove5 is set).
pred = model.predict(X_train)
printScores(y_true=Y_train, y_predict=pred)


F1 score micro 0.8930133138420612
F1 score macro 0.9349141965058372
F1 score weighted 0.8925902246642756
[[ 25073      0     37     31      0]
 [     2  11400      7     15      0]
 [  2988     85 104590  10045      0]
 [   899    107  10960  68022      0]
 [     0      0      0      0   1058]]


In [19]:
#model.fit(X[Y != 5], Y[Y != 5])

1 > 0.7834

In [20]:

# Score the fitted model on the validation split.
# Y_valid still contains class 5; a model fitted without that class cannot
# predict it, which lowers the macro F1.
# NOTE(review): when the model was fitted on all rows of X, these validation rows
# were part of its training data, so the scores are optimistic.
pred = model.predict(X_valid)
printScores(y_true=Y_valid, y_predict=pred)


F1 score micro 0.8939347604485219
F1 score macro 0.7798688684989795
F1 score weighted 0.8932716941355393
[[ 6272     0     7     6     0     0]
 [    0  2855     1     0     0     0]
 [  727    22 26228  2450     0     0]
 [  242    16  2742 16997     0     0]
 [    0     0     0     0   265     0]
 [    4     3     8    15     0     0]]


In [21]:
# Scores recorded by hand for several num_leaves settings, kept as a bare string
# so the cell simply displays them. Each entry lists the micro / macro / weighted
# F1 and a 6x6 confusion matrix.
# NOTE(review): these matrices contain predictions of class 5 (non-zero last
# column), so they presumably come from runs with remove5 = False and a fit on
# the training split only. TODO confirm before comparing with the scores above.
"""
num_leave = 31 : 
F1 score micro 0.768824328916072
F1 score macro 0.5398432785011614
F1 score weighted 0.7684447101463885
[[ 5906    18    96   237     4    24]
 [   18  1987   160   657    13    21]
 [ 1215   113 23479  4502    37    81]
 [  579   368  5012 13850   105    83]
 [    6     4    68   158    29     0]
 [    4     4     7    12     1     2]]

num_leave 64
F1 score micro 0.7725790010193679
F1 score macro 0.5381640749369746
F1 score weighted 0.7742446783031189
[[ 5869    16   101   236     7    56]
 [   15  2028   156   624     4    29]
 [ 1210   112 23492  4359    43   211]
 [  577   336  4735 14062    83   204]
 [    5     5    65   165    23     2]
 [    4     2     8    15     1     0]]

num_leaves 120
F1 score micro 0.7769113149847094
F1 score macro 0.5471793296072588
F1 score weighted 0.7777244168599494
[[ 5833    14   124   261     9    44]
 [   16  2052   166   596     1    25]
 [ 1204    96 23615  4339    30   143]
 [  570   313  4698 14200    64   152]
 [    5     3    59   166    29     3]
 [    4     4     8    13     1     0]]


num_leaves 256
F1 score micro 0.7747366632687733
F1 score macro 0.5351609930666787
F1 score weighted 0.7779180907056056
[[ 5809    11   140   252    22    51]
 [   16  2039   149   596    18    38]
 [ 1188    95 23464  4308   135   237]
 [  540   283  4561 14269   122   222]
 [    6     3    60   173    20     3]
 [    4     2     9    15     0     0]]
"""

'\nnum_leave = 31 : \nF1 score micro 0.768824328916072\nF1 score macro 0.5398432785011614\nF1 score weighted 0.7684447101463885\n[[ 5906    18    96   237     4    24]\n [   18  1987   160   657    13    21]\n [ 1215   113 23479  4502    37    81]\n [  579   368  5012 13850   105    83]\n [    6     4    68   158    29     0]\n [    4     4     7    12     1     2]]\n\nnum_leave 64\nF1 score micro 0.7725790010193679\nF1 score macro 0.5381640749369746\nF1 score weighted 0.7742446783031189\n[[ 5869    16   101   236     7    56]\n [   15  2028   156   624     4    29]\n [ 1210   112 23492  4359    43   211]\n [  577   336  4735 14062    83   204]\n [    5     5    65   165    23     2]\n [    4     2     8    15     1     0]]\n\nnum_leaves 120\nF1 score micro 0.7769113149847094\nF1 score macro 0.5471793296072588\nF1 score weighted 0.7777244168599494\n[[ 5833    14   124   261     9    44]\n [   16  2052   166   596     1    25]\n [ 1204    96 23615  4339    30   143]\n [  570   313  4698

# Saving results

In [23]:

# Predict the class of every test row.
pred_y = model.predict(X_test)

# Write the submission file: one row per test sample, with the positional index
# exported as the 'Id' column and the predicted label as 'change_type'.
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv("data/LIGHTGBM.csv", index_label='Id', index=True)


In [24]:

# Per-class probabilities for the test rows, stored on disk for later reuse.
# np.save appends the ".npy" extension, so the file is data/PROBS_LIGHTGBM.npy.
pred_probs = model.predict_proba(X_test)
np.save(file="data/PROBS_LIGHTGBM", arr=pred_probs)
