In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
# from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
# train_data = pd.read_csv('../data/preprocessed_train.csv')
# test_data = pd.read_csv('../data/preprocessed_test.csv')

# train_data = pd.read_csv('../data/preprocessed_train_CorrDrop.csv')
# test_data = pd.read_csv('../data/preprocessed_test_CorrDrop.csv')

train_data = pd.read_csv('../data/preprocessed_train_KNNim.csv')
test_data = pd.read_csv('../data/preprocessed_test_KNNim.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# columns with less feature importance
# cols_to_drop = ['AgricultureZoningCode', 'CropSpeciesVariety', 'LandUsageType',
#        'ValuationYear', 'StorageAndFacilityCount', 'WaterAccessPoints',
#        'MainIrrigationSystemCount', 'WaterAccessPointsCalc',
#        'NationalRegionCode']

# cols_to_drop = ['MainIrrigationSystemCount',
#                 'NationalRegionCode',
#                 'ValuationYear',
#                 'StorageAndFacilityCount',
#                 'LandUsageType',
#                 'AgricultureZoningCode']


# train_data.drop(cols_to_drop, axis=1, inplace=True)
# test_data.drop(cols_to_drop, axis=1, inplace=True)

In [6]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [7]:
# apply pca to reduce the number of features
# from sklearn.decomposition import PCA

# pca = PCA(n_components=10)
# train_data = pd.DataFrame(pca.fit_transform(train_data), index=train_data.index)
# test_data = pd.DataFrame(pca.transform(test_data), index=test_data.index)

In [8]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
# low_class = train_data[train_labels == 0]
# medium_class = train_data[train_labels == 1]
# high_class = train_data[train_labels == 2]

# # Get the number of samples in each class
# low_class_count = len(low_class)
# medium_class_count = len(medium_class)
# high_class_count = len(high_class)

# # Set the number of samples to be selected from each class
# num_samples = min(low_class_count, medium_class_count, high_class_count)

# # Randomly sample data from each class
# low_class_sample = low_class.sample(n=num_samples, random_state=seed)
# medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
# high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# # Concatenate the sampled data
# train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# # Separate the features and target variable
# X_sampled = train_data_sampled
# y_sampled = train_labels.loc[train_data_sampled.index]

# # Display the count of unique values in the target variable
# print(y_sampled.value_counts())

In [9]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.1, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
# X_train, y_train = X_sampled, y_sampled
# X_valid, y_valid = X_sampled, y_sampled
X_train, y_train = train_data, train_labels
X_valid, y_valid = train_data, train_labels

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (112569, 20)
X_valid shape: (112569, 20)
y_train shape: (112569,)
y_valid shape: (112569,)


In [10]:
train_data.isna().sum()

AgriculturalPostalZone     0
CropSpeciesVariety         0
CultivatedAreaSqft1        0
FieldEstablishedYear       0
Latitude                   0
Longitude                  0
RawLocationId              0
TaxAgrarianValue           0
TaxLandValue               0
TotalCultivatedAreaSqft    0
TotalTaxAssessed           0
TotalValue                 0
WaterAccessPoints          0
WaterAccessPointsCalc      0
dtype: int64

In [11]:
y_train

UID
12998     2
20860     1
75725     1
106521    0
99467     1
         ..
26998     1
135304    1
153756    2
129907    1
103354    0
Name: Target, Length: 112569, dtype: int64

## LightGBM

In [12]:
# initialize the models
lgbm = LGBMClassifier(
    random_state=seed,
    class_weight='balanced',
)

# fit the model
lgbm.fit(X_train, y_train)

# make predictions
train_preds = lgbm.predict(X_train)
valid_preds = lgbm.predict(X_valid)

# calculate the f1 score
train_f1 = f1_score(y_train, train_preds, average='macro')
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# calculate the accuracy
train_accuracy = accuracy_score(y_train, train_preds)
valid_accuracy = accuracy_score(y_valid, valid_preds)

# display the f1 score and accuracy
print(f"Train F1 Score: {train_f1}")
print(f"Valid F1 Score: {valid_f1}")
print(f"Train Accuracy: {train_accuracy}")
print(f"Valid Accuracy: {valid_accuracy}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003769 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2795
[LightGBM] [Info] Number of data points in the train set: 112569, number of used features: 14
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Train F1 Score: 0.4784272694446421
Valid F1 Score: 0.4784272694446421
Train Accuracy: 0.5228437669340582
Valid Accuracy: 0.5228437669340582


In [13]:
# make feature importance dataframe
feature_importance = pd.DataFrame(lgbm.feature_importances_, index=X_train.columns, columns=['importance'])
feature_importance = feature_importance.sort_values('importance', ascending=False)

# display the dataframe
print(feature_importance)

                         importance
Latitude                       1063
Longitude                      1006
FieldEstablishedYear            875
TaxAgrarianValue                834
AgriculturalPostalZone          813
TotalTaxAssessed                760
RawLocationId                   695
CultivatedAreaSqft1             651
TaxLandValue                    650
TotalCultivatedAreaSqft         619
TotalValue                      570
CropSpeciesVariety              240
WaterAccessPoints               152
WaterAccessPointsCalc            72


## Cross Validation

In [10]:
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Initialize the LightGBM model
lgbm = LGBMClassifier(
    random_state=seed,
    class_weight='balanced',
    n_estimators=300,
    subsample=0.8,
    reg_lambda=0.1,
    reg_alpha=0.1,
    learning_rate=0.1,
    num_leaves=300,
    max_depth=10,
    min_child_samples=50,
    colsample_bytree=0.8,
)

f1 = []
accuracy = []

for (train, test), i  in zip(cv.split(X_train, y_train), range(n_splits)):
    lgbm.fit(X_train.iloc[train], y_train.iloc[train])

    train_preds = lgbm.predict(X_train.iloc[train])
    valid_preds = lgbm.predict(X_train.iloc[test])

    train_accuracy = accuracy_score(y_train.iloc[train], train_preds)
    train_f1 = f1_score(y_train.iloc[train], train_preds, average='macro')

    valid_accuracy = accuracy_score(y_train.iloc[test], valid_preds)
    valid_f1 = f1_score(y_train.iloc[test], valid_preds, average='macro')

    f1.append(valid_f1)
    accuracy.append(valid_accuracy)

    print(f"Fold {i + 1}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Training F1 Score: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}")
    print(f"Validation F1 Score: {valid_f1:.4f}") 

print(f"Average F1 Score: {np.mean(f1):.4f}")
print(f"Average Accuracy: {np.mean(accuracy):.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004283 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2900
[LightGBM] [Info] Number of data points in the train set: 101312, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
Fold 1
Training Accuracy: 0.7822
Training F1 Score: 0.7649
Validation Accuracy: 0.5008
Validation F1 Score: 0.4336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2906
[LightGBM] [Info] Number of data points in the train set: 101312, number of used features: 20
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start t

In [12]:
# make predictinos on test data
test_preds = lgbm.predict(test_data)

# reshape to 1D array
# test_preds = test_preds.ravel()

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# submission.to_csv('../data/output/lightgbm.csv', index=False)

## Hyperparameter Tuning