In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
# load data
# train_data = pd.read_csv('../data/preprocessed_train.csv')
# test_data = pd.read_csv('../data/preprocessed_test.csv')

# train_data = pd.read_csv('../data/preprocessed_train_CorrDrop.csv')
# test_data = pd.read_csv('../data/preprocessed_test_CorrDrop.csv')

train_data = pd.read_csv('../data/preprocessed_train_KNNim.csv')
test_data = pd.read_csv('../data/preprocessed_test_KNNim.csv')

In [4]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

In [5]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Drop the 'Target' column from the training data
train_data = train_data.drop('Target', axis=1)

In [6]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
low_class = train_data[train_labels == 0]
medium_class = train_data[train_labels == 1]
high_class = train_data[train_labels == 2]

# Get the number of samples in each class
low_class_count = len(low_class)
medium_class_count = len(medium_class)
high_class_count = len(high_class)

# Set the number of samples to be selected from each class
num_samples = min(low_class_count, medium_class_count, high_class_count)
num_samples_m = int(num_samples*1.0)

# Randomly sample data from each class
low_class_sample = low_class.sample(n=num_samples, random_state=seed)
medium_class_sample = medium_class.sample(n=num_samples_m, random_state=seed)
high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# Concatenate the sampled data
train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# Separate the features and target variable
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Display the count of unique values in the target variable
print(y_sampled.value_counts())

Target
0    22514
1    22514
2    22514
Name: count, dtype: int64


In [6]:
# Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.1, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.1, random_state=seed)
# X_train, y_train = X_sampled, y_sampled
# X_valid, y_valid = X_sampled, y_sampled
X_train, y_train = train_data, train_labels
X_valid, y_valid = train_data, train_labels

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")

X_train shape: (112569, 20)
X_valid shape: (112569, 20)
y_train shape: (112569,)
y_valid shape: (112569,)


## Random Forest

In [8]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=5,
    max_depth=10,
    bootstrap=False,
    random_state=seed,
    n_jobs=-1, 
    class_weight='balanced'
)

# Fit the model
rf.fit(X_train, y_train)

# Accuracy and F1 score on the training set
train_preds = rf.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')

# Accuracy and F1 score on the validation set
valid_preds = rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, valid_preds)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Display the accuracy and F1 score
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")

Training Accuracy: 0.5428
Training F1 Score: 0.5001
Validation Accuracy: 0.5428
Validation F1 Score: 0.5001


In [9]:
# faeture importance
feature_importances = rf.feature_importances_
feature_importances = pd.Series(feature_importances, index=X_train.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Display 
print(feature_importances)

FieldEstablishedYear         0.147896
RawLocationId                0.096179
Longitude                    0.092435
TotalCultivatedAreaSqft      0.092296
TotalTaxAssessed             0.080653
Latitude                     0.076342
CultivatedAreaSqft1          0.072891
TotalValue                   0.060725
TaxAgrarianValue             0.055471
TaxLandValue                 0.052411
AgricultureZoningCode        0.029981
AgriculturalPostalZone       0.029967
WaterAccessPoints            0.023298
WaterAccessPointsCalc        0.020809
LandUsageType                0.017668
CropSpeciesVariety           0.014238
NationalRegionCode           0.011700
MainIrrigationSystemCount    0.010657
StorageAndFacilityCount      0.008449
ValuationYear                0.005934
dtype: float64


In [7]:
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=5,
    max_depth=10,
    bootstrap=False,
    random_state=seed,
    n_jobs=-1, 
    class_weight='balanced'
)

f1 = []
acc = []

for (train, test), i  in zip(cv.split(X_train, y_train), range(n_splits)):
    rf.fit(X_train.iloc[train], y_train.iloc[train])

    train_preds = rf.predict(X_train.iloc[train])
    valid_preds = rf.predict(X_train.iloc[test])

    train_accuracy = accuracy_score(y_train.iloc[train], train_preds)
    train_f1 = f1_score(y_train.iloc[train], train_preds, average='macro')

    valid_accuracy = accuracy_score(y_train.iloc[test], valid_preds)
    valid_f1 = f1_score(y_train.iloc[test], valid_preds, average='macro')

    f1.append(valid_f1)
    acc.append(valid_accuracy)

    print(f"Fold {i + 1}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Training F1 Score: {train_f1:.4f}")
    print(f"Validation Accuracy: {valid_accuracy:.4f}")
    print(f"Validation F1 Score: {valid_f1:.4f}") 

print(f"Mean F1 Score: {np.mean(f1):.4f}")
print(f"Mean Accuracy: {np.mean(acc):.4f}")

Fold 1
Training Accuracy: 0.5371
Training F1 Score: 0.4993
Validation Accuracy: 0.4789
Validation F1 Score: 0.4256
Fold 2
Training Accuracy: 0.5438
Training F1 Score: 0.5034
Validation Accuracy: 0.4721
Validation F1 Score: 0.4185
Fold 3
Training Accuracy: 0.5456
Training F1 Score: 0.5043
Validation Accuracy: 0.4817
Validation F1 Score: 0.4230
Fold 4
Training Accuracy: 0.5467
Training F1 Score: 0.5047
Validation Accuracy: 0.4833
Validation F1 Score: 0.4278
Fold 5
Training Accuracy: 0.5467
Training F1 Score: 0.5053
Validation Accuracy: 0.4812
Validation F1 Score: 0.4237
Fold 6
Training Accuracy: 0.5429
Training F1 Score: 0.5028
Validation Accuracy: 0.4909
Validation F1 Score: 0.4380
Fold 7
Training Accuracy: 0.5466
Training F1 Score: 0.5032
Validation Accuracy: 0.4848
Validation F1 Score: 0.4293
Fold 8
Training Accuracy: 0.5440
Training F1 Score: 0.5038
Validation Accuracy: 0.4807
Validation F1 Score: 0.4246
Fold 9
Training Accuracy: 0.5473
Training F1 Score: 0.5052
Validation Accuracy: 

In [10]:
# make predictinos on test data
test_preds = rf.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('../data/output/randomForest.csv', index=False)

## Hyperparameter Optimization

In [8]:
params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15],
}

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=seed, n_jobs=-1)

# Initialize the Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Display the best parameters
print(best_params)

{'max_depth': None, 'n_estimators': 100}


In [9]:
params = {
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [5, 'auto', 'sqrt', 'log2']
}

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    random_state=seed,
    n_jobs=-1
)

# Initialize the Grid Search
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Display the best parameters
print(best_params)

45 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
22 fits failed with the following error:
Traceback (most recent call last):
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/export/home/darpan/anaconda3/envs/env_1/lib/python3.11/site-packag

{'max_features': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [10]:
best_params = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 5
}

In [11]:
# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=seed,
    n_jobs=-1
)

# Fit the model
rf.fit(X_train, y_train)

# Accuracy and F1 score on the training set
train_preds = rf.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')

# Accuracy and F1 score on the validation set
valid_preds = rf.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, valid_preds)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Display the accuracy and F1 score
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")

Training Accuracy: 1.0000
Training F1 Score: 1.0000
Validation Accuracy: 1.0000
Validation F1 Score: 1.0000


In [15]:
# make predictinos on test data
test_preds = rf.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('../data/output/randomForest.csv', index=False)

In [16]:
# unique values in test predictions
print(test_preds.value_counts())

medium    9275
low       3496
high      3150
Name: count, dtype: int64
