In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
train_data=pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropFieldConfiguration,CropSpeciesVariety,CultivatedAndWildArea,CultivatedAreaSqft1,DistrictId,FarmClassification,FarmEquipmentArea,...,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,UndergroundStorageSqft,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,,3.0,,1136.0,1.0,,,...,8636.716,456255.6,118.0,,,2018.0,2.0,2.0,,high
1,20860,164397,28.0,,4.0,,2083.0,1.0,,,...,18464.292,996887.6,24.0,1.0,,2018.0,3.0,3.0,1.0,medium
2,75725,616532,0.0,,2.0,,922.0,1.0,,,...,15594.568,1043780.0,9.0,1.0,,2018.0,1.0,1.0,,medium
3,106521,942111,43.0,,7.0,,,1.0,,,...,8494.618,435734.8,114.0,,,2020.0,3.0,3.0,,low
4,99467,475557,38.0,,3.0,,2225.0,3.0,,0.0,...,13517.284,885400.0,6.0,,,2020.0,4.0,4.0,,medium


In [4]:
test_data=pd.read_csv("test.csv")

In [5]:
# Count missing values for each column
missing_values = train_data.isnull().sum()

# Create a DataFrame to store the count of missing values
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'MissingCount': missing_values.values
})

# Add a column to show the percentage of missing values
missing_df['MissingPercentage'] = (missing_df['MissingCount'] / len(train_data)) * 100

# Sort the DataFrame by the number of missing values in descending order
missing_df.sort_values(by='MissingCount', ascending=False, inplace=True)

# Reset index for readability
missing_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(missing_df)

                          Column  MissingCount  MissingPercentage
0             FarmClassification        112552          99.984898
1       PerimeterGuardPlantsArea        112525          99.960913
2         UndergroundStorageSqft        112512          99.949364
3                 FieldZoneLevel        112512          99.949364
4             HarvestStorageSqft        112457          99.900505
5                  HasGreenHouse        112305          99.765477
6         CropFieldConfiguration        112274          99.737939
7          FieldConstructionType        112239          99.706846
8          CultivatedAndWildArea        112027          99.518518
9                FieldShadeCover        111701          99.228917
10                 ReservoirType        111477          99.029928
11            TotalReservoirSize        111332          98.901118
12           ReservoirWithFilter        111032          98.634615
13                HasPestControl        109940          97.664544
14        

- null rate for dropping set to > 60%.
- Try 30% and 10%

In [6]:
# Drop columns with missing percentage greater than 60%
columns_to_drop = missing_df[missing_df['MissingPercentage'] > 10]['Column'].tolist()

# Drop the identified columns from the DataFrame
train_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the updated DataFrame shape after dropping columns
print(f"Updated shape of the DataFrame: {train_data.shape}")

Updated shape of the DataFrame: (112569, 24)


In [7]:
# drop same columns from test data
test_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the updated DataFrame shape after dropping columns
print(f"Updated shape of the DataFrame: {test_data.shape}")

Updated shape of the DataFrame: (15921, 23)


In [8]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Display the updated DataFrame to confirm the change
# print(train_data.head())
train_data.head()

Unnamed: 0_level_0,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FieldEstablishedYear,LandUsageType,Latitude,Longitude,MainIrrigationSystemCount,...,TaxAgrarianValue,TaxLandValue,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,Target
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12998,291674,0.0,3.0,1136.0,1.0,1926.0,1.0,24660180.0,78507230.0,2.0,...,81652.8,374602.8,1136.0,8636.716,456255.6,118.0,2018.0,2.0,2.0,high
20860,164397,28.0,4.0,2083.0,1.0,1981.0,1.0,24367940.0,79118950.0,3.0,...,323700.8,673186.8,2083.0,18464.292,996887.6,24.0,2018.0,3.0,3.0,medium
75725,616532,0.0,2.0,922.0,1.0,1931.0,1.0,24557210.0,78642650.0,1.0,...,87440.0,956340.0,922.0,15594.568,1043780.0,9.0,2018.0,1.0,1.0,medium
106521,942111,43.0,7.0,,1.0,1964.0,8.0,24339310.0,78684070.0,3.0,...,134075.2,301659.6,3202.0,8494.618,435734.8,114.0,2020.0,3.0,3.0,low
99467,475557,38.0,3.0,2225.0,3.0,2009.0,2.0,24565920.0,77704470.0,4.0,...,144000.0,741400.0,2225.0,13517.284,885400.0,6.0,2020.0,4.0,4.0,medium


In [9]:
# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

# Display the updated DataFrame to confirm the change
# print(test_data.head())
test_data.head()

Unnamed: 0_level_0,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FieldEstablishedYear,LandUsageType,Latitude,Longitude,MainIrrigationSystemCount,...,StorageAndFacilityCount,TaxAgrarianValue,TaxLandValue,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130000,475712,0.0,5.0,2870.0,1.0,2009.0,1.0,25030830.0,78693660.0,3.0,...,0.0,166216.0,153157.4,2870.0,9540.432,319373.4,52.0,2020.0,3.0,3.0
129101,101762,46.0,3.0,1291.0,3.0,1975.0,1.0,24649480.0,77769450.0,2.0,...,5.0,132000.0,673200.0,1291.0,11064.284,805200.0,47.0,2020.0,2.0,2.0
147876,309344,19.0,2.0,1074.0,1.0,1970.0,2.0,24162570.0,78802340.0,2.0,...,0.0,98530.4,207336.8,1074.0,5789.762,305867.2,14.0,2020.0,2.0,2.0
122624,689775,19.0,3.0,1595.0,1.0,1979.0,2.0,24376550.0,78886890.0,2.0,...,0.0,135032.8,389565.0,1595.0,9440.486,524597.8,,2020.0,2.0,2.0
159920,445333,20.0,1.0,768.0,2.0,1985.0,2.0,24024290.0,79113690.0,1.0,...,0.0,59100.8,633872.8,768.0,8384.64,692973.6,10.0,2020.0,1.0,1.0


In [10]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Display the first few rows of the labels to verify the mapping
print(train_labels.head())

UID
12998     2
20860     1
75725     1
106521    0
99467     1
Name: Target, dtype: int64


In [11]:
train_data = train_data.drop(columns=['TownId','Target','DistrictId'])

test_data = test_data.drop(columns=['TownId','DistrictId'])

In [12]:
def fill_missing_values(df):
    # Define the columns based on their type
    categorical_columns = [
        'HarvestProcessingType', 'SoilFertilityType', 'AgricultureZoningCode',
        'ValuationYear', 'NationalRegionCode', 'StorageAndFacilityCount', 'RawLocationId',
        'LandUsageType', 'CropSpeciesVariety', 'AgriculturalPostalZone'
    ]
    
    median_columns = [
        'FarmingUnitCount', 'FieldSizeSqft', 'CultivatedAreaSqft1', 'MainIrrigationSystemCount',
        'FieldEstablishedYear', 'TotalTaxAssessed', 'TaxLandValue', 'TotalCultivatedAreaSqft',
        'WaterAccessPoints', 'TaxAgrarianValue', 'TotalValue'
    ]
    
    mean_columns = [
        'WaterAccessPointsCalc', 'Longitude', 'Latitude'
    ]
    
    # Convert categorical columns to 'object' type if necessary
    for column in categorical_columns:
        if column in df.columns:
            df[column] = df[column].astype('object')

    # Fill missing values for categorical columns using mode
    for column in categorical_columns:
        if column in df.columns:
            if df[column].isnull().sum() > 0:
                try:
                    mode_value = df[column].mode(dropna=True)[0] if not df[column].mode().empty else None
                    if mode_value is not None:
                        df[column].fillna(mode_value, inplace=True)
                    else:
                        print(f"Warning: Could not find a mode for column {column}")
                except Exception as e:
                    print(f"Error while filling mode for column {column}: {e}")
    
    # Fill missing values for numerical columns using median
    for column in median_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                median_value = df[column].median()
                df[column].fillna(median_value, inplace=True)
    
    # Fill missing values for numerical columns using mean
    for column in mean_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                mean_value = df[column].mean()
                df[column].fillna(mean_value, inplace=True)
    
    return df

# Fill missing values in the training data
train_data = fill_missing_values(train_data)

# Check if there are still missing values
missing_values = train_data.isnull().sum()
print("Missing values after filling:\n", missing_values[missing_values > 0])


Missing values after filling:
 Series([], dtype: int64)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [13]:
# fill missing values in test data
test_data = fill_missing_values(test_data)

# Check if there are still missing values
missing_values = test_data.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [14]:
# convert Agriculturepostalzone to int
train_data['AgriculturalPostalZone'] = train_data['AgriculturalPostalZone'].astype(int)
test_data['AgriculturalPostalZone'] = test_data['AgriculturalPostalZone'].astype(int)

## SMOTE

In [15]:
# sampling imbalance class with SMOTE 
from imblearn.over_sampling import SMOTE
import collections

counter = collections.Counter(train_labels)
print(f"Before SMOTE: {counter}")
smote = SMOTE(sampling_strategy='auto', random_state=seed)

train_data, train_labels = smote.fit_resample(train_data, train_labels)
counter = collections.Counter(train_labels)
print(f"After SMOTE: {counter}")


Before SMOTE: Counter({1: 67541, 2: 22514, 0: 22514})
After SMOTE: Counter({2: 67541, 1: 67541, 0: 67541})


In [16]:
# get feature importance using random forest
from sklearn.ensemble import RandomForestClassifier

# Separate the features and target variable
X = train_data
y = train_labels

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=seed)

# Fit the model
rf.fit(X, y)

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame to store the feature importances
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by feature importance in descending order
feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Reset index for readability
feature_importances_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(feature_importances_df)

                      Feature  Importance
0      AgriculturalPostalZone    0.073093
1                   Longitude    0.071440
2                    Latitude    0.070229
3            TotalTaxAssessed    0.068707
4            TaxAgrarianValue    0.067848
5        FieldEstablishedYear    0.066602
6                TaxLandValue    0.066412
7                  TotalValue    0.066357
8     TotalCultivatedAreaSqft    0.060654
9         CultivatedAreaSqft1    0.059108
10              ValuationYear    0.055788
11              RawLocationId    0.054944
12         CropSpeciesVariety    0.049404
13  MainIrrigationSystemCount    0.032930
14      WaterAccessPointsCalc    0.032868
15          WaterAccessPoints    0.031775
16      AgricultureZoningCode    0.030641
17              LandUsageType    0.024751
18    StorageAndFacilityCount    0.013959
19         NationalRegionCode    0.002489


In [17]:
# drop columns with importance less than 0.01
columns_to_drop = feature_importances_df[feature_importances_df['Importance'] < 0.05]['Feature'].tolist()

# Drop the identified columns from the DataFrame
train_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Drop the identified columns from test data
test_data.drop(columns=columns_to_drop, axis=1, inplace=True)

In [18]:
# print count of unique values in y
print(y.value_counts())

Target
2    67541
1    67541
0    67541
Name: count, dtype: int64


In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 202623 entries, 0 to 202622
Data columns (total 12 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   AgriculturalPostalZone   202623 non-null  int64  
 1   CultivatedAreaSqft1      202623 non-null  float64
 2   FieldEstablishedYear     202623 non-null  float64
 3   Latitude                 202623 non-null  float64
 4   Longitude                202623 non-null  float64
 5   RawLocationId            202623 non-null  float64
 6   TaxAgrarianValue         202623 non-null  float64
 7   TaxLandValue             202623 non-null  float64
 8   TotalCultivatedAreaSqft  202623 non-null  float64
 9   TotalTaxAssessed         202623 non-null  float64
 10  TotalValue               202623 non-null  float64
 11  ValuationYear            202623 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 18.6 MB


In [None]:
# add uid to train and test data and save cleaned data
# train_data['UID'] = train_data.index
# test_data['UID'] = test_data.index

# train_data['Target'] = train_labels

# train_data.to_csv("cleaned_train.csv", index=False)
# test_data.to_csv("cleaned_test.csv", index=False)

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardize the features before training
scaler = StandardScaler()
train_data = pd.DataFrame(scaler.fit_transform(train_data), columns=train_data.columns, index=train_data.index)
test_data = pd.DataFrame(scaler.transform(test_data), columns=test_data.columns, index=test_data.index)

# train_data_norm = scaler.fit_transform(train_data)
# test_data_norm = scaler.fit_transform(test_data)

In [None]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
low_class = train_data[train_labels == 0]
medium_class = train_data[train_labels == 1]
high_class = train_data[train_labels == 2]

# Get the number of samples in each class
low_class_count = len(low_class)
medium_class_count = len(medium_class)
high_class_count = len(high_class)

# Set the number of samples to be selected from each class
num_samples = min(low_class_count, medium_class_count, high_class_count)

# Randomly sample data from each class
low_class_sample = low_class.sample(n=num_samples, random_state=seed)
medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# Concatenate the sampled data
train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# Separate the features and target variable
X_sampled = train_data_sampled
y_sampled = train_labels.loc[train_data_sampled.index]

# Display the count of unique values in the target variable
print(y_sampled.value_counts())

In [21]:
# do data split
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.05, random_state=seed)
# X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=seed)

# Display the shapes of the training and validation sets
print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_valid shape: {y_valid.shape}")


X_train shape: (192491, 12)
X_valid shape: (10132, 12)
y_train shape: (192491,)
y_valid shape: (10132,)


In [22]:
# train random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=seed)

# Fit the model
rf_model.fit(X_train, y_train)

# Accuracy and F1 score on the training set
train_preds = rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')

# Accuracy and F1 score on the validation set
valid_preds = rf_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, valid_preds)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Display the accuracy and F1 score
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")

Training Accuracy: 1.0000
Training F1 Score: 1.0000
Validation Accuracy: 0.7138
Validation F1 Score: 0.7137


In [23]:
# make predictinos on test data
test_preds = rf_model.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('submission_10.csv', index=False)

## Random Forest Full

In [None]:
# random forest on full data
# Initialize the Random Forest Classifier
rf_model_full = RandomForestClassifier(n_estimators=100, random_state=seed)

# Fit the model
rf_model_full.fit(train_data, train_labels)

# Make predictions on the test set
test_preds = rf_model_full.predict(test_data)

# convert predictions to target values
target_mapping_inv = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping_inv)

# Create a DataFrame with the 'UID' column and the predictions
submission_df = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# Save the DataFrame to a CSV file
submission_df.to_csv('submission_4.csv', index=False)

## Naive Bayes

In [None]:
# use naive bayes
from sklearn.naive_bayes import GaussianNB

# Initialize the Gaussian Naive Bayes Classifier
nb_model = GaussianNB()

# Fit the model
nb_model.fit(X_train, y_train)

# Accuracy and F1 score on the training set
train_preds = nb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
train_f1 = f1_score(y_train, train_preds, average='macro')

# Accuracy and F1 score on the validation set
valid_preds = nb_model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, valid_preds)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Display the accuracy and F1 score
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")

In [None]:
# get the predictions on test data
test_preds = nb_model.predict(test_data)

# convert predictions to target values
target_mapping_inv = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping_inv)

# Create a DataFrame with the 'UID' column and the predictions
submission_df = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# Save the DataFrame to a CSV file
submission_df.to_csv('submission_5.csv', index=False)

## ensemble models

In [None]:
# rf_clf = RandomForestClassifier(n_estimators=100, random_state=seed)
# gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=seed)
# ada_clf = AdaBoostClassifier(n_estimators=100, random_state=seed)

# # create a voting classifier with soft voting
# ensemble_model = VotingClassifier(
#     estimators=[
#         ('rf', rf_clf),
#         ('gb', gb_clf),
#         ('ada', ada_clf)
#     ],
#     voting='soft'
# )

# # Fit the model
# ensemble_model.fit(X_train, y_train)

# # Make predictions on the validation set
# valid_preds = ensemble_model.predict(X_valid)

# # Calculate the accuracy of the model
# accuracy = accuracy_score(y_valid, valid_preds)

# # Calculate the F1 score of the model
# f1 = f1_score(y_valid, valid_preds, average='macro')
# print(f"F1 Score: {f1}")

# # Display the accuracy of the model
# print(f"Accuracy: {accuracy}")

## Ensemble 2

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

# Initialize classifiers with a random seed
seed = 42
rf_clf = RandomForestClassifier(n_estimators=100, random_state=seed)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=seed)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=seed)
log_reg_clf = LogisticRegression(max_iter=1000, random_state=seed)
svc_clf = SVC(kernel='rbf', probability=True, random_state=seed)  # SVM with soft voting
nb_clf = GaussianNB()

# Create a voting classifier with soft voting
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('gb', gb_clf),
        ('ada', ada_clf),
        ('log_reg', log_reg_clf),
        ('svc', svc_clf),
        ('nb', nb_clf)
    ],
    voting='soft'
)

# Train the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluate on the training set
train_preds = ensemble_model.predict(X_train)
train_f1 = f1_score(y_train, train_preds, average='macro')

# Evaluate on the validation set
valid_preds = ensemble_model.predict(X_valid)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')

# Display the F1 scores
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")


In [None]:
# make predictinos on test data
test_preds = ensemble_model.predict(test_data)

In [None]:
# print count of unique values in predictions
print(np.unique(test_preds, return_counts=True))

In [None]:
# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

In [None]:
# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('submission_6.csv', index=False)

## XGBoost

In [24]:
# Use XGBoost

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, random_state=seed)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions on the training set
train_preds = xgb_model.predict(X_train)
train_f1 = f1_score(y_train, train_preds, average='macro')
train_accuracy = accuracy_score(y_train, train_preds)

# Make predictions on the validation set
valid_preds = xgb_model.predict(X_valid)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')
valid_accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 scores
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")

Training F1 Score: 0.6326
Validation F1 Score: 0.5792
Training Accuracy: 0.6416
Validation Accuracy: 0.5914


In [25]:
# ver_1 xgboost
xgb_v1 = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    n_estimators=200,
    max_depth=2,
    random_state=seed
) 

# Fit the model
xgb_v1.fit(X_train, y_train)

# get accuracy and f1 score on training set and validation set
train_preds = xgb_v1.predict(X_train)
train_f1 = f1_score(y_train, train_preds, average='macro')
train_accuracy = accuracy_score(y_train, train_preds)

valid_preds = xgb_v1.predict(X_valid)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')
valid_accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 scores
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")



Training F1 Score: 0.5159
Validation F1 Score: 0.5166
Training Accuracy: 0.5322
Validation Accuracy: 0.5332


## Grid Search

In [26]:
cv_params = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.1, 0.3, 0.5],
}

csv = GridSearchCV(
    estimator=XGBClassifier(random_state=seed),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

{'learning_rate': 0.5, 'n_estimators': 400}


In [30]:
cv_params = {
    'max_depth': [ 2, 3, 4, 5, 6, 7],
    'min_child_weight': [1, 2, 3, 4]
}

fixed_params = {
    'n_estimators': 400,
    'learning_rate': 0.5, 
    'random_state': seed
}

csv = GridSearchCV(
    estimator=XGBClassifier(**fixed_params),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

{'max_depth': 7, 'min_child_weight': 1}


In [31]:
cv_params = {
    'subsample': [0.6, 0.8, 0,9, 1.0],
    'max_delta_step': [0, 1, 2, 3, 4]
}

fixed_params = {
    'n_estimators': 400,
    'learning_rate': 0.5,
    'max_depth': 7,
    'min_child_weight': 1,
    'random_state': seed
}

csv = GridSearchCV(
    estimator=XGBClassifier(**fixed_params),
    param_grid=cv_params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1
)

csv.fit(X_train, y_train)

# Get the best parameters
best_params = csv.best_params_

# Display the best parameters
print(best_params)

25 fits failed out of a total of 125.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/home/darpan/Documents/.venv/test/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/darpan/Documents/.venv/test/lib/python3.11/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/darpan/Documents/.venv/test/lib/python3.11/site-packages/xgboost/sklearn.py", line 1531, in fit
    self._Booster = train(
                    ^^^^^^
  File "/home/darpan/Documents/.venv/test/lib/python3.11/site-package

{'max_delta_step': 1, 'subsample': 0.8}


In [32]:
final_params = {
    'n_estimators': 400,
    'learning_rate': 0.5,
    'max_depth': 6,
    'min_child_weight': 3,
    'subsample': 0.8,
    'max_delta_step': 1,
    'random_state': seed
}

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(**final_params)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions on the training set
train_preds = xgb_model.predict(X_train)
train_f1 = f1_score(y_train, train_preds, average='macro')
train_accuracy = accuracy_score(y_train, train_preds)

# Make predictions on the validation set
valid_preds = xgb_model.predict(X_valid)
valid_f1 = f1_score(y_valid, valid_preds, average='macro')
valid_accuracy = accuracy_score(y_valid, valid_preds)

# Display the F1 scores
print(f"Training F1 Score: {train_f1:.4f}")
print(f"Validation F1 Score: {valid_f1:.4f}")
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Validation Accuracy: {valid_accuracy:.4f}")

Training F1 Score: 0.8273
Validation F1 Score: 0.6585
Training Accuracy: 0.8278
Validation Accuracy: 0.6619


In [35]:
# make predictinos on test data
test_preds = xgb_model.predict(test_data)

# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('submission_11.csv', index=False)

In [None]:
# Hyper parameter tuning for XGBoost to get max F1 macro score on validation set 
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

In [None]:
def make_predictions(test_fname, predictions_fname):
    # Load the test data
    test_data = pd.read_csv(test_fname)

    predictions = np.array([random.choice([0, 1, 2]) for _ in range(len(test_data))])

    # map 0 -> low, 1 -> medium, 2 -> high
    predictions = np.array(['low', 'medium', 'high'])[predictions]

    # Save the predictions to CSV file containing UID and Target columns
    pd.DataFrame({
        'UID': test_data['UID'],
        'Target': predictions
    }).to_csv(predictions_fname, index=False)


# make_predictions("test.csv", "predictions.csv")