In [1]:
import numpy as np
import random

import pandas as pd
from scipy import stats

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier

from xgboost import XGBClassifier

In [2]:
# set seed
seed = 42
random.seed(seed)
np.random.seed(seed)

In [3]:
train_data=pd.read_csv("train.csv")
train_data.head()

Unnamed: 0,UID,AgriculturalPostalZone,AgricultureZoningCode,CropFieldConfiguration,CropSpeciesVariety,CultivatedAndWildArea,CultivatedAreaSqft1,DistrictId,FarmClassification,FarmEquipmentArea,...,TotalTaxAssessed,TotalValue,TownId,TypeOfIrrigationSystem,UndergroundStorageSqft,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,WaterReservoirCount,Target
0,12998,291674,0.0,,3.0,,1136.0,1.0,,,...,8636.716,456255.6,118.0,,,2018.0,2.0,2.0,,high
1,20860,164397,28.0,,4.0,,2083.0,1.0,,,...,18464.292,996887.6,24.0,1.0,,2018.0,3.0,3.0,1.0,medium
2,75725,616532,0.0,,2.0,,922.0,1.0,,,...,15594.568,1043780.0,9.0,1.0,,2018.0,1.0,1.0,,medium
3,106521,942111,43.0,,7.0,,,1.0,,,...,8494.618,435734.8,114.0,,,2020.0,3.0,3.0,,low
4,99467,475557,38.0,,3.0,,2225.0,3.0,,0.0,...,13517.284,885400.0,6.0,,,2020.0,4.0,4.0,,medium


In [4]:
test_data=pd.read_csv("test.csv")

In [5]:
# Count missing values for each column
missing_values = train_data.isnull().sum()

# Create a DataFrame to store the count of missing values
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'MissingCount': missing_values.values
})

# Add a column to show the percentage of missing values
missing_df['MissingPercentage'] = (missing_df['MissingCount'] / len(train_data)) * 100

# Sort the DataFrame by the number of missing values in descending order
missing_df.sort_values(by='MissingCount', ascending=False, inplace=True)

# Reset index for readability
missing_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(missing_df)

                          Column  MissingCount  MissingPercentage
0             FarmClassification        112552          99.984898
1       PerimeterGuardPlantsArea        112525          99.960913
2         UndergroundStorageSqft        112512          99.949364
3                 FieldZoneLevel        112512          99.949364
4             HarvestStorageSqft        112457          99.900505
5                  HasGreenHouse        112305          99.765477
6         CropFieldConfiguration        112274          99.737939
7          FieldConstructionType        112239          99.706846
8          CultivatedAndWildArea        112027          99.518518
9                FieldShadeCover        111701          99.228917
10                 ReservoirType        111477          99.029928
11            TotalReservoirSize        111332          98.901118
12           ReservoirWithFilter        111032          98.634615
13                HasPestControl        109940          97.664544
14        

- null rate for dropping set to > 60%.
- Try 30% and 10%

In [6]:
# Drop columns with missing percentage greater than 60%
columns_to_drop = missing_df[missing_df['MissingPercentage'] > 60]['Column'].tolist()

# Drop the identified columns from the DataFrame
train_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the updated DataFrame shape after dropping columns
print(f"Updated shape of the DataFrame: {train_data.shape}")

Updated shape of the DataFrame: (112569, 28)


In [7]:
# drop same columns from test data
test_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Display the updated DataFrame shape after dropping columns
print(f"Updated shape of the DataFrame: {test_data.shape}")

Updated shape of the DataFrame: (15921, 27)


In [8]:
# Set the 'UID' column as the index
train_data.set_index('UID', inplace=True)

# Display the updated DataFrame to confirm the change
# print(train_data.head())
train_data.head()

Unnamed: 0_level_0,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FarmingUnitCount,FieldEstablishedYear,FieldSizeSqft,HarvestProcessingType,LandUsageType,...,TaxAgrarianValue,TaxLandValue,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc,Target
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12998,291674,0.0,3.0,1136.0,1.0,1.0,1926.0,6265.0,2.0,1.0,...,81652.8,374602.8,1136.0,8636.716,456255.6,118.0,2018.0,2.0,2.0,high
20860,164397,28.0,4.0,2083.0,1.0,1.0,1981.0,10252.0,1.0,1.0,...,323700.8,673186.8,2083.0,18464.292,996887.6,24.0,2018.0,3.0,3.0,medium
75725,616532,0.0,2.0,922.0,1.0,1.0,1931.0,4771.0,1.0,1.0,...,87440.0,956340.0,922.0,15594.568,1043780.0,9.0,2018.0,1.0,1.0,medium
106521,942111,43.0,7.0,,1.0,3.0,1964.0,5737.0,,8.0,...,134075.2,301659.6,3202.0,8494.618,435734.8,114.0,2020.0,3.0,3.0,low
99467,475557,38.0,3.0,2225.0,3.0,,2009.0,,,2.0,...,144000.0,741400.0,2225.0,13517.284,885400.0,6.0,2020.0,4.0,4.0,medium


In [9]:
# Set the 'UID' column as the index
test_data.set_index('UID', inplace=True)

# Display the updated DataFrame to confirm the change
# print(test_data.head())
test_data.head()

Unnamed: 0_level_0,AgriculturalPostalZone,AgricultureZoningCode,CropSpeciesVariety,CultivatedAreaSqft1,DistrictId,FarmingUnitCount,FieldEstablishedYear,FieldSizeSqft,HarvestProcessingType,LandUsageType,...,StorageAndFacilityCount,TaxAgrarianValue,TaxLandValue,TotalCultivatedAreaSqft,TotalTaxAssessed,TotalValue,TownId,ValuationYear,WaterAccessPoints,WaterAccessPointsCalc
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130000,475712,0.0,5.0,2870.0,1.0,1.0,2009.0,9899.0,1.0,1.0,...,0.0,166216.0,153157.4,2870.0,9540.432,319373.4,52.0,2020.0,3.0,3.0
129101,101762,46.0,3.0,1291.0,3.0,,1975.0,6457.0,,1.0,...,5.0,132000.0,673200.0,1291.0,11064.284,805200.0,47.0,2020.0,2.0,2.0
147876,309344,19.0,2.0,1074.0,1.0,1.0,1970.0,30408.0,1.0,2.0,...,0.0,98530.4,207336.8,1074.0,5789.762,305867.2,14.0,2020.0,2.0,2.0
122624,689775,19.0,3.0,1595.0,1.0,1.0,1979.0,296757.0,1.0,2.0,...,0.0,135032.8,389565.0,1595.0,9440.486,524597.8,,2020.0,2.0,2.0
159920,445333,20.0,1.0,768.0,2.0,,1985.0,,,2.0,...,0.0,59100.8,633872.8,768.0,8384.64,692973.6,10.0,2020.0,1.0,1.0


In [10]:
# Define the mapping for 'Target' column
target_mapping = {'low': 0, 'medium': 1, 'high': 2}

# Apply the mapping to the 'Target' column
train_labels = train_data['Target'].map(target_mapping)

# Display the first few rows of the labels to verify the mapping
print(train_labels.head())

UID
12998     2
20860     1
75725     1
106521    0
99467     1
Name: Target, dtype: int64


In [11]:
train_data = train_data.drop(columns=['TownId','Target','DistrictId'])

test_data = test_data.drop(columns=['TownId','DistrictId'])

In [12]:
def fill_missing_values(df):
    # Define the columns based on their type
    categorical_columns = [
        'HarvestProcessingType', 'SoilFertilityType', 'AgricultureZoningCode',
        'ValuationYear', 'NationalRegionCode', 'StorageAndFacilityCount', 'RawLocationId',
        'LandUsageType', 'CropSpeciesVariety', 'AgriculturalPostalZone'
    ]
    
    median_columns = [
        'FarmingUnitCount', 'FieldSizeSqft', 'CultivatedAreaSqft1', 'MainIrrigationSystemCount',
        'FieldEstablishedYear', 'TotalTaxAssessed', 'TaxLandValue', 'TotalCultivatedAreaSqft',
        'WaterAccessPoints', 'TaxAgrarianValue', 'TotalValue'
    ]
    
    mean_columns = [
        'WaterAccessPointsCalc', 'Longitude', 'Latitude'
    ]
    
    # Convert categorical columns to 'object' type if necessary
    for column in categorical_columns:
        if column in df.columns:
            df[column] = df[column].astype('object')

    # Fill missing values for categorical columns using mode
    for column in categorical_columns:
        if column in df.columns:
            if df[column].isnull().sum() > 0:
                try:
                    mode_value = df[column].mode(dropna=True)[0] if not df[column].mode().empty else None
                    if mode_value is not None:
                        df[column].fillna(mode_value, inplace=True)
                    else:
                        print(f"Warning: Could not find a mode for column {column}")
                except Exception as e:
                    print(f"Error while filling mode for column {column}: {e}")
    
    # Fill missing values for numerical columns using median
    for column in median_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                median_value = df[column].median()
                df[column].fillna(median_value, inplace=True)
    
    # Fill missing values for numerical columns using mean
    for column in mean_columns:
        if column in df.columns and df[column].dtype in ['int64', 'float64']:
            if df[column].isnull().sum() > 0:
                mean_value = df[column].mean()
                df[column].fillna(mean_value, inplace=True)
    
    return df

# Fill missing values in the training data
train_data = fill_missing_values(train_data)

# Check if there are still missing values
missing_values = train_data.isnull().sum()
print("Missing values after filling:\n", missing_values[missing_values > 0])


Missing values after filling:
 Series([], dtype: int64)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [13]:
# fill missing values in test data
test_data = fill_missing_values(test_data)

# Check if there are still missing values
missing_values = test_data.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(mode_value, inplace=True)
  df[column].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermed

In [14]:
# get feature importance using random forest
from sklearn.ensemble import RandomForestClassifier

# Separate the features and target variable
X = train_data
y = train_labels

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=seed)

# Fit the model
rf.fit(X, y)

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame to store the feature importances
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

# Sort the DataFrame by feature importance in descending order
feature_importances_df.sort_values(by='Importance', ascending=False, inplace=True)

# Reset index for readability
feature_importances_df.reset_index(drop=True, inplace=True)

# Display the DataFrame
print(feature_importances_df)

                      Feature  Importance
0      AgriculturalPostalZone    0.081115
1            TaxAgrarianValue    0.079292
2                   Longitude    0.079231
3                    Latitude    0.078728
4            TotalTaxAssessed    0.078582
5                TaxLandValue    0.075649
6                  TotalValue    0.074709
7               FieldSizeSqft    0.072544
8     TotalCultivatedAreaSqft    0.071137
9         CultivatedAreaSqft1    0.066902
10       FieldEstablishedYear    0.066006
11              RawLocationId    0.051996
12         CropSpeciesVariety    0.021802
13          SoilFertilityType    0.017496
14      AgricultureZoningCode    0.013428
15              ValuationYear    0.012890
16          WaterAccessPoints    0.012246
17      WaterAccessPointsCalc    0.011829
18  MainIrrigationSystemCount    0.009050
19    StorageAndFacilityCount    0.007757
20              LandUsageType    0.006099
21      HarvestProcessingType    0.006000
22           FarmingUnitCount    0

In [15]:
# drop columns with importance less than 0.01
columns_to_drop = feature_importances_df[feature_importances_df['Importance'] < 0.01]['Feature'].tolist()

# Drop the identified columns from the DataFrame
train_data.drop(columns=columns_to_drop, axis=1, inplace=True)

# Drop the identified columns from test data
test_data.drop(columns=columns_to_drop, axis=1, inplace=True)

In [16]:
# print count of unique values in y
print(y.value_counts())

Target
1    67541
2    22514
0    22514
Name: count, dtype: int64


In [17]:
# make train data with 22514 data of each class

# Separate the data based on the target classes
# low_class = train_data[y == 0]
# medium_class = train_data[y == 1]
# high_class = train_data[y == 2]

# # Get the number of samples in each class
# low_class_count = len(low_class)
# medium_class_count = len(medium_class)
# high_class_count = len(high_class)

# # Set the number of samples to be selected from each class
# num_samples = 22514

# # Randomly sample data from each class
# low_class_sample = low_class.sample(n=num_samples, random_state=seed)
# medium_class_sample = medium_class.sample(n=num_samples, random_state=seed)
# high_class_sample = high_class.sample(n=num_samples, random_state=seed)

# # Concatenate the sampled data
# train_data_sampled = pd.concat([low_class_sample, medium_class_sample, high_class_sample])

# # Separate the features and target variable
# X_sampled = train_data_sampled
# y_sampled = y.loc[train_data_sampled.index]

# # Display the count of unique values in the target variable
# print(y_sampled.value_counts())

In [18]:
# do data split
# from sklearn.model_selection import train_test_split

# # Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(train_data, train_labels, test_size=0.2, random_state=seed)
# # X_train, X_valid, y_train, y_valid = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=seed)

# # Display the shapes of the training and validation sets
# print(f"X_train shape: {X_train.shape}")
# print(f"X_valid shape: {X_valid.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"y_valid shape: {y_valid.shape}")


In [19]:
# train random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize the Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=seed)

# Fit the model
rf_model.fit(X_train, y_train)


# Make predictions on the validation set
valid_preds = rf_model.predict(X_valid)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_valid, valid_preds)

# Display the accuracy of the model
print(f"Accuracy: {accuracy}")

Accuracy: 0.5930087945278494


In [19]:
# random forest on full data
# Initialize the Random Forest Classifier
rf_model_full = RandomForestClassifier(n_estimators=100, random_state=seed)

# Fit the model
rf_model_full.fit(train_data, train_labels)

# Make predictions on the test set
test_preds = rf_model_full.predict(test_data)

# convert predictions to target values
target_mapping_inv = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping_inv)

# Create a DataFrame with the 'UID' column and the predictions
submission_df = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

# Save the DataFrame to a CSV file
submission_df.to_csv('submission_4.csv', index=False)

## ensemble models

In [20]:
rf_clf = RandomForestClassifier(n_estimators=100, random_state=seed)
gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=seed)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=seed)

# create a voting classifier with soft voting
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_clf),
        ('gb', gb_clf),
        ('ada', ada_clf)
    ],
    voting='soft'
)

# Fit the model
ensemble_model.fit(X_train, y_train)

# Make predictions on the validation set
valid_preds = ensemble_model.predict(X_valid)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_valid, valid_preds)

# Display the accuracy of the model
print(f"Accuracy: {accuracy}")



Accuracy: 0.6019809896064671


In [21]:
# make predictinos on test data
test_preds = ensemble_model.predict(test_data)

In [22]:
# print count of unique values in predictions
print(np.unique(test_preds, return_counts=True))

(array([0, 1, 2]), array([  364, 15069,   488]))


In [23]:
# convert predictions to original target values
target_mapping = {v: k for k, v in target_mapping.items()}
test_preds = pd.Series(test_preds).map(target_mapping)

In [24]:
# make csv file for submission
submission = pd.DataFrame({
    'UID': test_data.index,
    'Target': test_preds
})

submission.to_csv('submission_v2.csv', index=False)

In [28]:
# Use XGBoost

# Initialize the XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, random_state=seed)

# Fit the model
xgb_model.fit(X_train, y_train)

# Make predictions on the validation set
valid_preds = xgb_model.predict(X_valid)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_valid, valid_preds)

# Display the accuracy of the model
print(f"Accuracy: {accuracy}")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:AgriculturalPostalZone: object

In [25]:
def make_predictions(test_fname, predictions_fname):
    # Load the test data
    test_data = pd.read_csv(test_fname)

    predictions = np.array([random.choice([0, 1, 2]) for _ in range(len(test_data))])

    # map 0 -> low, 1 -> medium, 2 -> high
    predictions = np.array(['low', 'medium', 'high'])[predictions]

    # Save the predictions to CSV file containing UID and Target columns
    pd.DataFrame({
        'UID': test_data['UID'],
        'Target': predictions
    }).to_csv(predictions_fname, index=False)


# make_predictions("test.csv", "predictions.csv")