# Imports & Setup

In [3]:
# Colab setup
from google.colab import drive
drive.mount('/content/gdrive')
# directory
%cd '/content/gdrive/My Drive/XYieldBoost'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/XYieldBoost


In [4]:
!pip install -r requirements.txt

Collecting catboost (from -r requirements.txt (line 6))
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [5]:
# Import system libraries
import os
import sys

# Import data cleaning libraries
import pandas as pd
import numpy as np
import calendar
from datetime import datetime

# Import machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import xgboost as xgb

# Import data visualisation libraries
import matplotlib.pyplot as plt

# Import warning libraries
import warnings
warnings.filterwarnings("ignore")

# Import preprocessing libraries
from preprocessing import clustering
from preprocessing import dim_reduction
from preprocessing import feature_selection
from preprocessing import scaling
from preprocessing import feature_engineering
from preprocessing import cleaning

# Preprocessing

In [6]:
# Preprocess data
train_path = "data/Train.csv"
test_path = "data/Test.csv"
df = cleaning.clean_data(train_path, test_path)
df = feature_engineering.get_features(df)
df = scaling.scale_features(df)
df = feature_selection.select_features(df)
df = dim_reduction.reduce_dim(df)
df = clustering.get_clusters(df)

# Split data into training and test sets
df_train = df[df['Yield'].isna() == False]
df_test = df[df['Yield'].isna() == True]

# Feature selection

In [8]:
top_cols = ['SeedlingsPerPit','Ganaura','CropOrgFYM','NoFertilizerAppln','BasalDAP',
            'BasalUrea','2appDaysUrea','Harv_hand_rent','Residue_length',
            'TransplantingIrrigationHours_per_Acre','TransIrriCost_per_Acre',
            'CropOrgFYM_per_Acre','BasalDAP_per_Acre','BasalUrea_per_Acre','1tdUrea_per_Acre',
            'Harv_hand_rent_per_Acre','TpIrrigationCost_Imputed_per_Acre',
            'Days_bw_SowTransp_Harv','Days_bw_Harv_Thresh','NursingDate_ModeDiff',
            'TillageDate_ModeDiff','HarvestDate_ModeDiff','ThreshingDate_ModeDiff',
            'Num_LandPrepMethod','Num_CropbasalFerts','Num_TopDressFert','Latitude',
            'Longitude','CropEstMethod_LineSowingAfterTillage','Threshing_method_machine',
            'Stubble_use_plowed_in_soil','LandPrepMethod_FourWheelTracRotavator_True',
            'LandPrepMethod_WetTillagePuddling_True','NursDetFactor_PreMonsoonShowers_True',
            'NursDetFactor_LabourAvailability_True','FirstTopDressFert_DAP_True',
            'HarvestMonth_November','ThreshingMonth_January','Block_Chehrakala',
            'PCropSolidOrgFertAppMethod_Broadcasting','PCropSolidOrgFertAppMethod_SoilApplied',
            'MineralFertAppMethod_1_Broadcasting','MineralFertAppMethod_1_SoilApplied','PC4',
            'PC10','PC21','top_shapley_k2_label_1', 'TpIrrigationHours_Imputed',
            'TpIrrigationCost_Imputed', 'SeedlingsPerPit_Imputed', 'NursingDate_ModeDiff_Imputed',
            '2appDaysUrea_Imputed']

# Remove the specified columns from top_cols
columns_to_remove = ['SeedlingsPerPit', 'TransplantingIrrigationHours', 'TransIrriCost', 'StandingWater',
                     '1appDaysUrea', '2appDaysUrea', 'TransplantingIrrigationHours_per_Acre',
                     'TransIrriCost_per_Acre', 'TransplantingIrrigationHours_per_Acre_capped',
                     'TransIrriCost_per_Acre_capped', 'Days_bw_Nurs_SowTransp', 'Days_bw_Nurs_Harv',
                     'Days_bw_Nurs_Till', 'NursingDate_ModeDiff', 'Days_bw_Nurs_SowTransp_ModeDiff',
                     'Days_bw_Nurs_Harv_ModeDiff', 'Days_bw_Nurs_Till_ModeDiff', '2appDaysUrea_MeanDiff']

for column in columns_to_remove:
    if column in top_cols:
     top_cols.remove(column)

# Modeling

In [9]:
# Split data and drop outcome-related columns
outcome_cols = ["Yield", "Yield_per_Acre", "New_Yield", "New_Yield_per_Acre"]
X, y = df_train.drop(outcome_cols, axis=1), df_train["New_Yield_per_Acre"]
X = X[top_cols]

# Initialize an array to store fold-wise predictions
k = 5
fold_wise_predictions = np.zeros((len(df_test), k))

# Define number of splits for k-fold cross-validation
kfold = KFold(n_splits=k, shuffle=True, random_state=42)

# Iterate over each fold and train XGBoost model
for i, (train_idx, val_idx) in enumerate(kfold.split(X)):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Instantiate an XGBoost regressor model
    best_params = {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100,
                   'alpha': 0, 'lambda': 0}
    xgb_reg = xgb.XGBRegressor(**best_params, colsample_bytree=0.3)

    # Tuned Tree estimators
    lgbm = LGBMRegressor(boosting_type='goss', n_estimators=1000, learning_rate=0.08, num_leaves=100, max_depth=7)
    catboost = CatBoostRegressor(depth=10, iterations=1000, learning_rate=0.5, l2_leaf_reg=5)

    # Define the VotingRegressor
    voting_regressor = VotingRegressor(
        estimators=[
            ('lgbm', lgbm),
            ('catboost', catboost),
            ('xgboost', xgb_reg)
        ], weights=[2, 3, 1]
    )

    # Fit the model
    voting_regressor.fit(X_tr, y_tr)

    # Make predictions
    test_predictors = df_test.drop(outcome_cols, axis=1)[top_cols]
    test_folds_pred = voting_regressor.predict(
        test_predictors) * df_test["Acre"]

    # Store fold-wise predictions
    fold_wise_predictions[:, i] = test_folds_pred

# Calculate the average of predictions from each fold for each row
final_predictions = np.mean(fold_wise_predictions, axis=1)

# Add predictions to sample submission file
df_test_pred = pd.read_csv("data/SampleSubmission.csv")
df_test_pred['Yield'] = final_predictions
df_test_pred['Yield'] = np.where(df_test_pred['ID'] == 'ID_PMSOXFT4FYDW',
                                 df_test_pred['Yield'] * 10, df_test_pred['Yield'])

# Use training data minimum as floor value for predictions
df_test_pred['Yield'] = np.where(
    df_test_pred['Yield'] <= 4, 4, df_test_pred['Yield'])

# Export submission
df_test_pred.to_csv('predictions.csv', index=False)

print("---Predictions made---")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
22:	learn: 223.6923098	total: 1.6s	remaining: 1m 8s
23:	learn: 217.4872165	total: 1.67s	remaining: 1m 8s
24:	learn: 212.4272000	total: 1.74s	remaining: 1m 7s
25:	learn: 209.3260629	total: 1.79s	remaining: 1m 7s
26:	learn: 206.8895377	total: 1.84s	remaining: 1m 6s
27:	learn: 205.4498927	total: 1.89s	remaining: 1m 5s
28:	learn: 205.2770458	total: 1.94s	remaining: 1m 5s
29:	learn: 203.1950430	total: 2s	remaining: 1m 4s
30:	learn: 200.4576803	total: 2.07s	remaining: 1m 4s
31:	learn: 195.1857487	total: 2.13s	remaining: 1m 4s
32:	learn: 189.6041750	total: 2.2s	remaining: 1m 4s
33:	learn: 188.1822977	total: 2.27s	remaining: 1m 4s
34:	learn: 184.3363426	total: 2.37s	remaining: 1m 5s
35:	learn: 184.0403523	total: 2.51s	remaining: 1m 7s
36:	learn: 183.7326756	total: 2.58s	remaining: 1m 7s
37:	learn: 183.5675401	total: 2.64s	remaining: 1m 6s
38:	learn: 178.5058445	total: 2.7s	remaining: 1m 6s
39:	learn: 175.8689899	total: 2.76s	rema