In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/innovative-ai-challenge-2024/sample_submission.csv
/kaggle/input/innovative-ai-challenge-2024/train.csv
/kaggle/input/innovative-ai-challenge-2024/test.csv


## Initialization

In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.preprocessing import MaxAbsScaler

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [3]:
df = pd.read_csv('/kaggle/input/innovative-ai-challenge-2024/train.csv')

In [4]:
df.rename(columns={'Crop_Yield (kg/ha)': 'cy'}, inplace=True)

In [5]:
df.head()

Unnamed: 0,id,Year,State,Crop_Type,Rainfall,Soil_Type,Irrigation_Area,cy
0,1,2019,Punjab,Wheat,578.6,Loamy,3515.2,5188
1,2,2018,Punjab,Wheat,598.3,Loamy,3499.3,5077
2,3,2017,Punjab,Wheat,493.0,Loamy,3467.7,5046
3,4,2016,Punjab,Wheat,426.7,Loamy,3474.6,4583
4,5,2015,Punjab,Wheat,546.9,Loamy,3474.7,4304


In [6]:
df.shape

(55, 8)

In [7]:
df.describe()

Unnamed: 0,id,Year,Rainfall,Irrigation_Area,cy
count,55.0,55.0,55.0,55.0,55.0
mean,28.0,2009.527273,473.881818,2082.207273,3079.418182
std,16.02082,6.394021,106.83676,1495.190498,1706.608372
min,1.0,2000.0,218.9,1.2,0.0
25%,14.5,2004.0,391.9,5.5,985.5
50%,28.0,2009.0,459.5,2721.8,3943.0
75%,41.5,2015.0,561.4,3393.25,4305.5
max,55.0,2021.0,662.8,3515.2,5188.0


In [8]:
df.isna().sum()

id                 0
Year               0
State              0
Crop_Type          0
Rainfall           0
Soil_Type          0
Irrigation_Area    0
cy                 0
dtype: int64

In [9]:
# Category columns
categorical_cols = list(df.select_dtypes(include=['object']).columns)[1:]
categorical_cols

['Crop_Type', 'Soil_Type']

In [10]:
numerical_cols = list(df.select_dtypes(exclude='object').columns)[2:4]
numerical_cols

['Rainfall', 'Irrigation_Area']

## Preprocessing 

In [11]:
def apply_transformations(df, categorical_cols, numerical_cols):
    """
    Apply transformations on the DataFrame:
    - OneHotEncoding for categorical columns (including 'Year', 'Crop_Type', 'Soil_Type')
    - StandardScaling for numerical columns ('Rainfall', 'Irrigation_Area')
    
    Parameters:
    df (pd.DataFrame): Input DataFrame to preprocess
    categorical_cols (list): List of categorical column names
    numerical_cols (list): List of numerical column names
    
    Returns:
    pd.DataFrame: Transformed DataFrame ready for model building
    """
    # Step 1: OneHotEncode categorical columns
    encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop first to avoid dummy variable trap
    categorical_data = encoder.fit_transform(df[categorical_cols])
    categorical_df = pd.DataFrame(categorical_data, columns=encoder.get_feature_names_out(categorical_cols))
    
    numerical_df = df.select_dtypes(include='float64')
    # Concatenate the processed categorical and numerical data
    df_transformed = pd.concat([categorical_df, numerical_df], axis=1)

    return df_transformed


In [12]:
df_transformed = apply_transformations(df.drop(columns=['Year','id','State','cy']), categorical_cols, numerical_cols)
year_encoder = LabelEncoder()
df_transformed['Year_Encoded'] = year_encoder.fit_transform(df['Year'])
df_transformed['cy']=df['cy']

In [13]:
df_transformed

Unnamed: 0,Crop_Type_Rice,Crop_Type_Wheat,Soil_Type_alluvial,Rainfall,Irrigation_Area,Year_Encoded,cy
0,0.0,1.0,0.0,578.6,3515.2,18,5188
1,0.0,1.0,0.0,598.3,3499.3,17,5077
2,0.0,1.0,0.0,493.0,3467.7,16,5046
3,0.0,1.0,0.0,426.7,3474.6,15,4583
4,0.0,1.0,0.0,546.9,3474.7,14,4304
5,0.0,1.0,0.0,384.9,3474.7,13,5017
6,0.0,1.0,0.0,619.7,3488.1,12,4724
7,0.0,1.0,0.0,218.9,3466.9,11,4693
8,0.0,1.0,0.0,472.1,3474.8,10,4307
9,0.0,1.0,0.0,384.9,3474.8,9,4462


In [14]:
df_transformed[df_transformed['Irrigation_Area']<4]['cy'].mean()

423.125

In [15]:
df_transformed.loc[42,'cy']=df_transformed[df_transformed['Irrigation_Area']<4]['cy'].mean()
df_transformed.loc[43,'cy']=df_transformed[df_transformed['Irrigation_Area']<4]['cy'].mean()

  df_transformed.loc[42,'cy']=df_transformed[df_transformed['Irrigation_Area']<4]['cy'].mean()


## Apply Transformations

In [16]:
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pylab
def plot_data(feature):
    print(df_transformed['cy'].corr(feature))
    plt.figure(figsize=(10,5))
    plt.subplot(1,2,1)
    feature.hist()
    plt.subplot(1,2,2)
    stats.probplot(feature,dist='norm',plot=pylab)
    plt.show

In [17]:
# plot_data(df_transformed['Rainfall'])

In [18]:
# log_trans = np.log(df_transformed['Rainfall'])
# plot_data(log_trans)

In [19]:
# rec_trans = 1/df_transformed['Rainfall']
# plot_data(rec_trans)

In [20]:
# rec_trans = np.exp(-df_transformed['Rainfall'])
# plot_data(rec_trans)

## Visualize

In [21]:
df_transformed.corr()

Unnamed: 0,Crop_Type_Rice,Crop_Type_Wheat,Soil_Type_alluvial,Rainfall,Irrigation_Area,Year_Encoded,cy
Crop_Type_Rice,1.0,-0.506712,1.0,-0.058922,0.309413,-0.038916,0.318386
Crop_Type_Wheat,-0.506712,1.0,-0.506712,-0.044952,0.660013,-0.020081,0.641352
Soil_Type_alluvial,1.0,-0.506712,1.0,-0.058922,0.309413,-0.038916,0.318386
Rainfall,-0.058922,-0.044952,-0.058922,1.0,-0.085025,0.294149,-0.092167
Irrigation_Area,0.309413,0.660013,0.309413,-0.085025,1.0,-0.002644,0.986994
Year_Encoded,-0.038916,-0.020081,-0.038916,0.294149,-0.002644,1.0,-0.017952
cy,0.318386,0.641352,0.318386,-0.092167,0.986994,-0.017952,1.0


## Model Building

## Train Test split

In [22]:
X_final=df_transformed.drop(columns=['cy'])
y=df_transformed['cy']

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X_final,y,test_size=0.1,random_state=50)

## Model Testing Function

In [24]:
def test_model(model):
    # Predict and Evaluate
    y_pred = model.predict(X_test)
    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print Results
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R² Score:", r2)

## Random Forest Regressor

In [25]:
from sklearn.ensemble import RandomForestRegressor

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=37,max_depth=7,random_state=2)
rf_model.fit(X_train, y_train)

# Test the model
test_model(rf_model)

Mean Squared Error (MSE): 12685.561099267674
Mean Absolute Error (MAE): 105.16293637387382
R² Score: 0.9967487413651405


## Gradient Boosting

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

# Train Gradient Boosting Model
gb_model = GradientBoostingRegressor(n_estimators=300,
                                     learning_rate=0.2,
                                     max_depth=4,
                                     min_samples_split=3, 
                                     min_samples_leaf=4)
gb_model.fit(X_train, y_train)

# Test the model
test_model(gb_model)

Mean Squared Error (MSE): 21825.52277226302
Mean Absolute Error (MAE): 123.24567481604883
R² Score: 0.9944062056996643


## Extra Trees Regressor

In [27]:
from sklearn.ensemble import ExtraTreesRegressor

# Create an ExtraTreesRegressor with default parameters
et_model = ExtraTreesRegressor(n_estimators=30,random_state=53)

# Train the model
et_model.fit(X_train, y_train)

test_model(et_model)

Mean Squared Error (MSE): 13428.109916268813
Mean Absolute Error (MAE): 101.40156250000003
R² Score: 0.9965584290695954


## HP Tuning

In [28]:
# losses=[]
# for i in range(1,60):
#     et_model = ExtraTreesRegressor(n_estimators=30,random_state=i)
#     et_model.fit(X_train, y_train)
#     y_pred = et_model.predict(X_test)
#     mse = mean_squared_error(y_test, y_pred)
#     losses.append(mse)
# print(losses.index(min(losses)),min(losses))

In [29]:
def ensemble_model():
    # Predict and Evaluate
    y1 = gb_model.predict(X_test)
    y2 = et_model.predict(X_test)
    y_pred = ((0.8 * y1) + (0.2 * y2))
    
    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Print Results
    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("R² Score:", r2)

ensemble_model()

Mean Squared Error (MSE): 18034.1449644955
Mean Absolute Error (MAE): 108.2313554896674
R² Score: 0.9953779206864165


## Test Data

In [30]:
test_df = pd.read_csv('/kaggle/input/innovative-ai-challenge-2024/test.csv')

In [31]:
test_df.shape

(8, 7)

In [32]:
test_df

Unnamed: 0,id,Year,State,Crop_Type,Rainfall,Soil_Type,Irrigation_Area
0,1001,2021,Punjab,Wheat,556.9,Loamy,3500.7
1,1002,2020,Punjab,Wheat,602.6,Loamy,3509.5
2,1003,2019,Punjab,Rice,578.6,alluvial,3091.8
3,1004,2018,Punjab,Rice,598.3,alluvial,3057.9
4,1005,2017,Punjab,Rice,493.0,alluvial,3033.0
5,1006,2014,Punjab,Bajra,384.9,Loamy,1.2
6,1007,2013,Punjab,Bajra,619.7,Loamy,3.81
7,1008,2011,Punjab,Bajra,218.9,Loamy,2.5


In [33]:
# Apply the preprocessing to the training data
test_df_transformed = apply_transformations(test_df.drop(columns=['id','State']), categorical_cols, numerical_cols)
test_df_transformed

Unnamed: 0,Crop_Type_Rice,Crop_Type_Wheat,Soil_Type_alluvial,Rainfall,Irrigation_Area
0,0.0,1.0,0.0,556.9,3500.7
1,0.0,1.0,0.0,602.6,3509.5
2,1.0,0.0,1.0,578.6,3091.8
3,1.0,0.0,1.0,598.3,3057.9
4,1.0,0.0,1.0,493.0,3033.0
5,0.0,0.0,0.0,384.9,1.2
6,0.0,0.0,0.0,619.7,3.81
7,0.0,0.0,0.0,218.9,2.5


In [34]:
test_df_transformed['Year_Encoded'] = year_encoder.transform(test_df['Year'])

In [35]:
test_df_transformed

Unnamed: 0,Crop_Type_Rice,Crop_Type_Wheat,Soil_Type_alluvial,Rainfall,Irrigation_Area,Year_Encoded
0,0.0,1.0,0.0,556.9,3500.7,20
1,0.0,1.0,0.0,602.6,3509.5,19
2,1.0,0.0,1.0,578.6,3091.8,18
3,1.0,0.0,1.0,598.3,3057.9,17
4,1.0,0.0,1.0,493.0,3033.0,16
5,0.0,0.0,0.0,384.9,1.2,13
6,0.0,0.0,0.0,619.7,3.81,12
7,0.0,0.0,0.0,218.9,2.5,11


In [36]:
gb_model.fit(X_final,y)
rf_model.fit(X_final,y)

y1 = gb_model.predict(test_df_transformed)
y2 = rf_model.predict(test_df_transformed)
y_pred = ((0.8 * y1) + (0.2 * y2))

In [37]:
if y_pred[5]<0:
    y_pred[5]=0

In [38]:
submission_df = pd.DataFrame({
    'id': test_df['id'], 
    'Target': y_pred
})

submission_df

Unnamed: 0,id,Target
0,1001,4867.464597
1,1002,5026.769258
2,1003,4154.116243
3,1004,4125.82351
4,1005,4196.055462
5,1006,433.057309
6,1007,937.520296
7,1008,940.989608


In [39]:
submission_df.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv
