In [16]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

In [18]:
df_crop_production = pd.read_csv("../Datasets/OpDataset/crop_production_data.csv")
df_soil_analysis = pd.read_csv("../Datasets/OpDataset/soil_analysis_data.csv")
df_water_usage = pd.read_csv("../Datasets/OpDataset/water_usage_data.csv")

In [19]:
df_crop_production.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   District                  1000 non-null   object 
 1   Crop                      1000 non-null   object 
 2   Season                    1000 non-null   object 
 3   Area (hectares)           1000 non-null   float64
 4   Yield (quintals)          1000 non-null   float64
 5   Production (metric tons)  1000 non-null   float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


In [20]:
df_soil_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   District                    1000 non-null   object 
 1   Soil Type                   1000 non-null   object 
 2   pH Level                    1000 non-null   float64
 3   Organic Matter (%)          1000 non-null   float64
 4   Nitrogen Content (kg/ha)    1000 non-null   float64
 5   Phosphorus Content (kg/ha)  1000 non-null   float64
 6   Potassium Content (kg/ha)   1000 non-null   float64
dtypes: float64(5), object(2)
memory usage: 54.8+ KB


In [21]:
df_water_usage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   District                             1000 non-null   object 
 1   Crop                                 1000 non-null   object 
 2   Irrigation Method                    1000 non-null   object 
 3   Water Consumption (liters/hectare)   1000 non-null   float64
 4   Water Availability (liters/hectare)  1000 non-null   float64
dtypes: float64(2), object(3)
memory usage: 39.2+ KB


In [22]:
merge_soil_crop_production = df_crop_production.merge(df_soil_analysis, on = ['District'])

In [23]:
merge_water_soil_crop_production = merge_soil_crop_production.merge(df_water_usage, on = ['District', 'Crop'])

In [24]:
database = merge_water_soil_crop_production.copy()
database.head()

Unnamed: 0,District,Crop,Season,Area (hectares),Yield (quintals),Production (metric tons),Soil Type,pH Level,Organic Matter (%),Nitrogen Content (kg/ha),Phosphorus Content (kg/ha),Potassium Content (kg/ha),Irrigation Method,Water Consumption (liters/hectare),Water Availability (liters/hectare)
0,Jodhpur,Wheat,Kharif,16490.142459,30.691986,5061.15223,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,9738.809833,10557.488279
1,Jodhpur,Wheat,Kharif,16490.142459,30.691986,5061.15223,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,13591.319448,14448.126887
2,Jodhpur,Wheat,Kharif,16490.142459,30.691986,5061.15223,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Drip Irrigation,8811.686999,8512.323275
3,Jodhpur,Wheat,Kharif,16490.142459,30.691986,5061.15223,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,8860.161716,9434.989877
4,Jodhpur,Wheat,Kharif,16490.142459,30.691986,5061.15223,Chalky (Calcareous),7.30574,2.095212,32.246094,17.450967,40.172205,Sprinkler Irrigation,9738.809833,10557.488279


In [25]:
database = database.drop(columns = ['Production (metric tons)', 'Water Consumption (liters/hectare)'], axis = 1)
database.head()

Unnamed: 0,District,Crop,Season,Area (hectares),Yield (quintals),Soil Type,pH Level,Organic Matter (%),Nitrogen Content (kg/ha),Phosphorus Content (kg/ha),Potassium Content (kg/ha),Irrigation Method,Water Availability (liters/hectare)
0,Jodhpur,Wheat,Kharif,16490.142459,30.691986,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,10557.488279
1,Jodhpur,Wheat,Kharif,16490.142459,30.691986,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,14448.126887
2,Jodhpur,Wheat,Kharif,16490.142459,30.691986,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Drip Irrigation,8512.323275
3,Jodhpur,Wheat,Kharif,16490.142459,30.691986,Sandy,7.453182,2.662898,23.564182,13.014409,37.082003,Sprinkler Irrigation,9434.989877
4,Jodhpur,Wheat,Kharif,16490.142459,30.691986,Chalky (Calcareous),7.30574,2.095212,32.246094,17.450967,40.172205,Sprinkler Irrigation,10557.488279


In [26]:
database.isnull().sum()

District                               0
Crop                                   0
Season                                 0
Area (hectares)                        0
Yield (quintals)                       0
Soil Type                              0
pH Level                               0
Organic Matter (%)                     0
Nitrogen Content (kg/ha)               0
Phosphorus Content (kg/ha)             0
Potassium Content (kg/ha)              0
Irrigation Method                      0
Water Availability (liters/hectare)    0
dtype: int64

In [27]:
database.duplicated().sum()

0

In [28]:
X = database.drop(columns=['Yield (quintals)'])  # Features
y = database['Yield (quintals)']  # Target

In [29]:
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['District', 'Crop', 'Season', 'Soil Type', 'Irrigation Method']])
X_categorical = pd.DataFrame(X_encoded.toarray(), columns=encoder.get_feature_names(['District', 'Crop', 'Season', 'Soil Type', 'Irrigation Method']))
X_numeric = X.drop(columns=['District', 'Crop', 'Season', 'Soil Type', 'Irrigation Method'])


In [33]:
cat_features = ['District', 'Crop', 'Season', 'Soil Type', 'Irrigation Method']
encoder = OneHotEncoder(drop='first', sparse=False)
X_train_encoded = encoder.fit_transform(X_train[cat_features])
X_val_encoded = encoder.transform(X_val[cat_features])
X_test_encoded = encoder.transform(X_test[cat_features])

In [34]:
num_features = [col for col in X.columns if col not in cat_features]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[num_features])
X_val_scaled = scaler.transform(X_val[num_features])
X_test_scaled = scaler.transform(X_test[num_features])

In [35]:
X_train_final = np.concatenate([X_train_scaled, X_train_encoded], axis=1)
X_val_final = np.concatenate([X_val_scaled, X_val_encoded], axis=1)
X_test_final = np.concatenate([X_test_scaled, X_test_encoded], axis=1)

In [45]:
# Reshape the input data for CNN (assuming 2D data)
input_shape = (X_train_final.shape[1], 1, 1)  # Shape of input features
X_train_reshaped = X_train_final.reshape(-1, *input_shape)  # Reshape for CNN
X_val_reshaped = X_val_final.reshape(-1, *input_shape)
X_test_reshaped = X_test_final.reshape(-1, *input_shape)

In [46]:
# Define the CNN mode

In [47]:
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(1)  # Output layer for regression
])

ValueError: One of the dimensions in the output is <= 0 due to downsampling in conv2d_4. Consider increasing the input size. Received input shape [None, 49, 1, 1] which would produce output shape with a zero or negative value in a dimension.