In [None]:
# Import libraries

import pandas as pd 
import numpy as np 
import seaborn as sns 
from preprolib import myfunctions
import matplotlib.pyplot as plt 

In [None]:
# Importing the data
df = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Train.csv')
test = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Test.csv')
data_desc = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\VariableDescription.csv')

In [None]:
df.columns

In [None]:
# Example: Data Preprocessing
cat_cols = []
num_cols = []
ignore_list = ['ID','CropTillageDate', 'RcNursEstDate','Yield','SeedingSowingTransplanting','Harv_date','Threshing_date']

myfunctions.cat_or_num(df, ignore_list, num_cols, cat_cols)

In [None]:
cat_cols

In [None]:
label = 'Yield'
features = num_cols + cat_cols

In [None]:
features

In [None]:
num_plots = len(features)
fig, axes = plt.subplots(num_plots, 1, figsize=(10, 5*num_plots))

# Loop through the features list and plot histograms
for i, col in enumerate(features):
    ax = axes[i]
    if col in num_cols:
        # If it's a numerical column, plot a histogram
        sns.histplot(data=df, x=col, ax=ax, kde=True)
        ax.set_title(f'Histogram of {col} (Numerical)')
    else:
        # If it's a categorical column, plot a countplot
        sns.countplot(data=df, x=col, ax=ax)
        ax.set_title(f'Countplot of {col} (Categorical)')
    ax.set_xlabel('')

# Adjust layout and show the plots
plt.tight_layout()
plt.show()

In [None]:
# Convert specific columns to string type
df[cat_cols] = df[cat_cols].astype(str)


In [32]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



# Define transformers for numerical and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Custom transformer using onehot_seperator
from sklearn.base import BaseEstimator, TransformerMixin

class OneHotSeparator(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for column in self.columns:
            # Your implementation of onehot_seperator function
            # Convert specific columns to string type
            X[column] = X[column].astype(str)
            words = X[column].str.split()
            unique_words = set(word for word_list in words for word in word_list)
            for word in unique_words:
                X[word] = X[column].apply(lambda x: word in x)
                X[word] = X[word].astype(int)
            X = X.drop(column, axis=1)
        return X

categorical_transformer = Pipeline(steps=[
    ('onehot_separator', OneHotSeparator(columns=cat_cols))
])

# Create a column transformer that applies the transformers to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Create the full data preprocessing pipeline
data_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X = df[features]
y = df[label]
X_preprocessed = data_preprocessing_pipeline.fit_transform(X)

# Scale the target variable y
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Now you have X_train, X_test, y_train, and y_test for further processing

# Display a sample of the preprocessed data
print(X_train)

[[-0.60846419 -0.7047623   0.4516101  ...  1.          0.
   1.        ]
 [-0.28006002 -0.1688793  -1.31377483 ...  0.          0.
   1.        ]
 [-0.73982585 -0.66903677  0.4516101  ...  1.          0.
   1.        ]
 ...
 [-0.28006002 -0.1688793   1.33430256 ...  0.          0.
   1.        ]
 [-0.11585794 -0.1688793  -0.43108237 ...  0.          0.
   1.        ]
 [-0.11585794 -0.2760559  -1.31377483 ...  0.          0.
   1.        ]]


In [33]:
y_train

2181       4
416      600
1583     300
217     1300
888      800
        ... 
1130     200
1294       6
860      200
3507    1280
3174     900
Name: Yield, Length: 3096, dtype: int64

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'Neural Network': MLPRegressor(),
    'Gaussian Process': GaussianProcessRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(learning_rate=0.1),  # Adjust hyperparameters as needed

}

# Create a dictionary to store RMSE values
rmse_results = {}

# Iterate through the models and calculate RMSE
for model_name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Store the RMSE in the dictionary
    rmse_results[model_name] = rmse





0:	learn: 523.5496596	total: 10.3ms	remaining: 10.3s
1:	learn: 498.8410326	total: 16.8ms	remaining: 8.38s
2:	learn: 479.1504993	total: 25.1ms	remaining: 8.34s
3:	learn: 460.6107067	total: 32.2ms	remaining: 8.01s
4:	learn: 445.3485476	total: 40.5ms	remaining: 8.05s
5:	learn: 432.3102355	total: 46.2ms	remaining: 7.65s
6:	learn: 421.5236167	total: 53.4ms	remaining: 7.57s
7:	learn: 411.5477569	total: 58.7ms	remaining: 7.28s
8:	learn: 401.4558930	total: 63.9ms	remaining: 7.04s
9:	learn: 393.5297960	total: 74.4ms	remaining: 7.36s
10:	learn: 385.6961050	total: 80.1ms	remaining: 7.2s
11:	learn: 379.2223512	total: 87ms	remaining: 7.16s
12:	learn: 371.4359045	total: 92.2ms	remaining: 7s
13:	learn: 364.6319527	total: 103ms	remaining: 7.22s
14:	learn: 358.4655502	total: 110ms	remaining: 7.19s
15:	learn: 354.6925303	total: 127ms	remaining: 7.8s
16:	learn: 349.4219102	total: 145ms	remaining: 8.39s
17:	learn: 345.5192031	total: 156ms	remaining: 8.54s
18:	learn: 341.5799374	total: 166ms	remaining: 8.5

In [41]:
# Print the RMSE for each model
for model_name, rmse in rmse_results.items():
    print(f'{model_name} RMSE: {rmse:.2f}')


Linear Regression RMSE: 2719412933326.84
Random Forest RMSE: 815.17
Decision Tree RMSE: 880.69
SVR RMSE: 953.28
K-Nearest Neighbors RMSE: 819.59
Gradient Boosting RMSE: 775.66
Ridge Regression RMSE: 797.26
Lasso Regression RMSE: 798.54
Elastic Net RMSE: 836.90
Bayesian Ridge RMSE: 799.58
Neural Network RMSE: 814.80
Gaussian Process RMSE: 1292.47
XGBoost RMSE: 823.63
LightGBM RMSE: 803.02
CatBoost RMSE: 769.10
