In [None]:
# Import libraries

import pandas as pd 
import numpy as np 
import seaborn as sns 
from preprolib import myfunctions
import matplotlib.pyplot as plt 

In [None]:
# Importing the data
df = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Train.csv')
test = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Test.csv')
data_desc = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\VariableDescription.csv')

In [None]:
df.columns

In [None]:
# Example: Data Preprocessing
cat_cols = []
num_cols = []
ignore_list = ['ID','CropTillageDate', 'RcNursEstDate','Yield','SeedingSowingTransplanting','Harv_date','Threshing_date']

myfunctions.cat_or_num(df, ignore_list, num_cols, cat_cols)

In [None]:
import pandas as pd

# Assuming you have a DataFrame named 'df'
# 'cat_cols' is a list of categorical column names

# Create a list to store column names that meet the criteria
filtered_cat_cols = []

# Iterate through the categorical columns
for col in cat_cols:
    unique_values = df[col].nunique()  # Count the unique values in the column
    if unique_values <= 5:  # Check if there are 5 or fewer unique values
        filtered_cat_cols.append(col)

# Create a new DataFrame with the selected columns
filtered_cat_cols


In [None]:
label = 'Yield'
features = num_cols + filtered_cat_cols

In [None]:
features

In [None]:
num_plots = len(features)
fig, axes = plt.subplots(num_plots, 1, figsize=(10, 5*num_plots))

# Loop through the features list and plot histograms
for i, col in enumerate(features):
    ax = axes[i]
    if col in num_cols:
        # If it's a numerical column, plot a histogram
        sns.histplot(data=df, x=col, ax=ax, kde=True)
        ax.set_title(f'Histogram of {col} (Numerical)')
    else:
        # If it's a categorical column, plot a countplot
        sns.countplot(data=df, x=col, ax=ax)
        ax.set_title(f'Countplot of {col} (Categorical)')
    ax.set_xlabel('')

# Adjust layout and show the plots
plt.tight_layout()
plt.show()

### Analysis

In [None]:
import pandas as pd

# Assuming you have your DataFrame named 'data'
# Initialize a list to store columns with more than 1000 null values
columns_with_more_than_1000_nulls = []

for column in df[num_cols].columns:
    null_count = df[column].isnull().sum()
    if null_count > 1000:
        columns_with_more_than_1000_nulls.append(column)    

columns_with_more_than_1000_nulls

In [None]:
# Remove columns with more than 1000 null values from 'num_cols'
num_cols = [col for col in num_cols if col not in columns_with_more_than_1000_nulls]

# Remove columns not in test

Remove_features = [col for col in features if col not in test.columns]




In [None]:
test[features].columns

In [None]:
df[features].columns 

## Preprocessing

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Custom transformer using onehot_seperator
from sklearn.base import BaseEstimator, TransformerMixin


class OneHotSeparator(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.unique_words = set()

    def fit(self, X, y=None):
        # Extract unique words from the specified columns
        for column in self.columns:
            # Convert specific columns to string type
            X[column] = X[column].astype(str)
            words = X[column].str.split()
            unique_words = set(word for word_list in words for word in word_list)
            self.unique_words.update(unique_words)
        return self

    def transform(self, X):
        for word in self.unique_words:
            X[word] = X[self.columns].apply(lambda row: word in ' '.join(row), axis=1)
            X[word] = X[word].astype(int)
        X = X.drop(self.columns, axis=1)
        return X

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

# Create a column transformer that applies the transformers to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, filtered_cat_cols)
    ])

# Create the full data preprocessing pipeline
data_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X = df[features]
y = df[label]
X_preprocessed = data_preprocessing_pipeline.fit_transform(X)

test_filtered = test[features]
test_preprocessed = data_preprocessing_pipeline.fit_transform(test_filtered)

# Scale the target variable y
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Now you have X_train, X_test, y_train, and y_test for further processing with selected features

# Display a sample of the preprocessed data
print(X_train)


In [None]:
X.shape

In [None]:
test.shape

In [None]:
test_preprocessed

In [None]:
test.columns

In [None]:
df.columns

## Model Selection

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'Neural Network': MLPRegressor(),
    'Gaussian Process': GaussianProcessRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(learning_rate=0.1),  # Adjust hyperparameters as needed

}

# Create a dictionary to store RMSE values
rmse_results = {}

# Iterate through the models and calculate RMSE
for model_name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Store the RMSE in the dictionary
    rmse_results[model_name] = rmse



In [None]:
# Print the RMSE for each model
model_scores = pd.DataFrame() 
for model_name, rmse in rmse_results.items():
    model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)


In [39]:
model_scores_sorted = model_scores.sort_values(by='RMSE', ascending=True)
model_scores_sorted

Unnamed: 0,Model,RMSE
14,CatBoost,793.267671
13,LightGBM,793.406869
1,Random Forest,794.571549
5,Gradient Boosting,798.001906
0,Linear Regression,804.668404
6,Ridge Regression,804.730173
7,Lasso Regression,805.034183
9,Bayesian Ridge,805.501626
10,Neural Network,815.594235
4,K-Nearest Neighbors,818.459137


In [None]:
gb_model = GradientBoostingRegressor(random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
gb_predictions = gb_model.predict(test_preprocessed)

In [None]:
gb_predictions

In [None]:
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
rf_predictions = rf_model.predict(test_preprocessed)

In [40]:
cb_model = CatBoostRegressor(learning_rate=0.1, random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
cb_predictions = cb_model.predict(test_preprocessed)

0:	learn: 521.2203204	total: 28.3ms	remaining: 28.3s
1:	learn: 498.0225976	total: 37.3ms	remaining: 18.6s
2:	learn: 478.5040884	total: 51.1ms	remaining: 17s
3:	learn: 460.9016802	total: 56ms	remaining: 13.9s
4:	learn: 446.5744316	total: 60ms	remaining: 11.9s
5:	learn: 432.7577572	total: 94.4ms	remaining: 15.6s
6:	learn: 421.0137875	total: 98.9ms	remaining: 14s
7:	learn: 410.8877244	total: 106ms	remaining: 13.2s
8:	learn: 401.4392044	total: 111ms	remaining: 12.3s
9:	learn: 393.7109856	total: 116ms	remaining: 11.4s
10:	learn: 385.9915490	total: 121ms	remaining: 10.8s
11:	learn: 378.6573096	total: 124ms	remaining: 10.2s
12:	learn: 372.3489644	total: 128ms	remaining: 9.71s
13:	learn: 367.4242784	total: 136ms	remaining: 9.6s
14:	learn: 363.2224891	total: 140ms	remaining: 9.19s
15:	learn: 359.2149810	total: 143ms	remaining: 8.8s
16:	learn: 356.2338218	total: 147ms	remaining: 8.48s
17:	learn: 352.7654056	total: 152ms	remaining: 8.27s
18:	learn: 349.5214679	total: 156ms	remaining: 8.05s
19:	le

In [41]:
test['Yield'] = cb_predictions



In [None]:
test

In [42]:
sub_cols = ['ID', 'Yield']
Submission = test[sub_cols]

In [43]:
Submission.to_csv('Submission.csv', index= False)