In [None]:
# Import libraries

import pandas as pd 
import numpy as np 
import seaborn as sns 
from preprolib import myfunctions
import matplotlib.pyplot as plt 

In [None]:
# Importing the data
df = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Train.csv')
test = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\Test.csv')
data_desc = pd.read_csv(r'C:\Users\User\Desktop\Projects\Green Crop Yield\data\VariableDescription.csv')

In [None]:
df.columns

In [None]:
# Example: Data Preprocessing
cat_cols = []
num_cols = []
ignore_list = ['ID','CropTillageDate', 'RcNursEstDate','Yield','SeedingSowingTransplanting','Harv_date','Threshing_date']

myfunctions.cat_or_num(df, ignore_list, num_cols, cat_cols)

In [None]:
import pandas as pd

# Assuming you have a DataFrame named 'df'
# 'cat_cols' is a list of categorical column names

# Create a list to store column names that meet the criteria
filtered_cat_cols = []

# Iterate through the categorical columns
for col in cat_cols:
    unique_values = df[col].nunique()  # Count the unique values in the column
    if unique_values <= 5:  # Check if there are 5 or fewer unique values
        filtered_cat_cols.append(col)

# Create a new DataFrame with the selected columns
filtered_cat_cols


In [None]:
label = 'Yield'
features = num_cols + filtered_cat_cols

In [None]:
features

### Analysis

In [None]:
num_plots = len(features)
fig, axes = plt.subplots(num_plots, 1, figsize=(10, 5*num_plots))

# Loop through the features list and plot histograms
for i, col in enumerate(cat_cols):
    ax = axes[i]
    if col in num_cols:
        # If it's a numerical column, plot a histogram
        sns.histplot(data=df, x=col, ax=ax, kde=True)
        ax.set_title(f'Histogram of {col} (Numerical)')
    else:
        # If it's a categorical column, plot a countplot
        sns.countplot(data=df, x=col, ax=ax)
        ax.set_title(f'Countplot of {col} (Categorical)')
    ax.set_xlabel('')

# Adjust layout and show the plots
plt.tight_layout()
plt.show()

In [None]:
sep_cols = ['CropbasalFerts', 'OrgFertilizers', 'TransDetFactor', 'NursDetFactor', 'LandPreparationMethod']

In [None]:
sep_cols

In [None]:
import pandas as pd

# Assuming you have your DataFrame named 'data'
# Initialize a list to store columns with more than 1000 null values
columns_with_more_than_1000_nulls = []

for column in df[num_cols].columns:
    null_count = df[column].isnull().sum()
    if null_count > 1000:
        columns_with_more_than_1000_nulls.append(column)    

columns_with_more_than_1000_nulls

In [None]:
df.loc[:,sep_cols]

In [None]:
def onehot_seperator(df, columns):
    unique_words = set()  # Initialize an empty set to store unique words
    
    for column in columns:
        # Split the column values into words
        df[column] = df[column].astype(str)
        words = df[column].str.split()

        # Create a set of unique words for this column, excluding NaN
        unique_words.update(word for word_list in words if word_list is not None for word in word_list)

    return unique_words

unique_words = onehot_seperator(df, sep_cols)

In [None]:
import pandas as pd

def onehot_seperator(df, columns, test=None):
    for column in columns:
        # Check if the column contains non-null string values
        if df[column].dtype == 'object':
            # Split the column values into words
            df[column] = df[column].astype(str)
            words = df[column].str.split()

            # Create a set of unique words
            unique_words = set(word for word_list in words for word in word_list)

            # Create binary columns for each unique word in the original dataframe
            for word in unique_words:
                df[word] = df[column].apply(lambda x: word in x)
                df[word] = df[word].replace({True: 1, False: 0})

            # If a test dataframe is provided, create the same columns in the test dataframe
            if test is not None:
                for word in unique_words:
                    if test[column].dtype == 'object':
                        test[column] = test[column].astype(str)
                        test[word] = test[column].apply(lambda x: word in x)
                        test[word] = test[word].astype(int)

    return df, test

df, test = onehot_seperator(df, sep_cols, test)


In [None]:
# Remove columns with more than 1000 null values from 'num_cols'
num_cols = [col for col in num_cols if col not in columns_with_more_than_1000_nulls]

#TODO : Conduct PCA, Mutual information analysis and categorical separation for feature selection.




In [None]:
features.extend(unique_words)


## Preprocessing

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression

from sklearn.decomposition import PCA

# Load your dataset into a DataFrame df

# Define the columns for PCA
pca_columns = num_cols


# Create transformers for PCA columns
pca_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10))  # You can adjust the number of components
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Custom transformer using onehot_seperator
from sklearn.base import BaseEstimator, TransformerMixin


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

# Create a column transformer that applies the transformers to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('pca', pca_transformer, pca_columns),
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, filtered_cat_cols)
    ])

# Create the full data preprocessing pipeline
data_preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the data
X = df[features]
y = df[label]
X_preprocessed = data_preprocessing_pipeline.fit_transform(X)

test_filtered = test[features]
test_preprocessed = data_preprocessing_pipeline.fit_transform(test_filtered)

# Scale the target variable y
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Now you have X_train, X_test, y_train, and y_test for further processing with selected features

# Display a sample of the preprocessed data
print(X_train)


## Model Selection

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import BayesianRidge
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize the models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'SVR': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Elastic Net': ElasticNet(),
    'Bayesian Ridge': BayesianRidge(),
    'Neural Network': MLPRegressor(),
    'Gaussian Process': GaussianProcessRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(learning_rate=0.1),  # Adjust hyperparameters as needed

}

# Create a dictionary to store RMSE values
rmse_results = {}

# Iterate through the models and calculate RMSE
for model_name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # Store the RMSE in the dictionary
    rmse_results[model_name] = rmse



In [52]:
# Print the RMSE for each model
model_scores = pd.DataFrame() 
for model_name, rmse in rmse_results.items():
    model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)


  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rmse}, ignore_index=True)
  model_scores = model_scores.append({'Model': model_name, 'RMSE': rms

In [53]:
model_scores_sorted = model_scores.sort_values(by='RMSE', ascending=True)
model_scores_sorted

Unnamed: 0,Model,RMSE
5,Gradient Boosting,779.545978
1,Random Forest,784.963579
13,LightGBM,788.486339
14,CatBoost,791.293981
12,XGBoost,794.026764
0,Linear Regression,804.550047
6,Ridge Regression,804.72897
7,Lasso Regression,805.346067
9,Bayesian Ridge,805.695015
10,Neural Network,810.093015


In [55]:
gb_model = GradientBoostingRegressor(random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
gb_predictions = gb_model.predict(test_preprocessed)

In [56]:
gb_predictions

array([574.19673864, 390.71315085, 513.68163805, ..., 241.51720092,
       387.43861496, 316.7051958 ])

In [None]:
rf_model = RandomForestRegressor(random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
rf_predictions = rf_model.predict(test_preprocessed)

In [None]:
cb_model = CatBoostRegressor(learning_rate=0.1, random_state=1).fit(X_train, y_train)  # Convert one-hot encoded y_train to 1D array
cb_predictions = cb_model.predict(test_preprocessed)

In [58]:
test['Yield'] = gb_predictions



In [None]:
test

In [59]:
sub_cols = ['ID', 'Yield']
Submission = test[sub_cols]

In [60]:
Submission.to_csv('Submission.csv', index= False)