In [1]:
import pandas as pd
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load dataset and encode categorical features
dataset1 = pd.read_csv("pre_crop_yield.csv")  # Adjust the path if necessary
df2 = pd.get_dummies(dataset1, drop_first=True)

# Split data into features and target variable
indep_X = df2.drop('Yield_tons_per_hectare', axis=1)
dep_Y = df2['Yield_tons_per_hectare']

# Split and scale the data
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

# RFE for feature selection
def rfeFeature(indep_X, dep_Y, n_features):
    model = LinearRegression()
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    fit = rfe.fit(indep_X, dep_Y)
    selected_features = indep_X.columns[fit.support_]
    return selected_features

# SelectKBest for feature selection
def select_k_best_features(indep_X, dep_Y, n_features):
    selector = SelectKBest(score_func=f_regression, k=n_features)
    selector.fit(indep_X, dep_Y)
    selected_features = indep_X.columns[selector.get_support()]
    return selected_features

# Train a model and evaluate R2 score
def train_model(X_train, y_train, X_test, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return r2_score(y_test, y_pred)

# Identify the top 3 features using RFE
top_features_rfe = rfeFeature(indep_X, dep_Y, 3)
print("Top 3 Features using RFE:", top_features_rfe)

# Identify the top 3 features using SelectKBest
top_features_kbest = select_k_best_features(indep_X, dep_Y, 3)
print("Top 3 Features using SelectKBest:", top_features_kbest)

# Split and scale the data for both feature sets
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = split_scalar(indep_X[top_features_rfe], dep_Y)
X_train_kbest, X_test_kbest, y_train_kbest, y_test_kbest = split_scalar(indep_X[top_features_kbest], dep_Y)

# Train and evaluate models on the RFE selected features
rfe_r2_score = train_model(X_train_rfe, y_train_rfe, X_test_rfe, y_test_rfe)
print(f"R2 Score on RFE selected features: {rfe_r2_score}")

# Train and evaluate models on the SelectKBest selected features
kbest_r2_score = train_model(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score on SelectKBest selected features: {kbest_r2_score}")


Top 3 Features using RFE: Index(['Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used'], dtype='object')
Top 3 Features using SelectKBest: Index(['Rainfall_mm', 'Fertilizer_Used', 'Irrigation_Used'], dtype='object')
R2 Score on RFE selected features: 0.3278807581129656
R2 Score on SelectKBest selected features: 0.9056033219363863
