In [1]:
import pandas as pd
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load dataset and encode categorical features
dataset1 = pd.read_csv("pre_crop_yield.csv")  # Adjust the path if necessary
df2 = pd.get_dummies(dataset1, drop_first=True)

# Split data into features and target variable
indep_X = df2.drop('Yield_tons_per_hectare', axis=1)
dep_Y = df2['Yield_tons_per_hectare']

# Split and scale the data
def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)    
    return X_train, X_test, y_train, y_test

# RFE for feature selection
def rfeFeature(indep_X, dep_Y, n_features):
    model = LinearRegression()
    rfe = RFE(estimator=model, n_features_to_select=n_features)
    fit = rfe.fit(indep_X, dep_Y)
    selected_features = indep_X.columns[fit.support_]
    return selected_features

# SelectKBest for feature selection
def select_k_best_features(indep_X, dep_Y, n_features):
    selector = SelectKBest(score_func=f_regression, k=n_features)
    selector.fit(indep_X, dep_Y)
    selected_features = indep_X.columns[selector.get_support()]
    return selected_features

# Function to calculate r2 score
def r2_prediction(regressor, X_test, y_test):
    y_pred = regressor.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    return r2

# Model functions
def Linear(X_train, y_train, X_test, y_test):
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

def svm_linear(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVR
    regressor = SVR(kernel='linear')
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

def svm_NL(X_train, y_train, X_test, y_test):
    from sklearn.svm import SVR
    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

def Decision(X_train, y_train, X_test, y_test):
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

def random(X_train, y_train, X_test, y_test):
    from sklearn.ensemble import RandomForestRegressor
    regressor = RandomForestRegressor(n_estimators=10, random_state=0)
    regressor.fit(X_train, y_train)
    r2 = r2_prediction(regressor, X_test, y_test)
    return r2

# ---------------------------------------------
# Feature Selection and Model Training Section
# ---------------------------------------------

# Identify the top 3 features using SelectKBest
top_features_kbest = select_k_best_features(indep_X, dep_Y, 3)
print("Top 3 Features using SelectKBest:", list(top_features_kbest))

# Identify the top 3 features using RFE
top_features_rfe = rfeFeature(indep_X, dep_Y, 3)
print("Top 3 Features using RFE:", list(top_features_rfe))

# Split and scale the data for both feature sets
X_train_rfe, X_test_rfe, y_train_rfe, y_test_rfe = split_scalar(indep_X[top_features_rfe], dep_Y)
X_train_kbest, X_test_kbest, y_train_kbest, y_test_kbest = split_scalar(indep_X[top_features_kbest], dep_Y)

print("-------------------------------------------")
print("Train and evaluate models on the SelectKBest selected features")
# Train and evaluate models on the SelectKBest selected features
kbest_r2_score = Linear(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score using SelectKBest features linear: {kbest_r2_score:.4f}")

#kbest_r2_score = svm_linear(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
#print(f"R2 Score using SelectKBest features linear svm_linear: {kbest_r2_score:.4f}")

#kbest_r2_score = svm_NL(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
#print(f"R2 Score using SelectKBest features linear svm_non linear: {kbest_r2_score:.4f}")

kbest_r2_score = Decision(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score using SelectKBest features Decision: {kbest_r2_score:.4f}")

kbest_r2_score = random(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score using SelectKBest features random: {kbest_r2_score:.4f}")

print("-------------------------------------------")
print("Train and evaluate models on the RFE selected features")
# Train and evaluate models on the RFE selected features
rfe_r2_score = Linear(X_train_rfe, y_train_rfe, X_test_rfe, y_test_rfe)
print(f"R2 Score using RFE selected features linear: {rfe_r2_score:.4f}")

kbest_r2_score = Decision(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score using RFE selected features Decision: {rfe_r2_score:.4f}")

kbest_r2_score = random(X_train_kbest, y_train_kbest, X_test_kbest, y_test_kbest)
print(f"R2 Score using RFE selected features random: {rfe_r2_score:.4f}")


Top 3 Features using SelectKBest: ['Rainfall_mm', 'Fertilizer_Used', 'Irrigation_Used']
Top 3 Features using RFE: ['Temperature_Celsius', 'Fertilizer_Used', 'Irrigation_Used']
-------------------------------------------
Train and evaluate models on the SelectKBest selected features
R2 Score using SelectKBest features linear: 0.9056
R2 Score using SelectKBest features Decision: 0.8116
R2 Score using SelectKBest features random: 0.8568
-------------------------------------------
Train and evaluate models on the RFE selected features
R2 Score using RFE selected features linear: 0.3279
R2 Score using RFE selected features Decision: 0.3279
R2 Score using RFE selected features random: 0.3279
