In [1]:
# summarize the number of unique values for each column using numpy
from pandas import read_csv
# load the dataset
df = read_csv('yield_prediction_dataset.csv')
# summarize the number of unique values in each column
print(df.nunique())

field_id           90
date_of_image      25
latitude           90
longitude          90
NDVI             1588
GNDVI            1588
NDWI             1588
SAVI             1588
soil_moisture    1588
temperature      1625
rainfall         1446
crop_type          30
yield            1198
Unnamed: 13         0
Unnamed: 14         0
dtype: int64


In [5]:
import pandas as pd

# Load your dataset
df = pd.read_csv("yield_prediction_dataset.csv")

# Unique values per column
print("Unique values per column:\n", df.nunique(), "\n")

# Duplicate rows count
print("Number of duplicate rows:", df.duplicated().sum(), "\n")

# Missing values per column
print("Missing values per column:\n", df.isnull().sum(), "\n")

# Detect outliers using the IQR method
outliers = {}
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    outlier_condition = (df[column] < (Q1 - 1.5 * IQR)) | (df[column] > (Q3 + 1.5 * IQR))
    outliers[column] = outlier_condition.sum()

print("Outliers detected per column:\n", outliers)


Unique values per column:
 field_id           90
date_of_image      25
latitude           90
longitude          90
NDVI             1588
GNDVI            1588
NDWI             1588
SAVI             1588
soil_moisture    1588
temperature      1625
rainfall         1446
crop_type          30
yield            1198
dtype: int64 

Number of duplicate rows: 0 

Missing values per column:
 field_id         0
date_of_image    0
latitude         0
longitude        0
NDVI             0
GNDVI            0
NDWI             0
SAVI             0
soil_moisture    0
temperature      0
rainfall         0
crop_type        0
yield            0
dtype: int64 

Outliers detected per column:
 {'latitude': 0, 'longitude': 106, 'NDVI': 33, 'GNDVI': 104, 'NDWI': 104, 'SAVI': 33, 'soil_moisture': 89, 'temperature': 0, 'rainfall': 15, 'yield': 3}


Significant outliers in:

longitude (106 values)

NDVI, GNDVI, NDWI, SAVI

soil_moisture (89 values)

rainfall (15 values)

yield (3 values)

In [43]:
import pandas as pd

# Load dataset
df = pd.read_csv('yield_prediction_dataset.csv')

# Drop non-numeric or irrelevant columns
df.drop(columns=['field_id', 'date_of_image'], inplace=True)

# One-hot encode the 'crop_type' feature
df = pd.get_dummies(df, columns=['crop_type'])

# Drop any column with all NaN values
df.dropna(axis=1, how='all', inplace=True)

# Fill remaining missing values with the mean (numeric columns only)
df.fillna(df.mean(numeric_only=True), inplace=True)

# Function to remove outliers using the IQR method
def remove_outliers_iqr(data):
    df_clean = data.copy()
    numeric_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]

    return df_clean

# Apply outlier removal
df_no_outliers = remove_outliers_iqr(df)

# Save cleaned data to new CSV file
df_no_outliers.to_csv('yield_prediction1.csv', index=False)

# Print summary
print("✅ Outliers removed and cleaned dataset saved to 'yield_prediction1.csv'")
print(f"Original shape: {df.shape}")
print(f"New shape:      {df_no_outliers.shape}")


✅ Outliers removed and cleaned dataset saved to 'yield_prediction1.csv'
Original shape: (1625, 40)
New shape:      (1389, 40)


In [47]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Function to detect outliers using IQR
def detect_outliers(df, columns):
    outliers = {}
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers[col] = ((df[col] < lower) | (df[col] > upper)).sum()
    return outliers

# Detect outliers in all numeric columns
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns
outlier_counts = detect_outliers(df, numeric_columns)

# Display the results
outlier_df = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["Outlier Count"])
outlier_df = outlier_df[outlier_df["Outlier Count"] > 0]  # Show only columns that still have outliers

print("=== Remaining Outliers in Cleaned Data ===")
print(outlier_df)


=== Remaining Outliers in Cleaned Data ===
           Outlier Count
longitude             92
GNDVI                  3
NDWI                   3


In [51]:
import pandas as pd

# Load existing cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Function to remove outliers from all numeric columns using IQR
def remove_all_outliers(df):
    df_clean = df.copy()
    numeric_cols = df_clean.select_dtypes(include=["float64", "int64"]).columns

    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
    
    return df_clean

# Apply outlier removal
df_fully_cleaned = remove_all_outliers(df)

# Save the fully cleaned dataset (overwrite same file)
df_fully_cleaned.to_csv("yield_prediction1.csv", index=False)

# Summary
print("✅ All remaining outliers removed.")
print(f"New shape: {df_fully_cleaned.shape}")
print("✔️ File saved as 'yield_prediction1.csv'")


✅ All remaining outliers removed.
New shape: (1289, 40)
✔️ File saved as 'yield_prediction1.csv'


In [53]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Function to detect outliers using IQR
def detect_outliers(df, columns):
    outliers = {}
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers[col] = ((df[col] < lower) | (df[col] > upper)).sum()
    return outliers

# Detect outliers in all numeric columns
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns
outlier_counts = detect_outliers(df, numeric_columns)

# Display the results
outlier_df = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["Outlier Count"])
outlier_df = outlier_df[outlier_df["Outlier Count"] > 0]  # Show only columns that still have outliers

print("=== Remaining Outliers in Cleaned Data ===")
print(outlier_df)


=== Remaining Outliers in Cleaned Data ===
       Outlier Count
GNDVI              1
NDWI               1


In [55]:
import pandas as pd

# Load the current version of the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# List of columns to clean
columns_to_clean = ['GNDVI', 'NDWI']

# IQR-based removal for specific columns
for col in columns_to_clean:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower) & (df[col] <= upper)]

# Save the updated cleaned dataset
df.to_csv("yield_prediction1.csv", index=False)

# Report
print("✅ Final outliers from GNDVI and NDWI removed.")
print(f"Final shape: {df.shape}")
print("✔️ File saved as 'yield_prediction1.csv'")


✅ Final outliers from GNDVI and NDWI removed.
Final shape: (1288, 40)
✔️ File saved as 'yield_prediction1.csv'


In [61]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Function to detect outliers using IQR
def detect_outliers(df, columns):
    outliers = {}
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers[col] = ((df[col] < lower) | (df[col] > upper)).sum()
    return outliers

# Detect outliers in all numeric columns
numeric_columns = df.select_dtypes(include=["float64", "int64"]).columns
outlier_counts = detect_outliers(df, numeric_columns)

# Display the results
outlier_df = pd.DataFrame.from_dict(outlier_counts, orient="index", columns=["Outlier Count"])
outlier_df = outlier_df[outlier_df["Outlier Count"] > 0]  # Show only columns that still have outliers

print("=== Remaining Outliers in Cleaned Data ===")
print(outlier_df)
print(f"Final shape: {df.shape}")
# Save the updated cleaned dataset
df.to_csv("yield_prediction1.csv", index=False)

=== Remaining Outliers in Cleaned Data ===
Empty DataFrame
Columns: [Outlier Count]
Index: []
Final shape: (1288, 40)


## k-Nearest Neighbors-regression

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Instantiate KNN regressors
reg_01 = KNeighborsRegressor(n_neighbors=1)
reg_03 = KNeighborsRegressor(n_neighbors=3)
reg_09 = KNeighborsRegressor(n_neighbors=9)

# Fit the models
reg_01.fit(X_train, y_train)
reg_03.fit(X_train, y_train)
reg_09.fit(X_train, y_train)

# Predict and evaluate
results = {
    "Model": [],
    "R2 Score": [],
    "MSE": []
}

for model, label in zip([reg_01, reg_03, reg_09], ["KNN-1", "KNN-3", "KNN-9"]):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    results["Model"].append(label)
    results["R2 Score"].append(r2)
    results["MSE"].append(mse)

results_df = pd.DataFrame(results)
print("=== KNN Regression Results ===")
print(results_df)



=== KNN Regression Results ===
   Model  R2 Score        MSE
0  KNN-1  0.833883  10.838827
1  KNN-3  0.846104  10.041471
2  KNN-9  0.852813   9.603700


The KNN-9 model performs best in both metrics

In [5]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Instantiate models
reg_01 = KNeighborsRegressor(n_neighbors=1)
reg_03 = KNeighborsRegressor(n_neighbors=3)
reg_09 = KNeighborsRegressor(n_neighbors=9)

# Fit models
reg_01.fit(X_train, y_train)
reg_03.fit(X_train, y_train)
reg_09.fit(X_train, y_train)

# Print performance for each model
print("=== KNN Model Performance ===")
print("K: 01 Train set R^2: {:.2f}".format(reg_01.score(X_train, y_train)))
print("K: 01 Test set R^2: {:.2f}".format(reg_01.score(X_test, y_test)))
print("K: 03 Train set R^2: {:.2f}".format(reg_03.score(X_train, y_train)))
print("K: 03 Test set R^2: {:.2f}".format(reg_03.score(X_test, y_test)))
print("K: 09 Train set R^2: {:.2f}".format(reg_09.score(X_train, y_train)))
print("K: 09 Test set R^2: {:.2f}".format(reg_09.score(X_test, y_test)))


=== KNN Model Performance ===
K: 01 Train set R^2: 1.00
K: 01 Test set R^2: 0.83
K: 03 Train set R^2: 0.93
K: 03 Test set R^2: 0.85
K: 09 Train set R^2: 0.88
K: 09 Test set R^2: 0.85


## The KNN-9 model performs best in both metrics

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Apply StandardScaler to features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate models
reg_01 = KNeighborsRegressor(n_neighbors=1)
reg_03 = KNeighborsRegressor(n_neighbors=3)
reg_9 = KNeighborsRegressor(n_neighbors=9)

# Fit models using scaled data
reg_01.fit(X_train_scaled, y_train)
reg_03.fit(X_train_scaled, y_train)
reg_9.fit(X_train_scaled, y_train)

# Print performance for each model
print("=== KNN Model Performance (With Scaling) ===")
print("K: 01 Train set R^2: {:.2f}".format(reg_01.score(X_train_scaled, y_train)))
print("K: 01 Test set R^2: {:.2f}".format(reg_01.score(X_test_scaled, y_test)))
print("K: 03 Train set R^2: {:.2f}".format(reg_03.score(X_train_scaled, y_train)))
print("K: 03 Test set R^2: {:.2f}".format(reg_03.score(X_test_scaled, y_test)))
print("K: 09 Train set R^2: {:.2f}".format(reg_9.score(X_train_scaled, y_train)))
print("K: 09 Test set R^2: {:.2f}".format(reg_9.score(X_test_scaled, y_test)))


=== KNN Model Performance (With Scaling) ===
K: 01 Train set R^2: 1.00
K: 01 Test set R^2: 0.72
K: 03 Train set R^2: 0.89
K: 03 Test set R^2: 0.74
K: 09 Train set R^2: 0.68
K: 09 Test set R^2: 0.56


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Step 2: Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 4: Instantiate the models
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)   # alpha=1.0 is default; can be tuned later
lasso_reg = Lasso(alpha=0.01)  # small alpha for Lasso; can also be tuned

# Step 5: Fit the models
lin_reg.fit(X_train, y_train)
ridge_reg.fit(X_train, y_train)
lasso_reg.fit(X_train, y_train)

# Step 6: Evaluate models
models = {
    "Linear Regression": lin_reg,
    "Ridge Regression": ridge_reg,
    "Lasso Regression": lasso_reg
}

# Collect results
results = {
    "Model": [],
    "Train R2": [],
    "Test R2": [],
    "Train MSE": [],
    "Test MSE": []
}

for name, model in models.items():
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    results["Model"].append(name)
    results["Train R2"].append(r2_score(y_train, y_train_pred))
    results["Test R2"].append(r2_score(y_test, y_test_pred))
    results["Train MSE"].append(mean_squared_error(y_train, y_train_pred))
    results["Test MSE"].append(mean_squared_error(y_test, y_test_pred))

# Step 7: Show results
results_df = pd.DataFrame(results)
print("=== Regression Models Evaluation without StandardScaler ===")
print(results_df)


=== Regression Models Evaluation without StandardScaler ===
               Model  Train R2   Test R2  Train MSE  Test MSE
0  Linear Regression  0.859463  0.844940   9.078636  9.580304
1   Ridge Regression  0.857931  0.846359   9.177605  9.492616
2   Lasso Regression  0.856189  0.845627   9.290143  9.537805


In [None]:
egression Models Evaluation with StandardScaler

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Step 1: Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Step 2: Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Step 3: Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Step 4: Apply StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 5: Instantiate the models
lin_reg = LinearRegression()
ridge_reg = Ridge(alpha=1.0)
lasso_reg = Lasso(alpha=0.01)

# Step 6: Fit the models
lin_reg.fit(X_train_scaled, y_train)
ridge_reg.fit(X_train_scaled, y_train)
lasso_reg.fit(X_train_scaled, y_train)

# Step 7: Evaluate models
models = {
    "Linear Regression": lin_reg,
    "Ridge Regression": ridge_reg,
    "Lasso Regression": lasso_reg
}

results = {
    "Model": [],
    "Train R2": [],
    "Test R2": [],
    "Train MSE": [],
    "Test MSE": []
}

for name, model in models.items():
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)
    
    results["Model"].append(name)
    results["Train R2"].append(r2_score(y_train, y_train_pred))
    results["Test R2"].append(r2_score(y_test, y_test_pred))
    results["Train MSE"].append(mean_squared_error(y_train, y_train_pred))
    results["Test MSE"].append(mean_squared_error(y_test, y_test_pred))

# Step 8: Show results
results_df = pd.DataFrame(results)
print("=== Scaled Regression Models Evaluation with StandardScaler ===")
print(results_df)


=== Scaled Regression Models Evaluation with StandardScaler ===
               Model  Train R2   Test R2  Train MSE  Test MSE
0  Linear Regression  0.859463  0.844952   9.078629  9.579513
1   Ridge Regression  0.858974  0.846346   9.110229  9.493386
2   Lasso Regression  0.858869  0.846712   9.117034  9.470766


In [None]:
 Support Vector Machines

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error

# Load your cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Instantiate SVR model with default parameters (kernel='rbf')
svr_model = SVR()

# Fit the model
svr_model.fit(X_train, y_train)

# Predict
y_train_pred = svr_model.predict(X_train)
y_test_pred = svr_model.predict(X_test)

# Evaluate
print("=== Support Vector Regression (Default) ===")
print("Train R2 Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R2 Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Support Vector Regression (Default) ===
Train R2 Score: 0.5484
Test R2 Score:  0.5368
Train MSE:      28.5476
Test MSE:       30.2205


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load the cleaned dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Apply StandardScaler to features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate SVR with default parameters
svr_model = SVR()

# Fit the model
svr_model.fit(X_train_scaled, y_train)

# Predict
y_train_pred = svr_model.predict(X_train_scaled)
y_test_pred = svr_model.predict(X_test_scaled)

# Evaluate
print("=== Support Vector Regression (With Scaling) ===")
print("Train R2 Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R2 Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Support Vector Regression (With Scaling) ===
Train R2 Score: 0.7885
Test R2 Score:  0.7718
Train MSE:      13.3691
Test MSE:       14.8910


In [None]:
SVR Grid Search 

In [None]:
1 - SVR Models by Test R² with Kernel

In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load the regression dataset
df = pd.read_csv("yield_prediction1.csv")

# Define features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Apply StandardScaler to features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)

# Parameter grid
kernels = ['linear']
C_values = [0.1, 1, 10]
gamma_values = [0.01, 0.1, 1]

# Collect results
results = []

for kernel in kernels:
    for C in C_values:
        for gamma in gamma_values:
            model = SVR(kernel=kernel, C=C, gamma=gamma)
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            results.append({
                "Kernel": kernel,
                "C": C,
                "Gamma": gamma,
                "Train R2": round(r2_score(y_train, y_train_pred), 4),
                "Test R2": round(r2_score(y_test, y_test_pred), 4),
                "Train MSE": round(mean_squared_error(y_train, y_train_pred), 4),
                "Test MSE": round(mean_squared_error(y_test, y_test_pred), 4)
            })


# Show the top 10 models
print("===  SVR Models by Test R² with linear ===")


results_df = pd.DataFrame(results)
results_df


===  SVR Models by Test R² with linear ===


Unnamed: 0,Kernel,C,Gamma,Train R2,Test R2,Train MSE,Test MSE
0,linear,0.1,0.01,0.8132,0.8298,11.6286,11.5287
1,linear,0.1,0.1,0.8132,0.8298,11.6286,11.5287
2,linear,0.1,1.0,0.8132,0.8298,11.6286,11.5287
3,linear,1.0,0.01,0.811,0.8287,11.7615,11.6095
4,linear,1.0,0.1,0.811,0.8287,11.7615,11.6095
5,linear,1.0,1.0,0.811,0.8287,11.7615,11.6095
6,linear,10.0,0.01,0.811,0.8286,11.7673,11.6116
7,linear,10.0,0.1,0.811,0.8286,11.7673,11.6116
8,linear,10.0,1.0,0.811,0.8286,11.7673,11.6116


In [None]:
2 - SVR Models by Test R² with sigmoid

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load the regression dataset
df = pd.read_csv("yield_prediction1.csv")

# Define features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Apply StandardScaler to features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)

# Parameter grid
kernels = ['sigmoid']
C_values = [0.1, 1, 10]
gamma_values = [0.01, 0.1, 1]

# Collect results
results = []

for kernel in kernels:
    for C in C_values:
        for gamma in gamma_values:
            model = SVR(kernel=kernel, C=C, gamma=gamma)
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            results.append({
                "Kernel": kernel,
                "C": C,
                "Gamma": gamma,
                "Train R2": round(r2_score(y_train, y_train_pred), 4),
                "Test R2": round(r2_score(y_test, y_test_pred), 4),
                "Train MSE": round(mean_squared_error(y_train, y_train_pred), 4),
                "Test MSE": round(mean_squared_error(y_test, y_test_pred), 4)
            })


# Show the top 10 models
print("===  SVR Models by Test R² with sigmoid ===")


results_df = pd.DataFrame(results)
results_df


===  SVR Models by Test R² with sigmoid ===


Unnamed: 0,Kernel,C,Gamma,Train R2,Test R2,Train MSE,Test MSE
0,sigmoid,0.1,0.01,0.2556,0.2372,46.3375,51.6803
1,sigmoid,0.1,0.1,0.7356,0.7448,16.4583,17.2937
2,sigmoid,0.1,1.0,-0.4152,-0.1932,88.0892,80.8473
3,sigmoid,1.0,0.01,0.8066,0.8036,12.0387,13.3093
4,sigmoid,1.0,0.1,-4.5906,-3.5807,347.9825,310.3667
5,sigmoid,1.0,1.0,-102.487,-85.1566,6441.5083,5837.5066
6,sigmoid,10.0,0.01,0.7926,0.8323,12.9092,11.3649
7,sigmoid,10.0,0.1,-571.1418,-464.0145,35612.7579,31506.8884
8,sigmoid,10.0,1.0,-9295.5682,-7988.4201,578661.4553,541320.2645


In [None]:
3 - SVR Models by Test R² with poly

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load the regression dataset
df = pd.read_csv("yield_prediction1.csv")

# Define features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Apply StandardScaler to features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)

# Parameter grid
kernels = ['poly']
C_values = [0.1, 1, 10]
gamma_values = [0.01, 0.1, 1]

# Collect results
results = []

for kernel in kernels:
    for C in C_values:
        for gamma in gamma_values:
            model = SVR(kernel=kernel, C=C, gamma=gamma)
            model.fit(X_train, y_train)
            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)
            results.append({
                "Kernel": kernel,
                "C": C,
                "Gamma": gamma,
                "Train R2": round(r2_score(y_train, y_train_pred), 4),
                "Test R2": round(r2_score(y_test, y_test_pred), 4),
                "Train MSE": round(mean_squared_error(y_train, y_train_pred), 4),
                "Test MSE": round(mean_squared_error(y_test, y_test_pred), 4)
            })


# Show the top 10 models
print("===  SVR Models by Test R² with poly ===")


results_df = pd.DataFrame(results)
results_df


===  SVR Models by Test R² with poly ===


Unnamed: 0,Kernel,C,Gamma,Train R2,Test R2,Train MSE,Test MSE
0,poly,0.1,0.01,0.0109,0.0015,61.5651,67.6533
1,poly,0.1,0.1,0.8437,0.7374,9.7297,17.7927
2,poly,0.1,1.0,0.9691,0.6527,1.9261,23.5291
3,poly,1.0,0.01,0.0986,0.0695,56.1057,63.0469
4,poly,1.0,0.1,0.903,0.8169,6.0361,12.4056
5,poly,1.0,1.0,0.9785,0.2246,1.3367,52.5344
6,poly,10.0,0.01,0.4506,0.3347,34.1957,45.0768
7,poly,10.0,0.1,0.9497,0.8113,3.1329,12.7831
8,poly,10.0,1.0,0.9792,-0.4952,1.2919,101.3039


In [None]:
RandomForestRegressor without StandardScaler

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

# Instantiate and fit RandomForestRegressor (no scaling)
rf_model = RandomForestRegressor(n_estimators=500, random_state=100)
rf_model.fit(X_train, y_train)

# Predict
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Evaluate
print("=== Random Forest Regressor Performance (No Scaling) ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Random Forest Regressor Performance (No Scaling) ===
Train R² Score: 0.9902
Test R² Score:  0.9394
Train MSE:      0.6006
Test MSE:       4.0840


In [None]:
Random Forest Regressor With StandardScaler

In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

# Apply StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit model
rf_scaled = RandomForestRegressor(n_estimators=500, random_state=100)
rf_scaled.fit(X_train_scaled, y_train)

# Predict
y_train_pred = rf_scaled.predict(X_train_scaled)
y_test_pred = rf_scaled.predict(X_test_scaled)

# Evaluate
print("=== Random Forest Regressor (With StandardScaler) ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Random Forest Regressor (With StandardScaler) ===
Train R² Score: 0.9902
Test R² Score:  0.9394
Train MSE:      0.6006
Test MSE:       4.0843


In [None]:
Random Forest Code (with tuned parameters)

In [119]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

# Optimized Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=1000,         # More trees
    max_depth=30,              # Deep trees for complexity
    min_samples_split=2,       # Default
    min_samples_leaf=1,        # Allow fine granularity
    max_features=1.0,          # Use all features
    bootstrap=True,
    random_state=100
)

# Train model
rf_model.fit(X_train, y_train)

# Predict
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Evaluate
print("=== Optimized Random Forest Regressor Performance ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Optimized Random Forest Regressor Performance ===
Train R² Score: 0.9902
Test R² Score:  0.9396
Train MSE:      0.6035
Test MSE:       4.0666


In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

# Refined Random Forest
rf_model = RandomForestRegressor(
    n_estimators=1000,
    max_depth=40,
    min_samples_split=3,
    min_samples_leaf=1,
    max_features=0.8,         # 80% of features per split
    bootstrap=True,
    random_state=100,
    n_jobs=-1
)

# Fit and predict
rf_model.fit(X_train, y_train)
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

# Evaluate
print("=== Refined Random Forest Regressor Performance ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Refined Random Forest Regressor Performance ===
Train R² Score: 0.9889
Test R² Score:  0.9388
Train MSE:      0.6830
Test MSE:       4.1207


In [None]:
Decision Tree with StandardScaler

In [103]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Initialize and train the Decision Tree model
dt = DecisionTreeRegressor(random_state=100)
dt.fit(X_train, y_train)

# Predict
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

# Evaluate
print("=== Decision Tree Regressor (No Scaling) ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Decision Tree Regressor (No Scaling) ===
Train R² Score: 1.0000
Test R² Score:  0.9012
Train MSE:      0.0000
Test MSE:       6.5005


In [None]:
Decision Tree Without StandardScaler

In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Apply StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the Decision Tree model
dt_scaled = DecisionTreeRegressor(random_state=100)
dt_scaled.fit(X_train_scaled, y_train)

# Predict
y_train_pred = dt_scaled.predict(X_train_scaled)
y_test_pred = dt_scaled.predict(X_test_scaled)

# Evaluate
print("=== Decision Tree Regressor (With StandardScaler) ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Decision Tree Regressor (With StandardScaler) ===
Train R² Score: 1.0000
Test R² Score:  0.9012
Train MSE:      0.0000
Test MSE:       6.5005


In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Load dataset
df = pd.read_csv("yield_prediction1.csv")

# Separate features and target
X = df.drop("yield", axis=1)
y = df["yield"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Apply StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and fit model
rf_scaled = RandomForestRegressor(n_estimators=500, random_state=42)
rf_scaled.fit(X_train_scaled, y_train)

# Predict
y_train_pred = rf_scaled.predict(X_train_scaled)
y_test_pred = rf_scaled.predict(X_test_scaled)

# Evaluate
print("=== Random Forest Regressor (With StandardScaler) ===")
print("Train R² Score: {:.4f}".format(r2_score(y_train, y_train_pred)))
print("Test R² Score:  {:.4f}".format(r2_score(y_test, y_test_pred)))
print("Train MSE:      {:.4f}".format(mean_squared_error(y_train, y_train_pred)))
print("Test MSE:       {:.4f}".format(mean_squared_error(y_test, y_test_pred)))


=== Random Forest Regressor (With StandardScaler) ===
Train R² Score: 0.9926
Test R² Score:  0.9218
Train MSE:      0.4754
Test MSE:       4.8334
