In [None]:
import warnings
# NOTE(review): with no category or module argument this filter hides every
# warning raised after this point, deprecation notices included. Consider
# narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib



from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [None]:
# Load the greenhouse plant-growth dataset from the Kaggle input directory.
data_path = '/kaggle/input/greenhouse-plant-growth-metrics/Greenhouse Plant Growth Metrics.csv'
try:
    df = pd.read_csv(data_path, delimiter=',', encoding='ascii')
except Exception as e:
    # Report the failure, then re-raise: every later cell needs `df`, so
    # swallowing the error here would only resurface as a NameError below.
    print(f'Error loading data: {e}')
    raise
else:
    print('Data loaded successfully.')

# Preview the first few rows of the loaded frame.
df.head()

In [None]:
# Structural overview: column names, non-null counts and dtypes.
df.info()

# Count the missing entries in each column and report them.
missing_values = df.isna().sum()
print('Missing values in each column:', missing_values, sep='\n')

has_missing = missing_values.sum() > 0
if not has_missing:
    print('No missing values found.')
else:
    # Numeric columns: replace each gap with that column's mean.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    numeric_block = df[numeric_cols]
    df[numeric_cols] = numeric_block.fillna(numeric_block.mean())

    # Object columns: replace each gap with the placeholder 'Unknown'.
    object_cols = df.select_dtypes(include=['object']).columns
    object_block = df[object_cols]
    df[object_cols] = object_block.fillna('Unknown')
    print('Missing values were imputed.')

# Show the column dtypes once cleaning is done.
print('\nData types after cleaning:', df.dtypes, sep='\n')

In [None]:
# One-hot encode the categorical 'Class' column.
# The guard keeps this cell re-runnable: after the first run 'Class' has been
# replaced by its 'Class_*' indicator columns, and calling get_dummies again
# with columns=['Class'] would raise a KeyError.
if 'Class' in df.columns:
    df = pd.get_dummies(df, columns=['Class'])
elif not any(str(col).startswith('Class_') for col in df.columns):
    # Neither the raw column nor its indicators exist: the data does not have
    # the expected schema, so fail loudly instead of silently skipping.
    raise KeyError("Column 'Class' not found and no 'Class_*' indicator columns are present.")

df.head()

In [None]:
# Min-max scale every float64 column of the frame.
from sklearn.preprocessing import MinMaxScaler

# Labels of the columns whose dtype is exactly float64; all others are left as-is.
continuous = [name for name, dtype in df.dtypes.items() if dtype.name == 'float64']

# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# and on every float64 column -- which would include the target column if it
# is stored as float64. Confirm this is intended.
MinMax = MinMaxScaler()
scaled_values = MinMax.fit_transform(X=df[continuous])
df[continuous] = scaled_values


In [None]:
# Target is 'PHR' (plant growth rate). Features are every other column,
# minus 'Random', which is dropped when present.
y = df['PHR']
X = df.drop(['PHR', 'Random'], axis=1, errors='ignore')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest regressor on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination for the held-out predictions, printed to four decimals.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² score on test data: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Scatter the predictions against the actual values on a single axes.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, alpha=0.6, color='dodgerblue', label='Predicted vs Actual')

# Reference diagonal y = x, drawn across the range of the actual values only.
axis_min, axis_max = y_test.min(), y_test.max()
ax.plot([axis_min, axis_max], [axis_min, axis_max], 'r--', lw=2, label='Ideal: y = x')

ax.set_xlabel("Actual Values (y_test)")
ax.set_ylabel("Predicted Values (y_pred)")
ax.set_title("Actual vs Predicted Values")
ax.legend()
ax.grid(True)
plt.show()