In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Load datasets
# Replace file paths with your actual file paths if needed.
air_data = pd.read_csv('air_pollution_plant_data.csv')
plant_info = pd.read_csv('plant_info.csv')

# The air quality data is expected to have the pollutant columns
# 'CO', 'NO2', 'SO2', 'PM2.5', 'PM10' and a target column named 'plant'
# (assumed to hold 4 classes: Snake Plant, Bamboo Palm, Spider Plant,
# Areca Palm -- confirm against the actual CSV).
features = ['CO', 'NO2', 'SO2', 'PM2.5', 'PM10']
target = 'plant'

# Verify that all required columns exist in the dataset
for feature in features:
    if feature not in air_data.columns:
        raise ValueError(f"Feature column '{feature}' not found in dataset")
if target not in air_data.columns:
    raise ValueError(f"Target column '{target}' not found in dataset")

# A median cannot fill a missing categorical label, and unlabeled rows are
# unusable for supervised training and for the stratified split below,
# so drop them explicitly and say how many were removed.
labeled_data = air_data.dropna(subset=[target])
n_unlabeled = len(air_data) - len(labeled_data)
if n_unlabeled:
    print(f"Dropped {n_unlabeled} row(s) with a missing '{target}' label")

# Note: X is intentionally left un-imputed; imputation happens per split below.
X = labeled_data[features]
y = labeled_data[target]

# Split the data with stratification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Handle missing pollutant readings AFTER the split, using medians computed
# from the training rows only, so no test-set statistics leak into training.
# numeric_only=True avoids the TypeError that DataFrame.median() raises on
# non-numeric columns under pandas >= 2.0 (the original whole-frame median
# included the string target column).
train_medians = X_train.median(numeric_only=True)
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Model: standardise the pollutant features, then classify with a random forest.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Candidate hyperparameters; the 'clf__' prefix routes each one to the
# 'clf' step of the pipeline.
param_grid = dict(
    clf__n_estimators=[50, 100, 200],
    clf__max_depth=[None, 5, 10],
    clf__min_samples_split=[2, 5],
)

# Shuffled, seeded 5-fold stratified cross-validation for the search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Run the search on the training split and keep the best pipeline found.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Score the tuned pipeline on the held-out test split.
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_line = "Accuracy: {:.2f}".format(acc)
report_text = classification_report(y_true=y_test, y_pred=y_pred)

# Show the winning grid-search configuration followed by the test metrics.
print("Best Hyperparameters:", grid_search.best_params_)
print(accuracy_line)
print("Classification Report:\n", report_text)

# The lookup below is keyed on plant_info's 'Plant' column, so fail early if it is absent.
if 'Plant' not in plant_info.columns:
    raise ValueError("'Plant' column not found in plant_info dataset")

# Example: recommend a plant for one new set of pollutant readings.
# The values are placeholders, given in the same order as `features`.
sample_input = np.array([[0.5, 30, 15, 80, 120]])  # dummy values; update as needed

# Wrap the sample in a DataFrame carrying the training column names.
# No manual scaling is done here: best_model is the fitted pipeline, so its
# scaler step is applied inside predict().
sample_input_df = pd.DataFrame(sample_input, columns=features)
predicted_plant = best_model.predict(sample_input_df)[0]

# Look the predicted label up in plant_info; warn if it has no matching entry.
known_plants = plant_info['Plant']
if predicted_plant in known_plants.values:
    # Use the first matching row as the plant's detail record.
    matching_rows = plant_info[known_plants == predicted_plant]
    selected_plant_info = matching_rows.iloc[0]
    print("\nRecommended Plant:", predicted_plant)
    print("Plant Info:", selected_plant_info.to_dict())
else:
    print(f"Warning: Predicted plant '{predicted_plant}' not found in plant_info dataset")
    print("Available plants:", known_plants.unique())

# Optionally, save the trained model for later use
# joblib.dump(best_model, 'plant_recommendation_model.pkl')