In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import logging

In [2]:
# Configure root logging for the notebook: INFO and above, with every record
# prefixed by its timestamp and level name.
# NOTE(review): basicConfig is documented to do nothing when the root logger
# already has handlers attached -- confirm it takes effect in this kernel.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [3]:
def load_and_preprocess_data(file_path, feature_columns=None, target_column='Crop'):
    """Read the training CSV and separate it into features and labels.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.
        feature_columns: Ordered column names to use as model inputs. When
            omitted, the eight columns this notebook trains on are used:
            pH, Temp, Rain, Humidity, Nitrogen, Phosphorus, Potassium, Oxygen.
        target_column: Name of the label column. Defaults to ``'Crop'``.

    Returns:
        A tuple ``(X, y, df)`` where ``X`` is a 2-D NumPy array holding the
        feature columns in the requested order, ``y`` is a 1-D NumPy array of
        labels, and ``df`` is the complete DataFrame as read from the file.

    Raises:
        KeyError: If any requested feature or target column is absent from
            the file. The message lists every missing column.
    """
    logging.info("Loading and preprocessing data...")
    if feature_columns is None:
        feature_columns = ['pH', 'Temp', 'Rain', 'Humidity',
                           'Nitrogen', 'Phosphorus', 'Potassium', 'Oxygen']
    else:
        # Materialise so generators/tuples can be both checked and indexed.
        feature_columns = list(feature_columns)

    df = pd.read_csv(file_path)

    # Fail early with an actionable message instead of pandas' generic
    # "not in index" error; KeyError is kept so existing handlers still match.
    missing = [col for col in [*feature_columns, target_column] if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from {file_path!r}: {missing}")

    X = df[feature_columns].values
    y = df[target_column].values
    return X, y, df

def split_data(X, y, test_size=0.2, random_state=42, stratify=False):
    """Split features and labels into training and test subsets.

    Args:
        X: Feature matrix with one row per sample.
        y: Labels aligned with the rows of ``X``.
        test_size: Share of the samples held out for testing; forwarded to
            ``train_test_split``. Defaults to ``0.2``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to ``42``.
        stratify: When true, ``y`` is passed as the ``stratify`` argument so
            both subsets keep the class proportions of ``y``. Off by default
            so existing callers get exactly the split they had before.

    Returns:
        The sequence ``X_train, X_test, y_train, y_test`` produced by
        ``train_test_split``.
    """
    logging.info("Splitting data into training and testing sets...")
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        # None is train_test_split's own default, i.e. a plain shuffled split.
        stratify=y if stratify else None,
    )

def scale_features(X_train, X_test):
    """Standardise both splits with a scaler fitted on the training split.

    Args:
        X_train: Training feature matrix; the scaler is fitted on this only.
        X_test: Test feature matrix; transformed with the already fitted
            scaler and never used for fitting.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler``
        is the fitted ``StandardScaler``.
    """
    logging.info("Scaling features...")
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled, fitted_scaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def train_rf_model(X_train, y_train):
    """Tune and fit a random forest classifier with a randomized search.

    The search draws 10 candidates from a 36-combination grid
    (3 x 3 x 2 x 2 values) and scores each with 3-fold cross-validation,
    using all available cores. Both the forest and the search are seeded
    with 42.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted ``RandomizedSearchCV``.
    """
    logging.info("Training Random Forest model...")

    search_space = dict(
        n_estimators=[50, 100, 150],
        max_depth=[10, 20, None],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    )
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=42),
        param_distributions=search_space,
        n_iter=10,
        cv=3,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    best_params = search.best_params_
    logging.info(f"Best parameters: {best_params}")
    return search.best_estimator_

def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on the held-out split.

    Logs the accuracy (four decimals) and the full classification report,
    and also prints the raw accuracy value to stdout.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix, prepared the same way as the training data.
        y_test: True labels for ``X_test``.

    Returns:
        A tuple ``(accuracy, report)``: the value from ``accuracy_score`` and
        the text produced by ``classification_report``.
    """
    logging.info("Evaluating model...")
    predicted_labels = model.predict(X_test)

    # Both metrics are computed before anything is logged, as in the original flow.
    accuracy = accuracy_score(y_test, predicted_labels)
    report = classification_report(y_test, predicted_labels)

    logging.info(f"Model accuracy: {accuracy:.4f}")
    print(accuracy)
    logging.info(f"Classification Report:\n{report}")
    return accuracy, report

def save_model(model, scaler, model_path, scaler_path):
    """Persist the trained model and its scaler with joblib.

    Args:
        model: Object to write to ``model_path``.
        scaler: Object to write to ``scaler_path``.
        model_path: Destination for the model dump.
        scaler_path: Destination for the scaler dump.

    Returns:
        None. Prints a confirmation line once both dumps have completed.
    """
    logging.info("Saving model and scaler...")
    # Model first, then scaler -- same order as before.
    for artifact, destination in ((model, model_path), (scaler, scaler_path)):
        joblib.dump(artifact, destination)
    print("model and scaler saved successfully")

def predict_crop(model, scaler, new_data):
    """Predict the crop for one sample and return its class probabilities.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``; expected to be the
            one used on the training data.
        new_data: Feature values for the sample. Either a 2-D array-like with
            one row per sample, or a flat 1-D sequence describing a single
            sample (it is reshaped to one row). Only the first row's result
            is returned.

    Returns:
        A tuple ``(prediction, probabilities)``: the predicted label for the
        first row and that row's output from ``predict_proba``.
    """
    logging.info("Predicting crop...")

    # A lone sample is often passed as a flat list/array; present it as a
    # single row so it is handled like the 2-D case. 2-D input (including
    # DataFrames) is passed through untouched.
    if np.ndim(new_data) == 1:
        new_data = np.asarray(new_data).reshape(1, -1)

    new_data_scaled = scaler.transform(new_data)
    prediction = model.predict(new_data_scaled)
    probabilities = model.predict_proba(new_data_scaled)[0]
    return prediction[0], probabilities

In [4]:
import psutil

def log_memory_usage():
    """Log the ``rss`` memory figure psutil reports for this process, in MB.

    The value comes from ``psutil.Process().memory_info().rss`` and is
    divided by 1024 * 1024 before being logged with two decimals.
    """
    bytes_per_mb = 1024 * 1024
    rss_mb = psutil.Process().memory_info().rss / bytes_per_mb
    logging.info(f"Memory usage: {rss_mb:.2f} MB")

# Main execution: runs the whole pipeline end to end --
# load -> split -> scale -> train -> evaluate -> save -> one example prediction.
if __name__ == "__main__":
    # Load the CSV and carve it into scaled train/test splits.
    data_file = 'all_trainable_data.csv'
    X, y, df = load_and_preprocess_data(data_file)
    X_train, X_test, y_train, y_test = split_data(X, y)
    X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)

    # Memory is logged immediately before and after training so the two
    # readings bracket the hyper-parameter search.
    log_memory_usage()
    rf_model = train_rf_model(X_train_scaled, y_train)
    log_memory_usage()

    # evaluate_model logs the metrics and prints the raw accuracy itself;
    # the banner below repeats the accuracy rounded to four decimals.
    accuracy, report = evaluate_model(rf_model, X_test_scaled, y_test)

    print("\n" + "="*50)
    print(f"Model Accuracy: {accuracy:.4f}")
    print("="*50 + "\n")

    # Persist the model together with its scaler; both are needed to score
    # new samples the same way predict_crop does below.
    model_file = 'crop_model.joblib'
    scaler_file = 'crop_scaler.joblib'
    save_model(rf_model, scaler, model_file, scaler_file)

    # Example prediction for a single hand-written sample.
    # Values follow the feature order used when loading the data:
    # pH, Temp, Rain, Humidity, Nitrogen, Phosphorus, Potassium, Oxygen.
    new_data = np.array([[6.5, 25, 100, 70, 50, 30, 20, 80]])  # unscaled; predict_crop applies the scaler
    predicted_crop, probabilities = predict_crop(rf_model, scaler, new_data)
    logging.info(f"Predicted crop: {predicted_crop}")
    # NOTE(review): probabilities are presumably ordered like rf_model.classes_ -- confirm.
    logging.info(f"Prediction probabilities: {probabilities}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits




0.43077272727272725

Model Accuracy: 0.4308

model and scaler saved successfully
