In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
# Load the raw dataset.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where this exact file exists.
csv_path = r'C:\Naveen\PLANT DISEASE (PAVITHRA)\DATASET\Disease with Weather.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Remove the 'Disease in number' column.
# `df` is rebound instead of mutated with inplace=True, and errors='ignore' is
# passed, so re-running this cell is a no-op rather than a KeyError on the
# already-removed column.
df = df.drop(columns=['Disease in number'], errors='ignore')

In [None]:
# Print the frame summary (columns, non-null counts, dtypes) as a schema check
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column is one row
df.describe().T

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Per-column extremes of the frame
min_values = df.min()
max_values = df.max()

# Print both summaries, each under its own heading
for heading, extremes in (
    ("Minimum values for each column:", min_values),
    ("\nMaximum values for each column:", max_values),
):
    print(heading)
    print(extremes)

In [None]:
# One 20-bin histogram per numerical column, drawn on a single 10x8 figure
df.hist(bins=20, figsize=(10, 8))
# df.hist leaves its figure current, so tightening the current figure is enough
plt.gcf().tight_layout()
plt.show()

In [None]:
# Side-by-side boxplots of the selected columns to eyeball outliers
boxplot_columns = ['Humidity', 'Wind Speed', 'Temperature', 'Wind Bearing', 'Visibility', 'Pressure']
sns.boxplot(data=df[boxplot_columns])
plt.show()

In [None]:
# Correlation between every column except the 'Disease' label
a = df.drop(columns=['Disease'])
correlation_matrix = a.corr()

# Annotated heatmap of the pairwise correlations
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True)
plt.show()

In [None]:
# Bar chart of how many rows carry each 'Disease' value
sns.countplot(data=df, x='Disease')
plt.show()

In [None]:
# Pairwise scatter/distribution grid, coloured by the 'Disease' label
sns.pairplot(data=df, hue="Disease")
plt.show()

In [None]:
# Row count per 'Disease' value (labels have not yet been stripped/lower-cased at this point)
df['Disease'].value_counts()

In [None]:
# Column sums within each 'Disease' group, with the label restored as a column
per_disease_totals = df.groupby('Disease').sum()
df_grouped = per_disease_totals.reset_index()
df_grouped

In [None]:
# Normalize the label text: trim surrounding whitespace, then lower-case it,
# so spelling variants of one label collapse into a single value
normalized_labels = df['Disease'].str.strip().str.lower()
df['Disease'] = normalized_labels
df

In [None]:
# Row count per 'Disease' value after the strip/lower-case normalization
df['Disease'].value_counts()

In [None]:
# Z-score every selected column, then keep only rows whose |z| is below 3 in ALL of them
zscore_columns = ['Humidity', 'Wind Speed', 'Temperature', 'Wind Bearing', 'Visibility', 'Pressure']
z_scores = stats.zscore(df[zscore_columns])
within_three_sigma = (abs(z_scores) < 3).all(axis=1)
df_no_outliers = df[within_three_sigma]  # rows with any outlying value are dropped

In [None]:
# Display the frame that remains after the z-score filter
df_no_outliers

In [None]:
# Split the filtered frame by label.
# NOTE(review): the variable names assume 'early blight' is the larger class —
# confirm against the value counts. Rows with any other label end up in neither frame.
disease_label = df_no_outliers['Disease']
df_majority = df_no_outliers[disease_label == 'early blight']
df_minority = df_no_outliers[disease_label == 'late blight']

In [None]:
# Random oversampling: draw from the minority frame, with replacement, until it
# has as many rows as the majority frame. The seed is fixed for reproducibility.
target_size = len(df_majority)
df_minority_upsampled = resample(
    df_minority,
    n_samples=target_size,
    replace=True,
    random_state=42,
)

In [None]:
# Stack the majority rows on top of the upsampled minority rows
frames_to_stack = [df_majority, df_minority_upsampled]
df_resampled = pd.concat(frames_to_stack)

In [None]:
# Class distribution of the resampled frame
sns.countplot(data=df_resampled, x='Disease')
plt.show()

In [None]:
# Target (y) is the 'Disease' column; features (X) are every other column
target_column = 'Disease'
y = df_resampled[target_column]
X = df_resampled.drop(columns=target_column)

In [None]:
# Split the data into training and testing sets.
#
# The split is done on the ORIGINAL rows (df_majority + df_minority), not on the
# already-oversampled X / y. Oversampling with replacement before splitting puts
# copies of the same minority row in both train and test, which leaks test data
# into training and inflates every reported score.
original_rows = pd.concat([df_majority, df_minority])
train_df, test_df = train_test_split(
    original_rows,
    test_size=0.3,
    random_state=42,
    stratify=original_rows['Disease'],
)

# Oversample the minority class inside the training portion only, so the models
# still train on balanced classes while the test set keeps the real distribution.
is_minority = train_df['Disease'].isin(df_minority['Disease'].unique())
train_majority = train_df[~is_minority]
train_minority_upsampled = resample(
    train_df[is_minority],
    replace=True,                   # sample with replacement to add rows
    n_samples=len(train_majority),  # match the majority count in the training set
    random_state=42,                # for reproducibility
)
train_balanced = pd.concat([train_majority, train_minority_upsampled])

X_train = train_balanced.drop('Disease', axis=1)
y_train = train_balanced['Disease']
X_test = test_df.drop('Disease', axis=1)
y_test = test_df['Disease']

In [None]:
# Standardize the features (matters for Logistic Regression).
# The scaler is fitted on the training features only and then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Base estimators handed to the grid searches; both are seeded for reproducibility
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

In [None]:
# Hyperparameter search spaces for GridSearchCV

# Random Forest: 3 * 4 * 3 * 3 = 108 candidate combinations
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Logistic Regression: only the regularization strength C varies
lr_param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['liblinear'],
)

In [None]:
# Exhaustive 5-fold, accuracy-scored search over the Random Forest grid, using all CPU cores
rf_grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
rf_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Exhaustive 5-fold, accuracy-scored search over the Logistic Regression grid, using all CPU cores
lr_grid_search = GridSearchCV(
    estimator=lr_model,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
lr_grid_search.fit(X_train_scaled, y_train)

In [None]:
# Pull the best estimator found by each grid search
best_rf_model, best_lr_model = (
    rf_grid_search.best_estimator_,
    lr_grid_search.best_estimator_,
)

In [None]:
# Predictions of each tuned model on the scaled test features
rf_pred, lr_pred = (
    best_rf_model.predict(X_test_scaled),
    best_lr_model.predict(X_test_scaled),
)

In [None]:
# Fraction of test rows each model labels correctly
rf_accuracy, lr_accuracy = (
    accuracy_score(y_test, predicted) for predicted in (rf_pred, lr_pred)
)

In [None]:
# Text classification reports for both models on the test set
rf_class_report, lr_class_report = (
    classification_report(y_test, predicted) for predicted in (rf_pred, lr_pred)
)

In [None]:
# Confusion matrices (test labels vs. predictions) for both models
rf_conf_matrix, lr_conf_matrix = (
    confusion_matrix(y_test, predicted) for predicted in (rf_pred, lr_pred)
)

In [None]:
# Plot Confusion Matrices
#
# Tick labels are derived from the data instead of being hard-coded. The previous
# 'Healthy'/'Diseased' labels did not match the classes actually modelled
# ('early blight' / 'late blight'). confusion_matrix, when called without
# `labels=`, orders rows and columns by the sorted set of labels seen in y_true
# and y_pred, so the same sorted union is rebuilt here.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    ('Random Forest Confusion Matrix', rf_conf_matrix, rf_pred),
    ('Logistic Regression Confusion Matrix', lr_conf_matrix, lr_pred),
]
for ax, (title, matrix, predictions) in zip(axes, panels):
    class_labels = sorted(set(y_test) | set(predictions))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_labels, yticklabels=class_labels)
    ax.set_title(title)
    # confusion_matrix puts true classes on rows and predicted classes on columns
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

plt.tight_layout()
plt.show()

In [None]:
# Print each model's classification report under its own heading
for heading, report in (
    ("Random Forest Classification Report:\n", rf_class_report),
    ("\nLogistic Regression Classification Report:\n", lr_class_report),
):
    print(heading, report)

In [None]:
# Data for the model comparison plot: parallel lists of display names and test accuracies
accuracy_by_model = {
    'Random Forest': rf_accuracy,
    'Logistic Regression': lr_accuracy,
}
model_names = list(accuracy_by_model)
model_accuracies = list(accuracy_by_model.values())

In [None]:
# Bar chart comparing the two models' test accuracy on a fixed 0-1 scale
accuracy_fig, accuracy_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=model_names, y=model_accuracies, palette='Blues', ax=accuracy_ax)
accuracy_ax.set_title('Model Accuracy Comparison')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_ylim(0, 1)
plt.show()

In [None]:
# Persist both tuned models and the fitted scaler to the working directory.
# joblib files are pickle-based: only load them back from a trusted source.
artifacts = (
    (best_rf_model, 'random_forest_model.pkl'),
    (best_lr_model, 'logistic_regression_model.pkl'),
    (scaler, 'standard_scaler.pkl'),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("Both models and the scaler have been saved as 'random_forest_model.pkl', 'logistic_regression_model.pkl', and 'standard_scaler.pkl'")

In [None]:
# Display the 'late blight' rows selected from the outlier-filtered frame (before upsampling)
df_minority

In [None]:
# Display the 'early blight' rows selected from the outlier-filtered frame
df_majority