In [15]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score)
import joblib



In [28]:

# Figure defaults: white figure background, Times New Roman for all text
plt.rcParams.update({
    'figure.facecolor': 'white',
    'font.family': 'Times New Roman',
})

# Read the encoded dataset (a comma-separated file) into a DataFrame
data = pd.read_csv('leaveoneout_encoded.csv')

# 'target' is the label; every other column is used as a feature
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Define a function to plot confusion matrix
def plot_cm(model):
    """Save the confusion matrix of ``model`` on the held-out test split as a PNG.

    Reads the notebook-level ``X_test`` and ``y_test``. The image is written to
    ``figures/<ModelClassName>_confusion_matrix.png`` (the directory is created
    if needed) and the figure is closed afterwards.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    """
    y_pred = model.predict(X_test)

    # Same label set and order confusion_matrix would infer on its own (sorted
    # union of true and predicted labels); computed explicitly so the axes can
    # show the real class labels instead of the positional 0..n-1 ticks.
    class_labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot()
    # Address the display's own axes/figure rather than pyplot's "current" ones,
    # so the title and the close cannot land on an unrelated figure.
    disp.ax_.set_title(f'{type(model).__name__} Confusion Matrix', fontdict={'family': 'Times New Roman'})

    os.makedirs('figures', exist_ok=True)
    disp.figure_.savefig(f'figures/{type(model).__name__}_confusion_matrix.png', dpi=600, bbox_inches='tight')
    plt.close(disp.figure_)

# Define a function to plot ROC curve
def plot_roc(model):
    """Save one-vs-rest ROC curves of ``model`` on the held-out test split as a PNG.

    Reads the notebook-level ``X_test`` and ``y_test``. One curve is drawn per
    class in ``model.classes_``, because the columns of ``predict_proba``
    follow that order. The image is written to
    ``figures/<ModelClassName>_roc_curve.png`` (the directory is created if
    needed) and the figure is closed afterwards.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba`` and ``classes_``.
    """
    y_true = np.asarray(y_test)
    y_pred_prob = model.predict_proba(X_test)

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal

    # Pair each probability column with the class it belongs to. Building the
    # indicator per class (instead of indexing a label_binarize matrix derived
    # from the test labels) keeps columns aligned even when the test split lacks
    # a trained class, and also works for two-class targets, where
    # label_binarize yields a single column.
    for column, class_label in enumerate(model.classes_):
        is_positive = (y_true == class_label).astype(int)
        # A ROC curve needs both positives and negatives; skip degenerate classes.
        if is_positive.min() == is_positive.max():
            continue
        fpr, tpr, _ = roc_curve(is_positive, y_pred_prob[:, column])
        # Legend shows the actual class label, not the column position.
        ax.plot(fpr, tpr, label=f'Class {class_label}: AUC = {auc(fpr, tpr):.2f}')

    ax.set_xlabel('False Positive Rate', fontdict={'family': 'Times New Roman'})
    ax.set_ylabel('True Positive Rate', fontdict={'family': 'Times New Roman'})
    ax.set_title(f'{type(model).__name__} ROC Curve', fontdict={'family': 'Times New Roman'})
    ax.legend(loc="lower right")

    os.makedirs('figures', exist_ok=True)
    fig.savefig(f'figures/{type(model).__name__}_roc_curve.png', dpi=600, bbox_inches='tight')
    plt.close(fig)

# One metrics record per evaluated model is collected here
results = []

# Estimators to train and evaluate
models = [
    RandomForestClassifier(random_state=42)
    # Add more models here if needed
]

# Class count and one-vs-rest indicator matrix of the test labels
# (kept as notebook-level names)
_test_classes = np.unique(y_test)
n_classes = len(_test_classes)
y_test_bin = label_binarize(y_test, classes=_test_classes)

# Keyword arguments shared by the class-weighted precision / recall / F1 calls
_weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

# Fit every model, score it on the test split, and save its plots
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics on the held-out test split
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, **_weighted_kwargs)
    recall = recall_score(y_test, y_pred, **_weighted_kwargs)
    f1 = f1_score(y_test, y_pred, **_weighted_kwargs)

    # Record the scores under the model's class name
    results.append(dict(zip(
        ('Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (type(model).__name__, accuracy, precision, recall, f1),
    )))

    # Write the confusion-matrix and ROC figures to disk
    plot_cm(model)
    plot_roc(model)

# Tabulate the collected records
results_df = pd.DataFrame(results)


In [29]:
# Build the metrics table from the collected per-model records
results_df = pd.DataFrame(results)

# Report that evaluation finished and show the table
print("Metrics computed and plots generated for each model.")
print("\nPerformance Metrics:")
print(results_df)

# Persist the table as a tab-separated file, then read it back and print it
# as a check on what was written
results_df.to_csv('performance.tab', sep='\t', index=False)
print("\nPerformance metrics saved to 'performance.tab'. Contents of this file:")
_saved_metrics = pd.read_csv('performance.tab', sep='\t')
print(_saved_metrics)

# List whatever is in the figures directory, if it exists
if not os.path.exists('figures'):
    print("\nNo figures were saved.")
else:
    print("\nList of figures saved:")
    print(os.listdir('figures'))


Metrics computed and plots generated for each model.

Performance Metrics:
                    Model  Accuracy  Precision    Recall  F1 Score
0  RandomForestClassifier  0.984848   0.987879  0.984848  0.985361

Performance metrics saved to 'performance.tab'. Contents of this file:
                    Model  Accuracy  Precision    Recall  F1 Score
0  RandomForestClassifier  0.984848   0.987879  0.984848  0.985361

List of figures saved:
['RandomForestClassifier_confusion_matrix.png', 'RandomForestClassifier_roc_curve.png']


In [30]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates: three values for each of four settings
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation on the training split
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

# Estimator fitted with the best-scoring parameter combination
best_rf_model = grid_search.best_estimator_

# Persist the tuned model
best_model_filename = 'best_random_forest_model.pkl'
joblib.dump(best_rf_model, best_model_filename)
print(f"Best RandomForestClassifier model saved as '{best_model_filename}'")

# Fit and persist a default-parameter forest as the untuned baseline
pre_optimization_model_filename = 'pre_optimization_random_forest_model.pkl'
pre_optimization_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
joblib.dump(pre_optimization_model, pre_optimization_model_filename)
print(f"Pre-optimization RandomForestClassifier model saved as '{pre_optimization_model_filename}'")


Best RandomForestClassifier model saved as 'best_random_forest_model.pkl'
Pre-optimization RandomForestClassifier model saved as 'pre_optimization_random_forest_model.pkl'


In [31]:
import pandas as pd
import joblib
import numpy as np

# Predictor columns whose raw values are replaced by numeric encodings
FEATURE_COLUMNS = ('temperature', 'package', 'preservative', 'sterilization')


def encode_new_samples(samples, reference, reference_encoded, feature_columns=FEATURE_COLUMNS):
    """Compute numeric encodings for the feature columns of ``samples``.

    For each row of ``samples``:

    * if ``reference`` has rows whose values equal the sample's on every
      feature column, each feature is encoded as the mean of that column over
      the same-index rows of ``reference_encoded``;
    * otherwise each feature is encoded as the mean ``target`` of the
      ``reference`` rows sharing that single feature value; a value that never
      occurs in ``reference`` falls back to the overall ``target`` mean instead
      of producing NaN.

    Parameters
    ----------
    samples : DataFrame with the raw feature columns to encode.
    reference : DataFrame with the raw feature columns and a ``target`` column.
    reference_encoded : DataFrame with the encoded feature columns.
        NOTE(review): assumed to share its row index with ``reference``
        (same rows, same order) - confirm both files are row-aligned.
    feature_columns : names of the columns to encode.

    Returns
    -------
    DataFrame indexed like ``samples`` with one encoded column per feature.
    """
    overall_target_mean = reference['target'].mean()
    encoded_records = []

    for _, sample in samples.iterrows():
        # Rows of the reference data that agree with the sample on every feature
        exact_match = pd.Series(True, index=reference.index)
        for col in feature_columns:
            exact_match &= reference[col] == sample[col]

        if exact_match.any():
            matched_index = reference.index[exact_match.to_numpy()]
            record = {
                col: reference_encoded.loc[matched_index, col].mean()
                for col in feature_columns
            }
        else:
            record = {}
            for col in feature_columns:
                category_mean = reference.loc[reference[col] == sample[col], 'target'].mean()
                # The mean of an empty selection is NaN (category value never
                # seen); use the overall target mean so no NaN reaches the model.
                record[col] = overall_target_mean if pd.isna(category_mean) else category_mean
        encoded_records.append(record)

    return pd.DataFrame(encoded_records, index=samples.index, columns=list(feature_columns))


# Raw reference data (feature values plus 'target')
data2 = pd.read_csv("data22.csv")

# New samples to predict
newdata = pd.read_csv("newdata.csv")

# Encoded counterpart of the reference data
encoded_data = pd.read_csv('leaveoneout_encoded.csv')

# Replace the raw feature values of the new samples with their encodings
encoded_features = encode_new_samples(newdata, data2, encoded_data)
for col in FEATURE_COLUMNS:
    newdata[col] = encoded_features[col].to_numpy()

# Load the tuned model. joblib.load unpickles the file, which can execute
# arbitrary code - only load model files from a trusted source.
model = joblib.load('best_random_forest_model.pkl')

# Hand the model exactly the columns it was fitted on, in the fitted order, so
# extra or reordered columns in newdata.csv do not break or skew prediction.
# Estimators without recorded feature names get every column, as before.
feature_order = list(getattr(model, 'feature_names_in_', newdata.columns))
predictions = model.predict(newdata[feature_order])

# Print the predictions
print(predictions)


[1 2 1 1 2 2 2 2 3 2]
