In [None]:
# Import necessary libraries
import math
import pickle

import numpy as np
import pandas as pd
# Import regression models instead of classification models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


def encode_categorical_tokens(frame, token_columns):
    """Return a copy of *frame* with only its non-numeric token columns one-hot encoded.

    Numeric token columns are left untouched so the regressors can treat them
    as continuous quantities; turning every distinct number into its own
    indicator column would make the model blind to values it has not seen.

    Args:
        frame: DataFrame holding the raw feature columns (and any others).
        token_columns: names of the feature columns to inspect.

    Returns:
        A new DataFrame. Non-numeric columns listed in *token_columns* are
        replaced by indicator columns named ``<column>_<level>`` (the first
        level of each is dropped); every other column is kept as is.

    Raises:
        KeyError: if a name in *token_columns* is not a column of *frame*.
    """
    categorical = [
        col for col in token_columns
        if not pd.api.types.is_numeric_dtype(frame[col])
    ]
    if not categorical:
        # Nothing to encode; still hand back a copy so callers never alias
        # the input frame.
        return frame.copy()
    return pd.get_dummies(frame, columns=categorical, drop_first=True)


# The guard is true both in a notebook kernel and when run as a script, so the
# training below still executes there; it only keeps an import of this code
# from triggering file I/O and model fitting.
if __name__ == "__main__":
    # Load the dataset (absolute path; adjust it if the workbook lives elsewhere)
    file_name = '/content/ML Project Data fixed.xlsx'
    df = pd.read_excel(file_name)

    # Data Preprocessing
    # 'Label' is the regression target and is used as-is (no label encoding).
    # Only the non-numeric feature columns among Token_0..Token_9 are one-hot
    # encoded; numeric ones stay continuous.
    token_columns = [f'Token_{i}' for i in range(10)]
    df_encoded = encode_categorical_tokens(df, token_columns)

    # Separate features and target
    X = df_encoded.drop('Label', axis=1)
    y = df_encoded['Label']

    # Split data into train and test sets (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Machine Learning Models
    models = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        # The scaler is bundled with SVR in one pipeline so that, if this model
        # wins, the pickled object scales incoming data by itself instead of
        # depending on a scaler that was never saved.
        "SVM": make_pipeline(StandardScaler(), SVR()),
    }

    # Training and evaluating the models
    best_model = None
    best_model_name = None
    best_rmse = float('inf')

    for model_name, model in models.items():
        print(f"Training {model_name}...")

        # Every entry accepts the unscaled features; the SVM pipeline scales
        # internally.
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)

        # Calculate RMSE for the test dataset
        rmse_test = math.sqrt(mean_squared_error(y_test, y_test_pred))

        # Keep the model with the lowest test RMSE seen so far
        if rmse_test < best_rmse:
            best_rmse = rmse_test
            best_model = model
            best_model_name = model_name

        print(f"{model_name} RMSE on Test Data: {rmse_test}")

    if best_model is None:
        # Avoid pickling None and failing later with an unrelated error.
        raise RuntimeError("No model produced a finite RMSE; nothing to save.")

    # Save the best model
    with open('best_model.pkl', 'wb') as f:
        pickle.dump(best_model, f)

    # Save the feature columns after preprocessing so that inference code can
    # rebuild inputs with the same names and order
    with open('feature_columns.pkl', 'wb') as f:
        pickle.dump(X_train.columns.tolist(), f)

    print(f"Best model: {best_model_name} with RMSE: {best_rmse}")

Training Random Forest...
Random Forest RMSE on Test Data: 68.57110944031409
Training Linear Regression...
Linear Regression RMSE on Test Data: 56.48873255260203
Training Decision Tree...
Decision Tree RMSE on Test Data: 71.54451380364534
Training SVM...
SVM RMSE on Test Data: 70.57112988304745
Best model: Linear Regression with RMSE: 56.48873255260203


In [None]:
import subprocess
import sys
import streamlit as st
import pandas as pd
import pickle

def build_model_input(raw_inputs, required_columns):
    """Build the single-row feature frame in the column layout used for training.

    Each raw feature is mapped onto the training columns as follows:

    * if the feature name itself is a training column, its value is passed
      through unchanged (feature kept numeric during training);
    * otherwise, if ``"<name>_<value>"`` is a training column (a one-hot
      level), that column is set to 1;
    * otherwise the feature contributes nothing (for example the level that
      was dropped as the baseline, or a value with no matching column).

    All remaining training columns are filled with 0.

    Args:
        raw_inputs: mapping of raw feature name to the value chosen by the user.
        required_columns: ordered sequence of column names saved at training time.

    Returns:
        A one-row DataFrame whose columns are exactly *required_columns*, in order.
    """
    row = dict.fromkeys(required_columns, 0)
    for name, value in raw_inputs.items():
        if name in row:
            row[name] = value
            continue
        level = f"{name}_{value}"
        if level in row:
            row[level] = 1
    return pd.DataFrame([row], columns=list(required_columns))


# The guard is true under `streamlit run` and in a notebook kernel, so the app
# still runs there; it only keeps an import of this code from starting the UI.
if __name__ == "__main__":
    # Force install scikit-learn if not found
    try:
        import sklearn
    except ModuleNotFoundError:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "scikit-learn"])
        import sklearn  # Import again after installation

    # Load the pre-trained model
    # NOTE: unpickling executes code embedded in the file; only load artifacts
    # that were produced by the trusted training script.
    with open('best_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)

    # Load the feature columns used during training
    with open('feature_columns.pkl', 'rb') as f:
        required_columns = pickle.load(f)

    # Title of the app
    st.title("GUI for Pharmaceutical Removal via Biochar Adsorption")

    # Sidebar inputs for user preferences
    st.sidebar.header("Inputs")

    # NOTE(review): these option strings presumably have to match the spelling
    # used in the training data exactly - confirm before "correcting" any.
    TP = st.sidebar.selectbox(
        "Pharmaceutical Type",
        [
            'Benzocain', 'Ciprofloxacin', 'Citalopram', 'Diclofenac',
            'Dimetridazole', 'Floxentine', 'ibuprofen', 'Metronidazole',
            'Nitroimidazole', 'Norfloxacin', 'Oxytetracycline',
            'Salicylic acid', 'Sulfadiazine', 'Sulfamethazine',
            'Sulfamethoxazole', 'Tetracycline', 'Triclosan'
        ]
    )
    # Slider arguments are: label, min, max, initial value, step
    temp = st.sidebar.slider("Temperature", 300, 950, 300, 1)
    time = st.sidebar.slider("Time (min)", 0.1, 480.0, 0.1, 0.1)
    PS = st.sidebar.slider("PS", 1.32, 213.29, 1.32, 0.01)
    BET = st.sidebar.slider("BET", 0.48, 1838.86, 0.48, 0.01)
    PV = st.sidebar.slider("PV", 0.001, 1.03, 0.001, 0.001)
    carbon = st.sidebar.slider("Carbon (%)", 9.46, 89.57, 9.46, 0.01)
    hydrogen = st.sidebar.slider("Hydrogen (%)", 0.0, 10.30, 0.0, 0.01)
    nitrogen = st.sidebar.slider("Nitrogen (%)", 0.0, 14.15, 0.0, 0.01)
    oxygen = st.sidebar.slider("Oxygen (%)", 0.67, 55.01, 0.67, 0.01)

    # Add a submit button
    if st.button("Submit"):
        # Raw inputs keyed by the feature names used in the training data
        raw_inputs = {
            'Token_0': TP,
            'Token_1': temp,
            'Token_2': time,
            'Token_3': PS,
            'Token_4': BET,
            'Token_5': PV,
            'Token_6': carbon,
            'Token_7': hydrogen,
            'Token_8': nitrogen,
            'Token_9': oxygen,
        }

        # Map the inputs onto the exact columns the model was trained on, so
        # that numeric inputs are not silently replaced by zeros
        input_encoded = build_model_input(raw_inputs, required_columns)

        # Make the prediction
        prediction = model.predict(input_encoded)[0]

        # Display the prediction
        st.subheader(f"Qm (mg/g): {prediction}")