### Contains Text Classification Pipeline which is to find surgery type from doctor notes

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import streamlit as st
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Load the doctor notes and patient records dataset
doctor_notes = pd.read_csv('doctor_notes.csv')
doctor_notes.set_index('Patient ID', inplace=True)
doctor_notes.sort_index(inplace=True)

patient_records = pd.read_csv('patient_records.csv')
patient_records.drop('Unnamed: 0', axis = 1, inplace=True)
patient_records.set_index('Patient ID', inplace=True)
patient_records.sort_index(inplace=True)

# Preprocess the data
X_notes = doctor_notes['Doctor Notes']
y_surgery = patient_records['Surgery Type']

# Split the data
X_train_notes, X_test_notes, y_train_notes, y_test_notes = train_test_split(X_notes, y_surgery, test_size=0.2, random_state=42)

# Create a text classification pipeline
text_clf = make_pipeline(TfidfVectorizer(), LogisticRegression())

# Train the model
text_clf.fit(X_train_notes, y_train_notes)

# Evaluate the model
y_pred_notes = text_clf.predict(X_test_notes)

print('Accuracy:', accuracy_score(y_test_notes, y_pred_notes))

Accuracy: 1.0


In [40]:
# Handle 'Not determined' values in the Outcome column
patient_records['Outcome'] = patient_records['Outcome'].replace('Not determined', np.nan)

# Drop rows with NaN values in the Outcome column
patient_records_encoded = patient_records.dropna(subset=['Outcome'])

# Preprocess patient records
label_encoder = {}
categorical_columns = ['Gender', 'Outcome', 'Surgery Type']  # Add other categorical columns as needed
for col in categorical_columns:
    le = LabelEncoder()
    patient_records_encoded[col] = le.fit_transform(patient_records_encoded[col])
    label_encoder[col] = le

# Define features and target for the determined dataset
X_determined = patient_records_encoded.drop('Outcome', axis=1)
y_determined = patient_records_encoded['Outcome']

# Train the Random Forest model (assuming this is the final model)
X_train, X_test, y_train, y_test = train_test_split(X_determined, y_determined, test_size=0.2, random_state=42)
# Initialize the scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
logistic_regression = LogisticRegression(random_state=42)
logistic_regression.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = logistic_regression.predict(X_test_scaled)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print('Classification Report:\n', classification_report(y_test, y_pred))

Accuracy: 0.7948717948717948
Confusion Matrix:
 [[ 8  6]
 [10 54]]
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.57      0.50        14
           1       0.90      0.84      0.87        64

    accuracy                           0.79        78
   macro avg       0.67      0.71      0.69        78
weighted avg       0.82      0.79      0.80        78



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_records_encoded[col] = le.fit_transform(patient_records_encoded[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_records_encoded[col] = le.fit_transform(patient_records_encoded[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  patient_records_encoded[col] = le.fit_transform(p