# Overview
This notebook documents the process of developing a machine learning model for predicting gout occurrence using data in the Amazon SageMaker environment. Gout is a type of arthritis caused by the buildup of uric acid crystals in the joints, leading to inflammation and pain. By leveraging machine learning techniques, we aim to predict the likelihood of gout in individuals based on various features.


# Libraries

In [3]:
import pandas as pd
import psycopg2
# import spacy
import time 
# Load spaCy's English model
# nlp = spacy.load("en_core_web_sm")

from sklearn.pipeline import Pipeline
from predict_and_decode import PredictAndDecode


In [None]:
print(hello)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role
import seaborn as sns
import matplotlib.pyplot as plt

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\sourodeep.das\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py)

# DB Connections

In [11]:
def db_connection(query):
    # Database connection parameters
    db_name = 'factihealth'   # Database name
    db_user = 'fh_user'  # Username
    db_password = 'Facti@874'  # Password
    db_host = 'redshift-cluster-factihealth.cuzgotkwtow6.ap-south-1.redshift.amazonaws.com'  # Cluster endpoint
    db_port = 5439  # Port
    
    try:
        # Connect to the database
        conn = psycopg2.connect(
            dbname=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port
        )
        print("Connected to the database successfully")
        
        # Create a cursor object
        cur = conn.cursor()
        
        # Execute a query
        cur.execute(query)
        
        # Fetch all rows
        rows = cur.fetchall()
        
        if not rows:
            print("No rows returned from the query.")
            return None

        # Get column names from the cursor description
        col_names = [desc[0] for desc in cur.description]
        
        # Create a DataFrame from the fetched rows and column names
        df = pd.DataFrame(rows, columns=col_names)
        
        # Close the cursor and connection
        cur.close()
        conn.close()
        
        return df
    except Exception as e:
        print(f"Database connection failed due to {e}")
        return None

In [12]:
# Example usage
query = 'Select * from mimic.gout_corpus'
data = db_connection(query)
dataset = data.copy(deep=True)
# Check if dataset is not None before using tail
display(dataset.tail(5))

Connected to the database successfully


Unnamed: 0,chief_complaint,predict,consensus,year
8432,"stepped on a nail at home with right foot, pai...",N,N,2020
8433,""" I was having a breakdown."" R/T stress and de...",N,-,2020
8434,"""I tried to jump in front of a car"" Pt states ...",N,-,2020
8435,Abdominal pain x 1 week. Denies PMH,N,-,2020
8436,"Rash/sores across body, infection ro left thum...",N,N,2020


# Data Cleaning

In [13]:
dataset['predict'].unique()

array(['N', 'Y', 'U', '-'], dtype=object)

In [14]:
dataset['predict'].value_counts(normalize=True)*100

predict
N    96.811663
U     1.848998
Y     1.315634
-     0.023705
Name: proportion, dtype: float64

In [15]:
## Remove the Unknown Data
dataset = dataset[~(dataset.predict=='-')]
print(dataset.shape)
dataset['predict'].unique()

(8435, 4)


array(['N', 'Y', 'U'], dtype=object)

In [16]:
dataset['consensus'].unique()

array(['-', 'N', 'Y', 'U'], dtype=object)

In [17]:
dataset['consensus'].value_counts(normalize=True)*100

consensus
-    94.534677
N     4.149378
Y     1.126260
U     0.189686
Name: proportion, dtype: float64

In [18]:
dataset.chief_complaint.nunique()

8427

In [19]:
dataset[dataset.chief_complaint.isna()]

Unnamed: 0,chief_complaint,predict,consensus,year


In [20]:
dataset['predict'].value_counts()

predict
N    8168
U     156
Y     111
Name: count, dtype: int64

In [8]:
156/(8168+156+111)

0.01849436870183758

In [21]:
dataset.shape

(8435, 4)

In [22]:
dataset['predict'].unique()

array(['N', 'Y', 'U'], dtype=object)

In [23]:
entity_df = dataset.copy(deep=True)

## Create Entities

# Model Building

In [20]:
entity_df.head()

Unnamed: 0,chief_complaint,predict,consensus,year,entities,entity
0,"""been feeling bad"" last 2 weeks & switched BP ...",N,-,2019,"{'DATE': ['last 2 weeks', 'last week &'], 'ORG...","[(last 2 weeks, DATE), (BP, ORG), (last week &..."
1,"""can't walk"", reports onset at <<TIME>>. orien...",Y,N,2019,"{'DATE': ['last week'], 'ORG': ['UTI', 'CVA']}","[(last week, DATE), (UTI, ORG), (CVA, ORG)]"
2,"""dehydration"" Chest hurts, hips hurt, cramps P...",Y,Y,2019,{'DATE': ['today']},"[(today, DATE)]"
3,"""gout flare up"" L arm swelling x 1 week. denie...",Y,Y,2019,{'DATE': ['1 week']},"[(1 week, DATE)]"
4,"""heart racing,""dyspnea, and orthopnea that has...",N,-,2019,"{'DATE': ['the last 3 days'], 'CARDINAL': ['12...","[(the last 3 days, DATE), (122, CARDINAL), (PT..."


In [17]:
# Assuming entity_df is your DataFrame
# If needed, replace 'chief_complaint' with the column name containing the text data
X = entity_df['chief_complaint']
y = entity_df['predict']

# Feature engineering - TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.4, random_state=42)

## Basic RandomForrest Model

In [22]:
# Class weights based on your class distribution
class_weights = {'N': 1, 'U': len(y_train[y_train=='N']) / len(y_train[y_train=='U']), 'Y': len(y_train[y_train=='N']) / len(y_train[y_train=='Y'])}

# Choose a model (Random Forest Classifier in this example) with class_weight parameter
model = RandomForestClassifier(random_state=42, class_weight=class_weights)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

# Display results for the test set
print("Test Set Accuracy:", accuracy)
print("Test Set Classification Report:\n", classification_report_result)

Test Set Accuracy: 0.9676941315945465
Test Set Classification Report:
               precision    recall  f1-score   support

           N       0.97      1.00      0.98      3266
           U       0.00      0.00      0.00        64
           Y       0.00      0.00      0.00        44

    accuracy                           0.97      3374
   macro avg       0.32      0.33      0.33      3374
weighted avg       0.94      0.97      0.95      3374



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
y_test.value_counts()

predict
N    3266
U      64
Y      44
Name: count, dtype: int64

In [24]:
validation_predict = model.predict(X_test)
validation_predict.tolist()
pd.Series(validation_predict.tolist()).value_counts()


N    3373
Y       1
Name: count, dtype: int64

# SMOTE Analysis

## Random Forrest

In [25]:
# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Define the RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'class_weight': ['balanced', None],  # Add 'balanced' for automatic class weights
}

# Use GridSearchCV to search for the best hyperparameters
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring='f1_macro', cv=5)
grid_search.fit(X_resampled, y_resampled)

# Get the best parameters
best_params = grid_search.best_params_
print(best_params)

# Train the final model with the best parameters on the resampled data
final_model = RandomForestClassifier(random_state=42, **best_params)
final_model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_test = final_model.predict(X_test)

# Evaluate the final model on the test set
classification_report_result_test = classification_report(y_test, y_pred_test)

# Display results for the test set
print("Test Set Classification Report:\n", classification_report_result_test)

Test Set Classification Report:
               precision    recall  f1-score   support

           N       0.97      1.00      0.98      3266
           U       0.50      0.03      0.06        64
           Y       0.86      0.14      0.24        44

    accuracy                           0.97      3374
   macro avg       0.78      0.39      0.43      3374
weighted avg       0.96      0.97      0.96      3374



## XG Boost

In [23]:
t1 = time.time()
# Assuming y_train and y_test are your target variables
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train_encoded)


xgb_classifier = xgb.XGBClassifier(
    random_state=42,
    objective='multi:softmax',  # Specify the objective for multi-class classification
    num_class=len(label_encoder.classes_),  # Number of classes
    # class_weight can be used for balancing classes
    # You may adjust the weights based on the class distribution
    class_weight={'N': 1, 'U': len(y_train[y_train=='N']) / len(y_train[y_train=='U']),
                  'Y': len(y_train[y_train=='N']) / len(y_train[y_train=='Y'])},
)


# Define hyperparameters to search over
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
}

# Use GridSearchCV to search for the best hyperparameters
grid_search_xgb = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, scoring='f1_macro', cv=5)
grid_search_xgb.fit(X_resampled, y_resampled)

# Get the best parameters
best_params_xgb = grid_search_xgb.best_params_
print(best_params_xgb)

# Train the final XGBoost model with the best parameters on the resampled data
final_model_xgb = xgb.XGBClassifier(random_state=42, **best_params_xgb)
final_model_xgb.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_test_xgb = final_model_xgb.predict(X_test)

# Evaluate the final XGBoost model on the test set
classification_report_result_xgb = classification_report(y_test, y_pred_test_xgb)
# Display results for the test set
print("XGBoost Test Set Classification Report:\n", classification_report_result_xgb)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred_test_xgb)
print(cm)
print((time.time() -t1)/60)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" } are not used.

Parameters: { "class_weight" }

ValueError: Mix of label input types (string and number)

In [None]:
print(best_params_xgb)

## SVC

In [24]:
# Assuming entity_df is your DataFrame
# If needed, replace 'chief_complaint' with the column name containing the text data
X = entity_df['chief_complaint']
y = entity_df['predict']

# Feature engineering - TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.8, random_state=42)

In [26]:
t1 = time.time()
from imblearn.over_sampling import SMOTE

# Assuming y_train and y_test are your target variables
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Apply SMOTE to oversample the minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train_encoded)

# Define the SVM classifier
svm_classifier = SVC(random_state=42)

# Define hyperparameters to search over
param_grid = {
    'C': [1],
    'kernel': ['rbf'],
}

# # Define hyperparameters to search over
# param_grid = {
#     'C': [0.1, 1,  5,  10],
#     'kernel': ['linear', 'rbf'],
# }

# Use GridSearchCV to search for the best hyperparameters
grid_search_svm = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, scoring='f1_macro', cv=5)
grid_search_svm.fit(X_resampled, y_resampled)

# Get the best parameters
best_params_svm = grid_search_svm.best_params_

# Train the final SVM model with the best parameters on the resampled data
final_model_svm = SVC(random_state=42, **best_params_svm)
final_model_svm.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_test_svm = final_model_svm.predict(X_test)

# Evaluate the final SVM model on the test set
classification_report_result_svm = classification_report(y_test_encoded, y_pred_test_svm)



# Display results for the test set
print("SVM Test Set Classification Report:\n", classification_report_result_svm)
# Visualize the confusion matrix
cm = confusion_matrix(y_test_encoded, y_pred_test_svm)
print((time.time() -t1)/60)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\sourodeep.das\AppData\Roaming\Python\Python311\site-packages\sklearn\utils\_param_validation.py)

In [46]:
best_params_svm


{'C': 1, 'kernel': 'rbf'}

In [47]:
label_encoder.classes_


array(['N', 'U', 'Y'], dtype=object)

In [48]:
# Suppose new_sentence is the new text you want to predict
new_sentence = "No gout but pain in knees"

# Transform the new sentence using the TfidfVectorizer from the pipeline
new_sentence_vectorized = tfidf_vectorizer.transform([new_sentence])

# Use the trained SVM model to predict the class of the new sentence
predicted_class = final_model_svm.predict(new_sentence_vectorized)

# Suppose label_encoder is the LabelEncoder instance used during training
decoded_predicted_class = label_encoder.inverse_transform(predicted_class)

# Display the decoded predicted class
print("Decoded Predicted Class:", decoded_predicted_class[0])


# Display the predicted class
print("Predicted Class:", predicted_class[0])


Decoded Predicted Class: N
Predicted Class: 0


### SVC Model Dump

In [26]:
import joblib
import tarfile
from sagemaker.sklearn.estimator import SKLearn

# Save the model
model_filename = 'svc_model.tar.gz'
joblib.dump(final_model_svm, 'svc_model.joblib')  

# Create a tar.gz file with the correct structure
with tarfile.open(model_filename, 'w:gz') as tar:
    tar.add('svc_model.joblib', arcname='svc_model.joblib') 

print(f'Model saved as {model_filename}')

Model saved as svc_model.tar.gz


In [None]:
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role

# Get the SageMaker execution role
role = get_execution_role()

# Specify the S3 location of your saved model artifacts (tar.gz file)
model_location = 's3://factspan-data-products/GoutPrediction/model_data/svc_model.tar.gz'

# Specify the local path to the inference script
entry_point_local = 'inference.py'

# Specify the local directory containing inference script
source_dir_local = 'source_code'

# Create an SKLearnModel for inference with IAM role and saved model artifacts
svc_model = SKLearnModel(model_data=model_location,
                         role=role,
                         entry_point=entry_point_local,  # Your local inference script
                         source_dir=source_dir_local,  # Local directory containing inference script
                         framework_version='1.2-1')  # Version compatible with your trained model

i+=1
# Deploy the trained SVC model to an endpoint
predictor = svc_model.deploy(instance_type='ml.m4.xlarge', 
                             endpoint_name=f'svc-prediction-endpoint-v{i}', 
                             initial_instance_count=1)


# Pipeline 1

In [49]:
t1 = time.time()
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Assuming entity_df is your DataFrame
# If needed, replace 'chief_complaint' with the column name containing the text data
X = entity_df['chief_complaint']
y = entity_df['predict']

# Feature engineering - TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Assuming y_train is your target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Create a pipeline with TF-IDF, PredictAndDecode, and SVM
pipeline = Pipeline([
    ('tfidf_vectorizer', TfidfVectorizer()),
    ('predict_and_decode', PredictAndDecode(model=final_model_svm, label_encoder=label_encoder)),
])

# Train the pipeline on the data
pipeline.fit(X, y_encoded)

# Suppose new_sentence is the new text you want to predict
new_sentence = "possibility oft"

# Use the pipeline to predict and decode the class of the new sentence
predicted_class = pipeline.transform([new_sentence])

# Display the predicted class
print("Predicted Class:", predicted_class[0][0])
print((time.time() -t1)/60)

Predicted Class: N
0.005481843153635661


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


## Convert to Archive tar.gz file

In [52]:
import joblib
import tarfile
import pickle

# Save the model
model_filename = 'gout_pipeline.tar.gz'
joblib.dump(pipeline, 'gout_pipeline.joblib')  # Save the serialized model to 'model.joblib'

filehandler = open("gout_pipeline.pkl","wb")
pickle.dump(pipeline, filehandler)

# Create a tar.gz file with the correct structure
with tarfile.open(model_filename, 'w:gz') as tar:
    tar.add('gout_pipeline.joblib', arcname='gout_pipeline.joblib')  # Include the model file with the correct name

print(f'Model saved as {model_filename}')


Model saved as gout_pipeline.tar.gz


# Create Endpoint

In [113]:
i=0

In [114]:
import sagemaker
from sagemaker.sklearn.model import SKLearnModel
from sagemaker import get_execution_role

# Get the SageMaker execution role
role = get_execution_role()

# Specify the S3 location of your saved model artifacts (tar.gz file)
model_location = 's3://factspan-data-products/models/gout_pipeline.tar.gz'

# Specify the local path to the inference script
entry_point_local = 'inference.py'

# Specify the local directory containing inference script
source_dir_local = 'source_code'

# Create an SKLearnModel for inference with IAM role and saved model artifacts
svc_model = SKLearnModel(model_data=model_location,
                         role=role,
                         entry_point=entry_point_local,  # Your local inference script
                         source_dir=source_dir_local,  # Local directory containing inference script
                         framework_version='1.2-1')  # Version compatible with your trained model

i+=1
endpoint_name = f'gout-prediction-v{i}'
print(endpoint_name)
# Deploy the trained SVC model to an endpoint
predictor = svc_model.deploy(instance_type='ml.m4.xlarge', 
                             endpoint_name=endpoint_name, 
                             initial_instance_count=1)


gout-prediction-v6
------!

In [116]:
endpoint_name

'gout-prediction-v6'

In [117]:
import boto3
import json

# Specify the endpoint name
endpoint_name = endpoint_name

# Create a SageMaker client
sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='ap-south-1')

# Example plain text input, replace this with your actual input data
plain_text_input = 'knee pain or gout'

# Invoke the endpoint
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType='text/plain',
    Body=plain_text_input
)

# Read the response body as a string
response_body_str = response['Body'].read().decode('utf-8')
response_body_str


'"Y"'

In [121]:
import boto3
import json

# Specify the endpoint URL
endpoint_url = f"https://runtime.sagemaker.ap-south-1.amazonaws.com/endpoints/{endpoint_name}/invocations"

# Example plain text input
plain_text_input = 'Your text input goes here.'

# Create a SageMaker runtime client
sagemaker_runtime = boto3.client('sagemaker-runtime', region_name='ap-south-1')

# Invoke the endpoint
response = sagemaker_runtime.invoke_endpoint(
    EndpointName=endpoint_name,  # Provide the correct endpoint name
    ContentType='text/plain',
    Body=plain_text_input.encode('utf-8')
)

# Check the HTTP status code
status_code = response['ResponseMetadata']['HTTPStatusCode']

if status_code == 200:
    try:
        # Attempt to parse the response as JSON
        result_json = response['Body'].read().decode('utf-8')
        result_dict = json.loads(result_json)
        print(result_dict)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
else:
    print(f"Endpoint returned non-200 status code: {status_code}")
    # Handle the error or return an appropriate response


N
