In [2]:
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Download the NLTK data packages requested by this notebook, one after another.
for _nltk_resource in (
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
):
    nltk.download(_nltk_resource)
import spacy
import requests
from sentence_transformers import SentenceTransformer, util

# Sentence-BERT encoder used by cal_similarity to embed text.
model = SentenceTransformer('paraphrase-MPNet-base-v2')


def _load_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only load artifacts that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-fitted artifacts, read from the current working directory.
loaded_vectorizer = _load_pickle('tfidf_vectorizer.pkl')
loaded_vectorizer_llm = _load_pickle('tfidf_vectorizer_LLM.pkl')
scaler = _load_pickle('X_scaler.pkl')
best_model = _load_pickle('xgboost_tfidf_LLM_v2.pkl')


  from tqdm.autonotebook import tqdm, trange


In [3]:
import numpy as np
import pandas as pd

In [4]:
# Numeric body-part code -> label. Insertion order matters: the prediction
# helpers iterate this dict and pair entries positionally with a list of
# similarity scores.
bdpt_dict = {
    0: 'INTERNAL',
    30: 'SHOULDER',
    31: 'UPPERTRUNK',
    32: 'ELBOW',
    33: 'LOWERARM',
    34: 'WRIST',
    35: 'KNEE',
    36: 'LOWERLEG',
    37: 'ANKLE',
    38: 'PUBICREGION',
    75: 'HEAD',
    76: 'FACE',
    77: 'EYEBALL',
    78: 'UPPERTRUNK(OLD)',
    79: 'LOWERTRUNK',
    80: 'UPPERARM',
    81: 'UPPERLEG',
    82: 'HAND',
    83: 'FOOT',
    84: '25-50% OF BODY',
    85: 'ALLPARTSBODY',
    86: 'OTHER(OLD)',
    87: 'NOTSTATED/UNK',
    88: 'MOUTH',
    89: 'NECK',
    90: 'LOWERARM(OLD)',
    91: 'LOWERLEG(OLD)',
    92: 'FINGER',
    93: 'TOE',
    94: 'EAR',
}

In [5]:
def create_df(X):
    """Turn a raw input matrix into the frames used by the feature pipeline.

    Parameters
    ----------
    X : array-like
        Rows with eight fields, in the order Age, Sex, Location, Product_1,
        activity_at_injury, object_involved, injury_mechanism, More_details.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        First, the full frame: the eight input columns (the first four
        converted with ``pd.to_numeric``) plus ``Narrative_LLM`` and
        ``Narrative``. Second, a frame with only Age, Sex, Location and
        Product_1.
    """
    field_names = [
        'Age', 'Sex', 'Location', 'Product_1',
        'activity_at_injury', 'object_involved', 'injury_mechanism', 'More_details',
    ]
    numeric_fields = ["Age", "Sex", "Location", "Product_1"]

    record = pd.DataFrame(X, columns=field_names)

    for field in numeric_fields:
        record[field] = pd.to_numeric(record[field])

    # Narrative_LLM is "<activity> <mechanism> <object>", each part cast to str.
    text_parts = record[["activity_at_injury", "injury_mechanism", "object_involved"]].astype(str)
    record['Narrative_LLM'] = (
        text_parts["activity_at_injury"]
        + ' ' + text_parts["injury_mechanism"]
        + ' ' + text_parts["object_involved"]
    )
    record['Narrative'] = record["More_details"]

    return record, record[numeric_fields]

In [6]:
def preprocess_text(text):
    """Normalise free text into a space-joined string of lemmatised tokens.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in NLTK's English stop-word list
    are dropped; each remaining token is lemmatised as a verb (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    # Build the stop-word lookup once per call. Previously the stop-word list
    # was re-read for every token and searched linearly.
    stop_words = set(stopwords.words('english'))

    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

    # The earlier version ran nltk.pos_tag here but never used the tags: every
    # token was lemmatised with pos='v'. The tagging pass is dropped; the
    # lemmatisation (and therefore the output) is unchanged.
    # NOTE(review): keep pos='v' — the pickled vectorizers were presumably
    # fitted on text cleaned this way; confirm before changing.
    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in tokens)

In [7]:
def gen_prediction(bdpt_dict,data_ready,scaler,best_model,sematic_distance_bert):
    """Score every candidate body part for the record in row 0 of `data_ready`.

    For each key of `bdpt_dict` (in iteration order) the key is written into
    the ``Body_Part`` column, the matching entry of `sematic_distance_bert` is
    prepended as a ``sematic_distance`` column, the numeric columns are passed
    through ``scaler.transform`` and the result is fed to
    ``best_model.predict_proba``.

    Parameters
    ----------
    bdpt_dict : dict
        Body-part code -> label; only the keys are used here.
    data_ready : pandas.DataFrame
        Single-record feature frame with index label 0. It is not modified.
    scaler : object
        Object exposing ``transform``.
    best_model : object
        Object exposing ``predict_proba``.
    sematic_distance_bert : sequence of float
        One score per entry of `bdpt_dict`, in the same order.

    Returns
    -------
    dict
        ``str(code)`` -> value in column 1, row 0 of ``predict_proba``
        (conventionally the positive-class probability — confirm against how
        the model was trained).

    Raises
    ------
    IndexError
        If `sematic_distance_bert` has fewer entries than `bdpt_dict`.
    """
    # Work on a copy so the caller's frame is not left holding the last
    # body-part code that was tried.
    features = data_ready.copy()
    prob_out = {}

    for position, body_part_code in enumerate(bdpt_dict):
        features.at[0, 'Body_Part'] = body_part_code

        similarity_frame = pd.DataFrame(
            {'sematic_distance': [sematic_distance_bert[position]]}
        )
        # Column order fed to the scaler/model: sematic_distance first, then
        # the columns of data_ready, restricted to numeric dtypes.
        numeric = pd.concat([similarity_frame, features], axis=1).select_dtypes(include=['number'])

        scaled = pd.DataFrame(scaler.transform(numeric), columns=numeric.columns)
        prob_out[str(body_part_code)] = best_model.predict_proba(scaled)[:, 1][0]

    return prob_out

In [8]:
def cal_similarity(sample_df_single_record,bdpt_dict):
    """Cosine similarity between the record's narrative and each body-part label.

    Uses the module-level ``model`` to embed the ``Narrative_LLM`` text found
    at index label 0 and every value of `bdpt_dict`, then compares them with
    ``util.cos_sim``. Despite the "distance" in the variable name, the values
    are similarities (higher means more alike).

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame with a ``Narrative_LLM`` column; only row label 0 is read.
    bdpt_dict : dict
        Body-part code -> label; the labels are the sentences being compared.

    Returns
    -------
    list of float
        One similarity per entry of `bdpt_dict`, in iteration order.
    """
    if not bdpt_dict:
        return []

    # The narrative does not change between labels, so embed it once rather
    # than once per body part.
    narrative = sample_df_single_record.at[0,'Narrative_LLM']
    narrative_embedding = model.encode(narrative, convert_to_tensor=True)

    sematic_distance_bert = []
    for label in bdpt_dict.values():
        label_embedding = model.encode(label, convert_to_tensor=True)
        sematic_distance_bert.append(
            util.cos_sim(label_embedding, narrative_embedding).item()
        )

    return sematic_distance_bert

In [9]:
def tfidf_narrative(sample_df_single_record,loaded_vectorizer):
    """TF-IDF features for the ``Narrative`` column.

    Missing narratives are treated as '' and cleaned with ``preprocess_text``.
    The cleaned text is also stored on the input frame as
    ``Processed_Narrative`` (the frame is modified in place).

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame with a ``Narrative`` column.
    loaded_vectorizer : object
        Fitted vectorizer exposing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix, one column per vectorizer feature name.
    """
    cleaned = sample_df_single_record['Narrative'].fillna('').apply(preprocess_text)
    sample_df_single_record['Processed_Narrative'] = cleaned

    # Only transform here: the vectorizer is already fitted.
    weights = loaded_vectorizer.transform(sample_df_single_record['Processed_Narrative'])
    feature_names = loaded_vectorizer.get_feature_names_out()

    return pd.DataFrame(weights.toarray(), columns=feature_names)

In [10]:
def tfidf_narrative_LLM(sample_df_single_record,loaded_vectorizer_llm):
    """TF-IDF features for the ``Narrative_LLM`` column, suffixed with ``_LLM``.

    Missing values are treated as '' and cleaned with ``preprocess_text``.
    The cleaned text is also stored on the input frame as
    ``Processed_Narrative_LLM`` (the frame is modified in place).

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame with a ``Narrative_LLM`` column.
    loaded_vectorizer_llm : object
        Fitted vectorizer exposing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix whose column names are the vectorizer feature
        names with ``_LLM`` appended.
    """
    cleaned = sample_df_single_record['Narrative_LLM'].fillna('').apply(preprocess_text)
    sample_df_single_record['Processed_Narrative_LLM'] = cleaned

    # Only transform here: the vectorizer is already fitted.
    weights = loaded_vectorizer_llm.transform(sample_df_single_record['Processed_Narrative_LLM'])
    feature_names = loaded_vectorizer_llm.get_feature_names_out()

    # The suffix keeps these columns distinguishable from the ones produced
    # for the plain narrative.
    return pd.DataFrame(weights.toarray(), columns=feature_names).add_suffix('_LLM')


# Concatenate the TF-IDF features with your existing data

#data_ready = pd.concat([data_core_sample, tfidf_df_LLM], axis=1)

In [11]:
def Predict_Body_parts_Prob(Age, Sex,Location,Product_1 ,activity_at_injury, object_involved,injury_mechanism,More_details):
    """Run the full pipeline for one record and score every body part.

    The eight inputs are stacked into a single matrix, turned into feature
    frames, enriched with similarity and TF-IDF features, and scored once per
    entry of the module-level ``bdpt_dict`` using the module-level ``scaler``
    and ``best_model``.

    The similarity and scoring helpers read index label 0 only, so the
    function handles one record per call.

    Returns
    -------
    pandas.DataFrame
        Columns ``Key`` (body-part code as a string) and ``Value`` (the score
        returned by ``gen_prediction`` for that code).
    """
    raw_fields = [
        Age, Sex, Location, Product_1,
        activity_at_injury, object_involved, injury_mechanism, More_details,
    ]
    record, numeric_record = create_df(np.column_stack(raw_fields))

    similarities = cal_similarity(record, bdpt_dict)
    narrative_tfidf = tfidf_narrative(record, loaded_vectorizer)
    llm_tfidf = tfidf_narrative_LLM(record, loaded_vectorizer_llm)

    # Placeholder value only: gen_prediction writes each candidate code into
    # Body_Part before scoring. The column sits between Location and Product_1.
    body_part_placeholder = pd.DataFrame({"Body_Part": [25]})

    feature_parts = [
        numeric_record[['Age','Sex','Location']],
        body_part_placeholder,
        numeric_record[['Product_1']],
        narrative_tfidf,
        llm_tfidf,
    ]
    data_ready = pd.concat(feature_parts, axis=1)

    probabilities = gen_prediction(bdpt_dict, data_ready, scaler, best_model, similarities)
    return pd.DataFrame(list(probabilities.items()), columns=['Key', 'Value'])

In [12]:
# Try the pipeline on one hand-written record.
Predict_Body_parts_Prob(
    Age=25,
    Sex=1,
    Location=1,
    Product_1=3258,
    activity_at_injury="fall walk",
    object_involved="bike",
    injury_mechanism="",
    More_details="fall from bike when riding",
)

Unnamed: 0,Key,Value
0,0,0.760949
1,30,0.149621
2,31,0.4688
3,32,0.414058
4,33,0.401258
5,34,0.251715
6,35,0.259538
7,36,0.487792
8,37,0.290905
9,38,0.386126


In [41]:
from tabpy.tabpy_tools.client import Client

In [49]:
import getpass
import os

# Connection settings are read from the environment so that no secret is
# stored in the notebook. The password that used to be written here in plain
# text should be treated as compromised and rotated.
tabpy_url = os.environ.get('TABPY_URL', 'http://tabpy.ericy.me:8888/')
tabpy_username = os.environ.get('TABPY_USERNAME', 'dva023')
tabpy_password = os.environ.get('TABPY_PASSWORD') or getpass.getpass('TabPy password: ')

client = Client(tabpy_url)
client.set_credentials(username=tabpy_username, password=tabpy_password)

# NOTE(review): the output recorded for this cell shows the server answering
# 500 with a FileNotFoundError for a staging path that mixes '/' and '\'
# separators. That looks like a Windows-client / POSIX-server path mismatch
# rather than a problem with the function being deployed — confirm. An earlier
# attempt passed path='/endpoints/dva_v4/1' to deploy() as a workaround.
client.deploy('dva_v4', Predict_Body_parts_Prob, 'Prediction Model v4', override=True)


Overwriting existing file "/home/eric/workspace/neiss/tabpy_server/staging\endpoints\dva_v4\1" when saving query object
Error with server response. code=500; text={"message": "error adding endpoint", "info": "FileNotFoundError : [Errno 2] No such file or directory: '/home/eric/workspace/neiss/tabpy_server/staging\\\\endpoints\\\\dva_v4\\\\1'"}


ResponseError: (500) error adding endpoint FileNotFoundError : [Errno 2] No such file or directory: '/home/eric/workspace/neiss/tabpy_server/staging\\endpoints\\dva_v4\\1'