In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
import spacy
import requests
from sentence_transformers import SentenceTransformer, util

# Load the Sentence-BERT model that cal_similarity uses (via the global `model`)
# to embed body-part labels and the narrative text.
# NOTE(review): an earlier comment called this checkpoint "lightweight, fast";
# MPNet-base checkpoints are usually larger than the MiniLM ones - confirm intent.
model = SentenceTransformer('paraphrase-MPNet-base-v2')

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load these artifacts from a trusted source.

# Fitted TF-IDF vectorizer applied to the 'Narrative' column.
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

# Fitted TF-IDF vectorizer applied to the 'Narrative_LLM' column.
with open('tfidf_vectorizer_LLM.pkl', 'rb') as file:
    loaded_vectorizer_llm = pickle.load(file)

# Feature scaler; gen_prediction calls scaler.transform on the numeric columns.
with open('X_scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

# Classifier; gen_prediction calls best_model.predict_proba on the scaled features.
with open('xgboost_tfidf_LLM_v2.pkl', 'rb') as file:
    best_model = pickle.load(file)

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from tqdm.autonotebook import tqdm, trange
2024-11-07 05:50:51.021328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730958651.040947   37884 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN wh

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Body-part code -> label lookup.
# Insertion order is significant: cal_similarity and gen_prediction both iterate
# this dict and pair each code with a similarity score by position.
bdpt_dict = {
    0: 'INTERNAL',
    30: 'SHOULDER',
    31: 'UPPERTRUNK',
    32: 'ELBOW',
    33: 'LOWERARM',
    34: 'WRIST',
    35: 'KNEE',
    36: 'LOWERLEG',
    37: 'ANKLE',
    38: 'PUBICREGION',
    75: 'HEAD',
    76: 'FACE',
    77: 'EYEBALL',
    78: 'UPPERTRUNK(OLD)',
    79: 'LOWERTRUNK',
    80: 'UPPERARM',
    81: 'UPPERLEG',
    82: 'HAND',
    83: 'FOOT',
    84: '25-50% OF BODY',
    85: 'ALLPARTSBODY',
    86: 'OTHER(OLD)',
    87: 'NOTSTATED/UNK',
    88: 'MOUTH',
    89: 'NECK',
    90: 'LOWERARM(OLD)',
    91: 'LOWERLEG(OLD)',
    92: 'FINGER',
    93: 'TOE',
    94: 'EAR',
}

In [4]:
def create_df(X):
    """Build the working frames for one or more raw injury records.

    Parameters
    ----------
    X : 2-D array-like
        Rows of eight values in the order Age, Sex, Location, Product_1,
        activity_at_injury, object_involved, injury_mechanism, More_details.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The full frame (original columns plus 'Narrative_LLM' and 'Narrative')
        and a second frame holding only the four numeric columns.
    """
    numeric_fields = ["Age", "Sex", "Location", "Product_1"]
    text_fields = ['activity_at_injury', 'object_involved', 'injury_mechanism', 'More_details']

    records = pd.DataFrame(X, columns=numeric_fields + text_fields)

    # The incoming values may arrive as strings; coerce the numeric columns.
    for field in numeric_fields:
        records[field] = pd.to_numeric(records[field])

    # Narrative_LLM is "<activity> <mechanism> <object>", space separated.
    activity, mechanism, involved = (
        records[name].astype(str)
        for name in ("activity_at_injury", "injury_mechanism", "object_involved")
    )
    records['Narrative_LLM'] = activity + ' ' + mechanism + ' ' + involved
    records['Narrative'] = records["More_details"]

    return records, records[numeric_fields]

In [5]:
def preprocess_text(text):
    """Normalize free text into a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized, non-alphabetic tokens and English
    stop words are dropped, and every remaining token is lemmatized as a verb.

    Parameters
    ----------
    text : str
        Raw narrative text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' when nothing survives).
    """
    # Build the stop-word set once per call; the previous version re-read the
    # stop-word list for every single token inside the comprehension.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    kept_tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

    # Every token is lemmatized with pos='v'. The POS tags computed previously
    # were never used, so the tagging pass is dropped; output is unchanged.
    # NOTE(review): presumably the pickled TF-IDF vectorizers were fitted on text
    # processed exactly this way, so the verb-only lemmatization is kept as is.
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept_tokens)

In [6]:
def gen_prediction(bdpt_dict,data_ready,scaler,best_model,sematic_distance_bert):
    """Score every candidate body part for a single prepared record.

    For each code in ``bdpt_dict`` the record's 'Body_Part' is set to that code,
    the matching similarity score is prepended as 'sematic_distance', the numeric
    columns are scaled and the classifier's class-1 probability is recorded.

    Parameters
    ----------
    bdpt_dict : dict
        Body-part codes (keys) to evaluate, in iteration order.
    data_ready : pandas.DataFrame
        Single-row feature frame (row label 0). It is no longer modified;
        previously its 'Body_Part' cell was left holding the last code.
    scaler : object
        Provides ``transform(frame)``.
    best_model : object
        Provides ``predict_proba(frame)``.
    sematic_distance_bert : sequence of float
        One similarity score per body part, in ``bdpt_dict`` order.

    Returns
    -------
    dict
        ``str(code) -> probability`` for every code in ``bdpt_dict``.

    Raises
    ------
    IndexError
        If fewer scores than body parts are supplied.
    """
    if len(sematic_distance_bert) < len(bdpt_dict):
        # Fail up front rather than part-way through the loop.
        raise IndexError("sematic_distance_bert has fewer entries than bdpt_dict")

    # Work on a private copy so the caller's frame is left untouched.
    features = data_ready.copy()
    prob_out = {}

    for code, score in zip(bdpt_dict, sematic_distance_bert):
        features.at[0, 'Body_Part'] = code

        score_column = pd.DataFrame({'sematic_distance': [score]})
        candidate = pd.concat([score_column, features], axis=1)

        # Only numeric columns are fed to the scaler / model.
        numeric = candidate.select_dtypes(include=['number'])
        scaled = pd.DataFrame(scaler.transform(numeric), columns=numeric.columns)

        prob_out[str(code)] = best_model.predict_proba(scaled)[:, 1][0]

    return prob_out

In [7]:
def cal_similarity(sample_df_single_record,bdpt_dict):
    """Cosine similarity between the record's narrative and each body-part label.

    Uses the module-level ``model`` to embed the text.

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame whose row 0 holds the 'Narrative_LLM' text.
    bdpt_dict : dict
        Body-part code -> label; labels are compared in iteration order.

    Returns
    -------
    list of float
        One similarity per entry of ``bdpt_dict``, in the same order.
    """
    if not bdpt_dict:
        return []

    # The narrative does not change between labels, so embed it once
    # instead of re-encoding it on every loop iteration.
    narrative = sample_df_single_record.at[0, 'Narrative_LLM']
    narrative_embedding = model.encode(narrative, convert_to_tensor=True)

    sematic_distance_bert = []
    for label in bdpt_dict.values():
        label_embedding = model.encode(label, convert_to_tensor=True)
        score = util.cos_sim(label_embedding, narrative_embedding).item()
        sematic_distance_bert.append(score)

    return sematic_distance_bert

In [8]:
def tfidf_narrative(sample_df_single_record,loaded_vectorizer):
    """TF-IDF features for the 'Narrative' column.

    Side effect: adds a 'Processed_Narrative' column to the frame passed in.

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame containing a 'Narrative' column.
    loaded_vectorizer : object
        Fitted vectorizer providing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix, one column per vectorizer feature name.
    """
    # Missing narratives are treated as empty text before cleaning.
    cleaned = sample_df_single_record['Narrative'].fillna('').apply(preprocess_text)
    sample_df_single_record['Processed_Narrative'] = cleaned

    weights = loaded_vectorizer.transform(sample_df_single_record['Processed_Narrative'])
    feature_names = loaded_vectorizer.get_feature_names_out()

    return pd.DataFrame(weights.toarray(), columns=feature_names)

In [9]:
def tfidf_narrative_LLM(sample_df_single_record,loaded_vectorizer_llm):
    """TF-IDF features for the 'Narrative_LLM' column, suffixed with '_LLM'.

    Side effect: adds a 'Processed_Narrative_LLM' column to the frame passed in.

    Parameters
    ----------
    sample_df_single_record : pandas.DataFrame
        Frame containing a 'Narrative_LLM' column.
    loaded_vectorizer_llm : object
        Fitted vectorizer providing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Dense TF-IDF matrix whose column names are the vectorizer's feature
        names with '_LLM' appended.
    """
    # Missing narratives are treated as empty text before cleaning.
    cleaned = sample_df_single_record['Narrative_LLM'].fillna('').apply(preprocess_text)
    sample_df_single_record['Processed_Narrative_LLM'] = cleaned

    weights = loaded_vectorizer_llm.transform(sample_df_single_record['Processed_Narrative_LLM'])
    feature_names = loaded_vectorizer_llm.get_feature_names_out()

    # The suffix marks these columns as coming from the LLM narrative.
    return pd.DataFrame(weights.toarray(), columns=feature_names).add_suffix('_LLM')


# Concatenate the TF-IDF features with your existing data

#data_ready = pd.concat([data_core_sample, tfidf_df_LLM], axis=1)

In [10]:
def Predict_Body_parts_Prob(Age, Sex,Location,Product_1 ,activity_at_injury, object_involved,injury_mechanism,More_details):
    """Run the full pipeline for one record and score every body part.

    Relies on the module-level ``bdpt_dict``, ``loaded_vectorizer``,
    ``loaded_vectorizer_llm``, ``scaler`` and ``best_model``.

    Parameters
    ----------
    Age, Sex, Location, Product_1
        Values that must be convertible to numbers.
    activity_at_injury, object_involved, injury_mechanism : str
        Short text fields combined into the 'Narrative_LLM' text.
    More_details : str
        Free-text narrative.

    Returns
    -------
    dict
        ``str(body_part_code) -> class-1 probability`` as produced by
        ``gen_prediction``.
    """
    raw_record = np.column_stack([
        Age, Sex, Location, Product_1,
        activity_at_injury, object_involved, injury_mechanism, More_details,
    ])
    full_frame, core_frame = create_df(raw_record)

    similarity_scores = cal_similarity(full_frame, bdpt_dict)
    narrative_tfidf = tfidf_narrative(full_frame, loaded_vectorizer)
    llm_tfidf = tfidf_narrative_LLM(full_frame, loaded_vectorizer_llm)

    # Placeholder body-part code; gen_prediction sets the real code per candidate.
    body_part_column = pd.DataFrame({"Body_Part": [25]})

    # Column order: Age, Sex, Location, Body_Part, Product_1, then TF-IDF blocks.
    feature_blocks = [
        core_frame[['Age', 'Sex', 'Location']],
        body_part_column,
        core_frame[['Product_1']],
        narrative_tfidf,
        llm_tfidf,
    ]
    data_ready = pd.concat(feature_blocks, axis=1)

    return gen_prediction(bdpt_dict, data_ready, scaler, best_model, similarity_scores)

In [11]:
# Example call; keyword arguments make the field order explicit
# (object_involved comes before injury_mechanism in the signature).
Predict_Body_parts_Prob(
    Age=25,
    Sex=1,
    Location=1,
    Product_1=3258,
    activity_at_injury="fall walk",
    object_involved="bike",
    injury_mechanism="",
    More_details="fall from bike when riding",
)

In [None]:
# NOTE(review): `xxxx` is not defined anywhere in the visible notebook, so this
# line raises NameError. Presumably a deliberate stop marker that halts
# "Run All" before the scratch cells below - confirm, or delete the cell.
xxxx

In [None]:
import pandas as pd

# Scratch cell: hand-built single-row frame for ad-hoc experimentation.
# The last column is spelled "More details" (with a space) in this cell.
single_record = {
    "Age": 25,
    "Sex": 0,
    "Location": 1,
    "Product_1": 3258,
    "activity_at_injury": "fall walk",
    "object_involved": "bike",
    "injury_mechanism": " ",
    "More details": "fall from bike when riding",
}

# Wrap every value in a one-element list so the frame has exactly one row.
data = {field: [value] for field, value in single_record.items()}

sample_df_single_record = pd.DataFrame(data)
sample_df_single_record

In [None]:
# Scratch cell: derive the two narrative columns on the hand-built frame.
# Narrative_LLM is "<activity> <mechanism> <object>", space separated.
sample_df_single_record['Narrative_LLM'] = (
    sample_df_single_record["activity_at_injury"].astype(str).str.cat(
        [
            sample_df_single_record["injury_mechanism"].astype(str),
            sample_df_single_record["object_involved"].astype(str),
        ],
        sep=' ',
    )
)
sample_df_single_record['Narrative'] = sample_df_single_record["More details"]

# Numeric-only view of the record.
sample_df_single_record_2 = sample_df_single_record[["Age", "Sex", "Location", "Product_1"]]
sample_df_single_record_2

In [None]:
# Load the SpaCy model. The en_core_web_md package must already be installed
# (python -m spacy download en_core_web_md) or spacy.load raises.
nlp = spacy.load('en_core_web_md')

# The narrative is identical for every body part, so parse it once up front
# instead of re-running the pipeline on it in every loop iteration.
word2 = sample_df_single_record.at[0,'Narrative_LLM']
doc2 = nlp(word2)

# One SpaCy similarity score per body-part label, in bdpt_dict order.
sematic_distance=[]

for key in bdpt_dict:
    word1 = bdpt_dict[key]

    # SpaCy similarity between the label and the narrative
    doc1 = nlp(word1)
    similarity = doc1.similarity(doc2)
    sematic_distance.append(similarity)

In [None]:
# Scratch cell: assemble the feature row by hand.
# NOTE(review): `tfidf_df` and `tfidf_df_LLM` are only assigned as locals inside
# Predict_Body_parts_Prob in the visible notebook, so on a fresh kernel this
# cell raises NameError unless they were created at top level elsewhere.
# Body_Part 25 looks like a placeholder value - confirm.
data = {
    "Body_Part": [25],
}
df_body= pd.DataFrame(data)
data_ready = pd.concat([sample_df_single_record_2[['Age','Sex','Location']],df_body,sample_df_single_record_2[['Product_1']], tfidf_df,tfidf_df_LLM], axis=1)

In [None]:







# NOTE(review): `prob_out` is only a local of gen_prediction in the visible
# notebook; at top level this raises NameError on a fresh kernel.
prob_out

In [None]:
# NOTE(review): `xxxx` is undefined in the visible notebook, so this cell raises
# NameError before the definition below is executed. Presumably a deliberate
# guard around stale code - confirm, or delete the cell.
xxxx
def Predict_Body_parts_Prob(Age, Sex, Race,Location,Hispanic, Product_1 ,Alcohol, Drug):
    """Return ``loaded_model.predict_proba`` for one column-stacked record.

    NOTE(review): this reuses the name of the eight-argument pipeline function
    defined earlier but with a different signature; if it ever ran it would
    shadow that definition. `loaded_model` is not defined in the visible
    notebook.
    """
    X = np.column_stack([Age, Sex, Race,Location,Hispanic, Product_1 ,Alcohol, Drug])
    
    return loaded_model.predict_proba(X)

In [None]:
# Installs the SpaCy model that spacy.load('en_core_web_md') needs.
# NOTE(review): this sits after the cell that loads the model; on a fresh
# environment it has to be run first.
!python -m spacy download en_core_web_md