In [5]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
import joblib
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split

nlp = spacy.load("en_core_web_sm")

In [6]:
df = pd.read_csv('overview-of-recordings.csv')
df_text = df[['phrase', 'prompt']]

In [7]:
df_text

Unnamed: 0,phrase,prompt
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound
...,...,...
6656,I feel a burning sensation in my guts about 2 ...,Stomach ache
6657,I have a split on my thumb that will not heal.,Open wound
6658,I feel a lot of pain in the joints.,Joint pain
6659,The area around my heart doesn't feel good.,Heart hurts


In [14]:
df_text["prompt"].unique()

array(['Emotional pain', 'Hair falling out', 'Heart hurts',
       'Infected wound', 'Foot ache', 'Shoulder pain',
       'Injury from sports', 'Skin issue', 'Stomach ache', 'Knee pain',
       'Joint pain', 'Hard to breath', 'Head ache', 'Body feels weak',
       'Feeling dizzy', 'Back pain', 'Open wound', 'Internal pain',
       'Blurry vision', 'Acne', 'Muscle pain', 'Neck pain', 'Cough',
       'Ear ache', 'Feeling cold'], dtype=object)

In [15]:
df_text["prompt"].value_counts()

prompt
Acne                  328
Shoulder pain         320
Joint pain            318
Infected wound        306
Knee pain             305
Cough                 293
Feeling dizzy         283
Muscle pain           282
Heart hurts           273
Ear ache              270
Hair falling out      264
Feeling cold          263
Head ache             263
Skin issue            262
Stomach ache          261
Back pain             259
Neck pain             251
Internal pain         248
Blurry vision         246
Body feels weak       241
Hard to breath        233
Emotional pain        231
Injury from sports    230
Foot ache             223
Open wound            208
Name: count, dtype: int64

In [8]:
# training (80%) and temp (20%)
train_df, temp_df = train_test_split(df_text, test_size=0.2, random_state=42)

# validation and test (each 10%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

Train shape: (5328, 2)
Validation shape: (666, 2)
Test shape: (667, 2)


In [9]:
train_df

Unnamed: 0,phrase,prompt
3497,I have acne all over my face,Acne
418,i have a problem in seeing objects it is too d...,Blurry vision
3306,i cant sleep because of cough,Cough
5510,I have a very rash sensation close to my arms,Skin issue
6656,I feel a burning sensation in my guts about 2 ...,Stomach ache
...,...,...
3772,I can't carry anything I have a pain in my sho...,Shoulder pain
5191,There is an injured person,Infected wound
5226,I have acne in my face and other problema in m...,Skin issue
5390,I have a sharp pain in my neck,Neck pain


In [10]:
test_df

Unnamed: 0,phrase,prompt
4043,I feel like my heart is on fire.,Heart hurts
420,I feel abdominal pain,Stomach ache
5803,I think my wound is infected,Infected wound
334,I feel a tightness in my chest,Hard to breath
1545,I do not feel better in my muscles,Muscle pain
...,...,...
2881,I can't work good I have a pain in my knee,Knee pain
4030,I have acne all over my face,Acne
2210,I wake up with a stiff neck every morning. Ma...,Neck pain
6133,I do not feel better in my muscles,Muscle pain


In [16]:
def preprocess(text):
    doc = nlp(text.lower().strip())
    tokens = []
    for token in doc:
        if token.text in STOP_WORDS:
            continue
        if token.is_punct:
            continue
        tokens.append(token.lemma_)
    return " ".join(tokens)

# preprocessing
train_df['clean_phrase'] = train_df['phrase'].apply(preprocess)
val_df['clean_phrase'] = val_df['phrase'].apply(preprocess)
test_df['clean_phrase'] = test_df['phrase'].apply(preprocess)

# Feature Engineering
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    sublinear_tf=True
)

X_train = tfidf.fit_transform(train_df['clean_phrase'])
X_val = tfidf.transform(val_df['clean_phrase'])
X_test = tfidf.transform(test_df['clean_phrase'])

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['prompt'])
y_val = label_encoder.transform(val_df['prompt'])
y_test = label_encoder.transform(test_df['prompt'])

# Model 
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=200),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

best_model = None
best_score = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    val_preds = model.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    f1 = f1_score(y_val, val_preds, average='weighted')
    print(f"{name}:")
    print(f"  Validation Accuracy: {acc:.3f}")
    print(f"  Validation F1-Score: {f1:.3f}")
    
    if f1 > best_score:
        best_score = f1
        best_model = model

# evaluate on test set
test_preds = best_model.predict(X_test)
print("Test Performance:")
print(f"Accuracy: {accuracy_score(y_test, test_preds):.3f}")
print(f"F1-Score: {f1_score(y_test, test_preds, average='weighted'):.3f}")

# Save pipeline
joblib.dump({
    'tfidf': tfidf,
    'model': best_model,
    'label_encoder': label_encoder
}, "traditional_ml_pipeline.pkl")

Logistic Regression:
  Validation Accuracy: 0.988
  Validation F1-Score: 0.988
Random Forest:
  Validation Accuracy: 0.995
  Validation F1-Score: 0.995


Parameters: { "use_label_encoder" } are not used.



XGBoost:
  Validation Accuracy: 0.995
  Validation F1-Score: 0.995

Test Performance:
Accuracy: 0.997
F1-Score: 0.997


['traditional_ml_pipeline.pkl']

In [18]:
decoded_preds = label_encoder.inverse_transform(test_preds)
decoded_preds

array(['Heart hurts', 'Stomach ache', 'Infected wound', 'Hard to breath',
       'Muscle pain', 'Feeling cold', 'Emotional pain', 'Internal pain',
       'Hard to breath', 'Feeling cold', 'Hard to breath',
       'Hair falling out', 'Heart hurts', 'Heart hurts', 'Stomach ache',
       'Shoulder pain', 'Skin issue', 'Emotional pain', 'Blurry vision',
       'Feeling dizzy', 'Body feels weak', 'Shoulder pain',
       'Feeling dizzy', 'Feeling dizzy', 'Blurry vision', 'Knee pain',
       'Acne', 'Cough', 'Skin issue', 'Blurry vision', 'Open wound',
       'Skin issue', 'Foot ache', 'Open wound', 'Body feels weak',
       'Internal pain', 'Muscle pain', 'Body feels weak', 'Feeling cold',
       'Knee pain', 'Acne', 'Skin issue', 'Neck pain', 'Emotional pain',
       'Neck pain', 'Stomach ache', 'Open wound', 'Shoulder pain',
       'Injury from sports', 'Hard to breath', 'Injury from sports',
       'Hair falling out', 'Joint pain', 'Knee pain', 'Hard to breath',
       'Cough', 'Head ache

In [22]:
for test_i, pred in zip(test_df['clean_phrase'],decoded_preds):
    print(test_i," : ",pred)

feel like heart fire  :  Heart hurts
feel abdominal pain  :  Stomach ache
think wound infect  :  Infected wound
feel tightness chest  :  Hard to breath
feel well muscle  :  Muscle pain
chill ache  :  Feeling cold
grand father die feel hard emotional pain  :  Emotional pain
head pain single day  :  Internal pain
feel like squeeze lung  :  Hard to breath
cold wear layer  :  Feeling cold
feel great pressure chest  :  Hard to breath
shower drain hair time  :  Hair falling out
heart beat fast scare  :  Heart hurts
heart hurt sad  :  Heart hurts
feel nauseous  :  Stomach ache
feel like go acupuncture practice 100 needle shoulder  :  Shoulder pain
dark arm  :  Skin issue
feel pain think  :  Emotional pain
have hard time read letter fuzzy  :  Blurry vision
awake morning feel strange vertigo  :  Feeling dizzy
feel weak body  :  Body feels weak
throb shoulder  :  Shoulder pain
feel dizzy sight  :  Feeling dizzy
stand feel dizzy know  :  Feeling dizzy
blurry vision wrong medicine  :  Blurry visio