In [52]:
# Emily Sear
# CSC 417
# Final Project - NLP to create a diagnosis predictor given a brief description of symptoms

In [1]:
import pandas as pd
import spacy as spacy

In [2]:
# Blank English pipeline, used below to turn raw text into spaCy Doc objects.
nlp = spacy.blank("en")
data = pd.read_csv('Symptom2Disease.csv')
# Only the last expression of a cell is displayed, so the row count is printed
# explicitly instead of being left as a bare (silently discarded) expression.
print(f"rows: {data.shape[0]}")
data.head()

Unnamed: 0,id,diganosis,diganosis.1,text
0,0,PSORIASIS,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,PSORIASIS,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,PSORIASIS,Psoriasis,I have been experiencing joint pain in my fing...
3,3,PSORIASIS,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,PSORIASIS,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
# Shuffle the rows (the head() above shows consecutive rows sharing one diagnosis).
# A fixed random_state makes this shuffle return the same order on every run.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,id,diganosis,diganosis.1,text
0,62,ALLERGY,allergy,I have trouble breathing and get short of brea...
1,72,HYPERTENSION,Hypertension,"I've had intense aches in my chest, a head pa..."
2,43,FUNGAL INFECTION,Fungal infection,"There are red, bumpy areas on my skin, and I'v..."
3,173,DIMORPHIC HEMORRHOIDS,Dimorphic Hemorrhoids,It's incredibly difficult for me to use the re...
4,236,JAUNDICE,Jaundice,"I've been feeling scratchy, sick, and worn out..."


In [4]:
# Keep only the (text, label) columns, reshuffle with a fixed seed so this step is
# repeatable for a given input order, and flatten the rows to plain tuples.
# NOTE: this rebinds `data` from a DataFrame to a list, so this cell cannot be
# re-run on its own; re-run the loading cells above first.
data = list(
    data[["text", "diganosis"]]
    .sample(frac=1, random_state=42)
    .itertuples(index=False, name=None)
)


In [5]:
# Positional split of the shuffled records: the first 900 go to training, the next
# 180 to the dev set, and whatever remains to the test set.
# (Done by hand instead of sklearn.train_test_split, which the author found did
# not work well with spaCy.)
train_data, dev_data, test_data = data[:900], data[900:1080], data[1080:]

In [6]:
# convert data
def convert(data, outfile):
    """Serialize (text, label) pairs into a spaCy DocBin file.

    Args:
        data: iterable of (text, label) tuples; each text is processed with the
            module-level ``nlp`` pipeline.
        outfile: path the resulting DocBin is written to.

    Each resulting Doc gets its label recorded in ``doc.cats`` as True; no other
    categories are set on the Doc.
    """
    db = spacy.tokens.DocBin()
    # as_tuples=True makes nlp.pipe yield (doc, context) pairs, so every label
    # stays attached to the Doc created from its text.
    for doc, label in nlp.pipe(data, as_tuples=True):
        doc.cats[label] = True
        db.add(doc)
    db.to_disk(outfile)

In [7]:
# Write each split to disk in the binary format that spaCy's CLI reads.
for _records, _outfile in (
    (train_data, "./train_data.spacy"),
    (dev_data, "./dev_data.spacy"),
    (test_data, "./test_data.spacy"),
):
    convert(_records, _outfile)

In [13]:
# Create the spaCy training config file (config.cfg): English language, a single
# textcat component, optimized for efficiency.
!python -m spacy init config  --lang en --pipeline textcat --optimize efficiency --force config.cfg

[i] Generated config template specific for your use case
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [14]:
# train the data 

!python -m spacy train config.cfg --paths.train ./train_data.spacy --paths.dev ./dev_data.spacy --output model-accuracy --verbose

[+] Created output directory: model-accuracy
[i] Saving to output directory: model-accuracy
[i] Using CPU
[1m
[+] Initialized pipeline
[1m
[i] Pipeline: ['textcat']
[i] Initial learn rate: 0.001
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.04        0.00    0.00
  0     200          6.54       27.20    0.27
  1     400          3.02       65.14    0.65
  2     600          1.06       84.77    0.85
  3     800          0.32       91.63    0.92
  4    1000          0.13       93.56    0.94
  6    1200          0.05       95.30    0.95
  8    1400          0.03       96.17    0.96
 11    1600          0.02       97.37    0.97
 14    1800          0.01       97.37    0.97
 18    2000          0.01       97.37    0.97
 23    2200          0.01       97.37    0.97
 28    2400          0.00       97.65    0.98
 34    2600          0.00       97.65    0.98
 40    2800          0.00       97.89    0.98
 46    3000          

[2023-05-03 20:44:00,235] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[2023-05-03 20:44:00,525] [INFO] Set up nlp object from config
[2023-05-03 20:44:00,530] [DEBUG] Loading corpus from path: dev_data.spacy
[2023-05-03 20:44:00,530] [DEBUG] Loading corpus from path: train_data.spacy
[2023-05-03 20:44:00,530] [INFO] Pipeline: ['textcat']
[2023-05-03 20:44:00,532] [INFO] Created vocabulary
[2023-05-03 20:44:00,533] [INFO] Finished initializing nlp object
[2023-05-03 20:44:00,916] [INFO] Initialized pipeline components: ['textcat']
[2023-05-03 20:44:00,922] [DEBUG] Loading corpus from path: dev_data.spacy
[2023-05-03 20:44:00,923] [DEBUG] Loading corpus from path: train_data.spacy


In [16]:
# Evaluate the saved model at ./model/model-best/ on the held-out test split
# (./test_data.spacy).

!python -m spacy evaluate ./model/model-best/ ./test_data.spacy 

[i] Using CPU
[1m

TOK                 100.00
TEXTCAT (macro F)   97.34 
SPEED               421771

[1m

                                      P        R        F
PNEUMONIA                        100.00   100.00   100.00
IMPETIGO                         100.00    71.43    83.33
VARICOSE VEINS                   100.00    90.00    94.74
FUNGAL INFECTION                 100.00   100.00   100.00
JAUNDICE                         100.00   100.00   100.00
DIMORPHIC HEMORRHOIDS            100.00   100.00   100.00
MALARIA                          100.00   100.00   100.00
PEPTIC ULCER DISEASE             100.00   100.00   100.00
COMMON COLD                      100.00   100.00   100.00
PSORIASIS                        100.00    60.00    75.00
ALLERGY                          100.00   100.00   100.00
CHICKEN POX                      100.00   100.00   100.00
HYPERTENSION                     100.00   100.00   100.00
DRUG REACTION                    100.00   100.00   100.00
DIABETES              

In [23]:
# Try the model on free-text symptom descriptions written by the "user" (me).
# doc.cats maps each diagnosis label to a score; the closer a score is to 1, the
# more likely the text falls into that category.

# spacy is already imported in the notebook's import cell, so it is not
# re-imported here.
trained_nlp = spacy.load("./model/model-best/")

text = [
    "I have been experiencing acidity, indigestion, headaches",
    "Head hurting in left side of head, neck pain, shoulder pain and I am on my period",
    "I have been experiencing a sore throat, a stuffy nose and I have a headache",
]
docs = list(trained_nlp.pipe(text))
for doc in docs:
    print(doc.text)
    print(f"{doc.cats}\n")


I have been experiencing acidity, indigestion, headaches
{'PNEUMONIA': 0.010443363338708878, 'IMPETIGO': 0.00251981895416975, 'VARICOSE VEINS': 0.010410028509795666, 'FUNGAL INFECTION': 0.013436523266136646, 'JAUNDICE': 0.03267307206988335, 'DIMORPHIC HEMORRHOIDS': 0.010466845706105232, 'MALARIA': 0.026393426582217216, 'PEPTIC ULCER DISEASE': 0.016425149515271187, 'COMMON COLD': 0.008905314840376377, 'PSORIASIS': 0.008089871145784855, 'ALLERGY': 0.0376221239566803, 'CHICKEN POX': 0.0039674220606684685, 'HYPERTENSION': 0.07068631798028946, 'DRUG REACTION': 0.030789611861109734, 'DIABETES': 0.006941908039152622, 'CERVICAL SPONDYLOSIS': 0.027056720107793808, 'URINARY TRACT INFECTION': 0.038606178015470505, 'ARTHRITIS': 0.02877870574593544, 'TYPHOID': 0.03485224395990372, 'DENGUE': 0.018169857561588287, 'MIGRAINE': 0.5283526182174683, 'GASTROESOPHAGEAL REFLUX DISEASE': 0.0102838771417737, 'ACNE': 0.009817141108214855, 'BRONCHIAL ASTHMA': 0.014311939477920532}

Head hurting in left side of 