In [52]:
# Emily Sear
# CSC 417
# Final Project - NLP to create a diagnosis predictor given a brief description of symptoms

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy as spacy

In [2]:
# Blank English pipeline; used further down to turn raw symptom text into Doc objects.
nlp = spacy.blank("en")

# Load the symptom-description dataset and preview the first rows.
# (A bare `data.shape[0]` in the middle of a cell is never displayed, so it was dropped.)
data = pd.read_csv('Symptom2Disease.csv')
data.head()

Unnamed: 0,id,diganosis,diganosis.1,text
0,0,PSORIASIS,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,PSORIASIS,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,PSORIASIS,Psoriasis,I have been experiencing joint pain in my fing...
3,3,PSORIASIS,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,PSORIASIS,Psoriasis,"My nails have small dents or pits in them, and..."


In [3]:
# shuffle the data
# A fixed random_state makes the permutation (and the preview below) repeatable
# on a fresh kernel; without it every run produces a different row order.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,id,diganosis,diganosis.1,text
0,62,ALLERGY,allergy,I have trouble breathing and get short of brea...
1,72,HYPERTENSION,Hypertension,"I've had intense aches in my chest, a head pa..."
2,43,FUNGAL INFECTION,Fungal infection,"There are red, bumpy areas on my skin, and I'v..."
3,173,DIMORPHIC HEMORRHOIDS,Dimorphic Hemorrhoids,It's incredibly difficult for me to use the re...
4,236,JAUNDICE,Jaundice,"I've been feeling scratchy, sick, and worn out..."


In [4]:
# Keep only the symptom text and its `diganosis` label, reshuffle with a fixed
# seed so the resulting order is repeatable for a given input frame, and flatten
# the rows into plain (text, label) tuples.
data = list(
    data[["text", "diganosis"]]
    .sample(frac=1, random_state=42)
    .itertuples(index=False, name=None)
)


In [5]:
# Positional split of the (text, label) list:
# first 900 items -> train, next 180 -> dev, everything after index 1080 -> test.
train_data, dev_data, test_data = data[:900], data[900:1080], data[1080:]

In [6]:
# convert data 
def convert(data, outfile):
    """Serialize labelled texts into a spaCy ``DocBin`` file.

    Args:
        data: Iterable of ``(text, label)`` pairs.
        outfile: Path the serialized ``DocBin`` is written to.

    Each text is processed with the notebook-level ``nlp`` pipeline and its
    label is stored as the only entry set in ``doc.cats``.
    """
    db = spacy.tokens.DocBin()
    # as_tuples=True makes nlp.pipe yield (doc, context) pairs, so every label
    # stays attached to the Doc built from its own text.
    for doc, label in nlp.pipe(data, as_tuples=True):
        doc.cats[label] = True
        db.add(doc)
    db.to_disk(outfile)

In [7]:
# Write one DocBin file per split, in train / dev / test order.
for split, outfile in (
    (train_data, "./train_data.spacy"),
    (dev_data, "./dev_data.spacy"),
    (test_data, "./test_data.spacy"),
):
    convert(split, outfile)

In [13]:
# Generate config.cfg for an English pipeline with a single `textcat` component,
# optimized for efficiency; --force overwrites an existing config.cfg.
!python -m spacy init config  --lang en --pipeline textcat --optimize efficiency --force config.cfg

[i] Generated config template specific for your use case
- Language: en
- Pipeline: textcat
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[+] Auto-filled config with all values
[+] Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!python -m spacy train config.cfg --paths.train ./train_data.spacy --paths.dev ./dev_data.spacy --output model-accuracy --verbose

In [10]:
# Score the pipeline stored in ./model/model-best/ against the test split written by convert().
!python -m spacy evaluate ./model/model-best/ ./test_data.spacy 

[i] Using CPU
[1m

TOK                 100.00
TEXTCAT (macro F)   97.34 
SPEED               410293

[1m

                                      P        R        F
PNEUMONIA                        100.00   100.00   100.00
IMPETIGO                         100.00    71.43    83.33
VARICOSE VEINS                   100.00    90.00    94.74
FUNGAL INFECTION                 100.00   100.00   100.00
JAUNDICE                         100.00   100.00   100.00
DIMORPHIC HEMORRHOIDS            100.00   100.00   100.00
MALARIA                          100.00   100.00   100.00
PEPTIC ULCER DISEASE             100.00   100.00   100.00
COMMON COLD                      100.00   100.00   100.00
PSORIASIS                        100.00    60.00    75.00
ALLERGY                          100.00   100.00   100.00
CHICKEN POX                      100.00   100.00   100.00
HYPERTENSION                     100.00   100.00   100.00
DRUG REACTION                    100.00   100.00   100.00
DIABETES              

In [12]:
import spacy

# Load the pipeline saved under ./model/model-best/.
trained_nlp = spacy.load("./model/model-best/")

text = "I have been experiencing acidity, indigestion, headaches, and blurred and distorted vision, as well as excessive hunger, a stiff neck, depression, irritability, and visual disturbance."
# nlp.pipe expects an iterable of texts. Passing the bare string would iterate
# it character by character and classify each character as its own document,
# so the single description is wrapped in a list.
docs = list(trained_nlp.pipe([text]))
for doc in docs:
    print(doc.text)
    print(f"{doc.cats}\n")


{'PNEUMONIA': 9.685400073067285e-07,
 'IMPETIGO': 2.69296492660942e-07,
 'VARICOSE VEINS': 1.3929277997704048e-07,
 'FUNGAL INFECTION': 1.839542619563872e-06,
 'JAUNDICE': 2.51103278969822e-06,
 'DIMORPHIC HEMORRHOIDS': 1.7032590449161944e-07,
 'MALARIA': 1.1665860256471206e-05,
 'PEPTIC ULCER DISEASE': 1.0086430393130286e-06,
 'COMMON COLD': 1.0708304216677789e-06,
 'PSORIASIS': 3.272673438914353e-07,
 'ALLERGY': 2.8842666779382853e-06,
 'CHICKEN POX': 3.658290026464783e-08,
 'HYPERTENSION': 6.0253700212342665e-05,
 'DRUG REACTION': 6.926927653694293e-07,
 'DIABETES': 1.1546692348929355e-06,
 'CERVICAL SPONDYLOSIS': 2.866413888114039e-05,
 'URINARY TRACT INFECTION': 6.055526569070935e-07,
 'ARTHRITIS': 3.4451118153810967e-06,
 'TYPHOID': 2.29320448852377e-06,
 'DENGUE': 5.004833951716137e-07,
 'MIGRAINE': 0.999873161315918,
 'GASTROESOPHAGEAL REFLUX DISEASE': 9.020874927045952e-07,
 'ACNE': 1.5371513484296884e-07,
 'BRONCHIAL ASTHMA': 5.105261607241118e-06}

In [73]:
def diganoses_chatbot(model=None):
    """Ask the user for a symptom description and optionally predict a diagnosis.

    Args:
        model: Optional callable (for example a loaded spaCy text-classification
            pipeline) that takes the symptom text and returns an object whose
            ``cats`` attribute is a ``{label: score}`` mapping. When omitted,
            the function only echoes the entered text.

    Returns:
        None. All results are reported with ``print``.
    """
    print("this is your diganoses chatbot, please list your symptoms and I will predict your diganoses for you.")
    textDoc = input("What are your symptonms>\n")
    print(textDoc)

    # No model supplied: nothing to predict, so stop after echoing the input.
    if model is None:
        return

    cats = model(textDoc).cats
    # A pipeline without a text classifier yields an empty mapping; max() would raise on it.
    if not cats:
        print("No prediction available.")
        return

    best_label = max(cats, key=cats.get)
    print(f"Predicted diagnosis: {best_label} (score {cats[best_label]:.3f})")
    

In [66]:
diganoses_chatbot()

this is your diganoses chatbot, please list your symptoms and I will predict your diganoses for you.
What are your symptonms>
Head hurting in left side of head, neck pain, shoulder pain and I am on my period
Head hurting in left side of head, neck pain, shoulder pain and I am on my period
