In [2]:
%pip install -q "spacy>3.0.0" pandas sklearn 

Note: you may need to restart the kernel to use updated packages.


In [3]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [2]:
import pandas as pd

pd.options.display.max_colwidth = None
pd.options.display.max_rows = 6
data = pd.read_csv('Symptom2Disease.csv')
data

Unnamed: 0,id,diganosis,diganosis.1,text
0,0,PSORIASIS,Psoriasis,"I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches."
1,1,PSORIASIS,Psoriasis,"My skin has been peeling, especially on my knees, elbows, and scalp. This peeling is often accompanied by a burning or stinging sensation."
2,2,PSORIASIS,Psoriasis,"I have been experiencing joint pain in my fingers, wrists, and knees. The pain is often achy and throbbing, and it gets worse when I move my joints."
...,...,...,...,...
1197,297,DIABETES,diabetes,I regularly experience these intense urges and the want to urinate. I frequently feel drowsy and lost. I've also significantly lost my vision.
1198,298,DIABETES,diabetes,"I have trouble breathing, especially outside. I start to feel hot and start to sweat. I frequently have urinary tract infections and yeast infections."
1199,299,DIABETES,diabetes,"I constantly sneeze and have a dry cough. My infections don't seem to be healing, and I have palpitations. My throat does ache occasionally, but it usually gets better."


In [3]:
cats = data.diganosis.unique().tolist()
cats

['PSORIASIS',
 'VARICOSE VEINS',
 'TYPHOID',
 'CHICKEN POX',
 'IMPETIGO',
 'DENGUE',
 'FUNGAL INFECTION',
 'COMMON COLD',
 'PNEUMONIA',
 'DIMORPHIC HEMORRHOIDS',
 'ARTHRITIS',
 'ACNE',
 'BRONCHIAL ASTHMA',
 'HYPERTENSION',
 'MIGRAINE',
 'CERVICAL SPONDYLOSIS',
 'JAUNDICE',
 'MALARIA',
 'URINARY TRACT INFECTION',
 'ALLERGY',
 'GASTROESOPHAGEAL REFLUX DISEASE',
 'DRUG REACTION',
 'PEPTIC ULCER DISEASE',
 'DIABETES']

In [4]:
from typing import Set, List, Tuple 
from spacy.tokens import DocBin
import spacy

#load spacy pretrained model that we downloaded before 
nlp = spacy.load("en_core_web_md")

# Create function to ccreate spacy dataset
def make_docs(data: List[Tuple[str, str]], target_file: str, cats: Set[str]):
    docs = DocBin()
    # use nlp.pipe to efficiently process a large number of text inputs,
    # the as_tuple arguments enables giving a list of tuples as input 
    # and reuse it in the loop, here for the labels 
    
    for doc, label in nlp.pipe(data, as_tuples=True):
        # Encode the labels (assign 1 to diganosis)
        for cat in cats: 
            doc.cats[cat] = 1 if cat == label else 0
        docs.add(doc)
        docs.to_disk(target_file)
        return docs 

In [5]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(data["text"].values, data["diganosis"].values, test_size=0.3)
make_docs(list(zip(X_train, y_train)), "train.spacy", cats=cats)
make_docs(list(zip(X_test, y_test)), "test.spacy", cats=cats)



<spacy.tokens._serialize.DocBin at 0x20a8b3b7d30>

In [6]:
from spacy.cli.train import train as spacy_train 

config_path = "config.cfg"
output_model_path = "./output/spacy_textcat"
spacy_train(
    config_path, 
    output_path=output_model_path,
    overrides={
        "paths.train":"train.spacy",
        "paths.dev":"test.spacy"
    }
)

[38;5;4mℹ Saving to output directory: output\spacy_textcat[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.04        0.00    0.00
200     200          0.80        0.00    0.00
400     400          0.01        0.00    0.00
600     600          0.00        0.00    0.00
800     800          0.00        0.00    0.00
1000    1000          0.00        0.00    0.00
1200    1200          0.00        0.00    0.00
1400    1400          0.00        0.00    0.00
1600    1600          0.00        0.00    0.00
[38;5;2m✔ Saved pipeline to output directory[0m
output\spacy_textcat\model-last


In [26]:
import spacy 
trained_nlp = spacy.load("./output/spacy_textcat/model-best")
 
text = "I have a headache"
doc = trained_nlp(text)
doc.cats

{'PSORIASIS': 0.04082758352160454,
 'VARICOSE VEINS': 0.04082758352160454,
 'TYPHOID': 0.04082758352160454,
 'CHICKEN POX': 0.04082758352160454,
 'IMPETIGO': 0.04082758352160454,
 'DENGUE': 0.04082758352160454,
 'FUNGAL INFECTION': 0.04082758352160454,
 'COMMON COLD': 0.04082758352160454,
 'PNEUMONIA': 0.04082758352160454,
 'DIMORPHIC HEMORRHOIDS': 0.06096552684903145,
 'ARTHRITIS': 0.04082758352160454,
 'ACNE': 0.04082758352160454,
 'BRONCHIAL ASTHMA': 0.04082758352160454,
 'HYPERTENSION': 0.04082758352160454,
 'MIGRAINE': 0.04082758352160454,
 'CERVICAL SPONDYLOSIS': 0.04082758352160454,
 'JAUNDICE': 0.04082758352160454,
 'MALARIA': 0.04082758352160454,
 'URINARY TRACT INFECTION': 0.04082758352160454,
 'ALLERGY': 0.04082758352160454,
 'GASTROESOPHAGEAL REFLUX DISEASE': 0.04082758352160454,
 'DRUG REACTION': 0.04082758352160454,
 'PEPTIC ULCER DISEASE': 0.04082758352160454,
 'DIABETES': 0.04082758352160454}