In [21]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize, corpus         
from nltk.stem import WordNetLemmatizer
from pipeline_nodes import convert_categorical_to_dummy

In [2]:
specialty_cols = ['Code', 'Grouping', 'Classification', 'Specialization', 'Definition']
specialty_df = pd.read_csv('https://s3-us-west-1.amazonaws.com/physician-referral-graph/nucc_taxonomy_180.csv', dtype=str, usecols=specialty_cols)

In [3]:
specialty_df.replace('Definition to come...', np.nan, inplace=True)

In [4]:
specialty_categorical_cols = ['Grouping', 'Classification', 'Specialization']
specialty_df_dummies = convert_categorical_to_dummy(specialty_df, specialty_categorical_cols)

In [5]:
specialty_df_dummies.shape

(854, 729)

In [6]:
wordnet = WordNetLemmatizer()

In [7]:
stopwords = set(corpus.stopwords.words('english') + list(string.punctuation))

In [22]:
def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize and stem/lemmatize the document.
    '''
    return [wordnet.lemmatize(word) for word in word_tokenize(re.sub('[^a-z\s]', '', doc.lower()))]

In [23]:
vectorizer = TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)

In [24]:
vectors = vectorizer.fit_transform(specialty_df_dummies['Definition'].fillna('')).toarray()

In [25]:
words = vectorizer.get_feature_names()

In [26]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[-1:-n-1:-1]]

In [27]:
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
get_top_values(avg, 10, words)

['marriage',
 'come',
 'coding',
 'definition',
 'dance',
 'music',
 'obesity',
 'jx',
 'inactive',
 'poetry']

In [28]:
total = np.sum(vectors, axis=0)
get_top_values(total, 10, words)

['care',
 'patient',
 'service',
 'treatment',
 'disease',
 'individual',
 'health',
 'disorder',
 'facility',
 'medicine']

In [29]:
tokenized_queries = vectorizer.transform(['primary care provider', 'sickness', 'elderly', 'urgent care', 'preventive care', 'behaviour change'])
cosine_similarities = linear_kernel(tokenized_queries, vectors)
titles = specialty_df['Grouping'].astype(str) + specialty_df['Classification'].astype(str) + specialty_df['Specialization'].astype(str)

In [30]:
for i, definition in enumerate(['primary care provider', 'sickness', 'elderly', 'urgent care', 'preventive care', 'behaviour change']):
    print(i)
    print(definition)
    print(get_top_values(cosine_similarities[i], 3, titles))

0
primary care provider
['Dental ProvidersDentistGeneral Practice', 'Managed Care OrganizationsExclusive Provider Organizationnan', 'Allopathic & Osteopathic PhysiciansPediatricsNeonatal-Perinatal Medicine']
1
sickness
['Other Service ProvidersPrevention Professionalnan', 'Allopathic & Osteopathic PhysiciansObstetrics & GynecologyHospice and Palliative Medicine', 'Allopathic & Osteopathic PhysiciansObstetrics & GynecologyFemale Pelvic Medicine and Reconstructive Surgery']
2
elderly
['Allopathic & Osteopathic PhysiciansPsychiatry & NeurologyGeriatric Psychiatry', 'Nursing & Custodial Care FacilitiesCustodial Care FacilityAdult Care Home', 'Allopathic & Osteopathic PhysiciansInternal MedicineGeriatric Medicine']
3
urgent care
['Hospital UnitsEpilepsy Unitnan', 'SuppliersNon-Pharmacy Dispensing Sitenan', 'Respiratory, Developmental, Rehabilitative and Restorative Service ProvidersRespiratory Therapist, RegisteredSNF/Subacute Care']
4
preventive care
['Allopathic & Osteopathic PhysiciansPr

In [31]:
description_vect_df = pd.DataFrame(vectors, columns=words)

In [32]:
description_vect_df.columns

Index(['abdomen', 'abdominal', 'ability', 'ablation', 'able', 'abnormal',
       'abnormality', 'abortion', 'abortionfamily', 'absence',
       ...
       'writing', 'written', 'x', 'xray', 'xrays', 'year', 'young', 'zift',
       'zone', 'zygote'],
      dtype='object', length=3329)

In [39]:
combined_df = pd.concat([specialty_df, description_vect_df], axis=1)

In [41]:
combined_df.shape

(1708, 3333)

In [40]:
combined_df

Unnamed: 0,Grouping,Classification,Specialization,Definition,abdomen,abdominal,ability,ablation,able,abnormal,...,writing,written,x,xray,xrays,year,young,zift,zone,zygote
101Y00000X,Behavioral Health & Social Service Providers,Counselor,,A provider who is trained and educated in the ...,,,,,,,...,,,,,,,,,,
101YA0400X,Behavioral Health & Social Service Providers,Counselor,Addiction (Substance Use Disorder),,,,,,,,...,,,,,,,,,,
101YM0800X,Behavioral Health & Social Service Providers,Counselor,Mental Health,,,,,,,,...,,,,,,,,,,
101YP1600X,Behavioral Health & Social Service Providers,Counselor,Pastoral,,,,,,,,...,,,,,,,,,,
101YP2500X,Behavioral Health & Social Service Providers,Counselor,Professional,,,,,,,,...,,,,,,,,,,
101YS0200X,Behavioral Health & Social Service Providers,Counselor,School,,,,,,,,...,,,,,,,,,,
102L00000X,Behavioral Health & Social Service Providers,Psychoanalyst,,"Psychoanalysis is a comprehensive, theoretical...",,,,,,,...,,,,,,,,,,
102X00000X,Behavioral Health & Social Service Providers,Poetry Therapist,,A medical or mental health professional who ha...,,,,,,,...,,,,,,,,,,
103G00000X,Behavioral Health & Social Service Providers,Clinical Neuropsychologist,,"An individual with a doctorate degree, licensu...",,,,,,,...,,,,,,,,,,
103GC0700X,Behavioral Health & Social Service Providers,Clinical Neuropsychologist,Clinical,,,,,,,,...,,,,,,,,,,


In [35]:
specialty_df.shape

(854, 4)

In [36]:
description_vect_df.shape

(854, 3329)