In [1]:
import numpy as np
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize, corpus         
from nltk.stem import WordNetLemmatizer
from pipeline_nodes import convert_categorical_to_dummy

In [2]:
specialty_cols = ['Code', 'Grouping', 'Classification', 'Specialization', 'Definition']
specialty_df = pd.read_csv('https://s3-us-west-1.amazonaws.com/physician-referral-graph/nucc_taxonomy_180.csv', dtype=str, usecols=specialty_cols)

In [3]:
specialty_df.replace('Definition to come...', np.nan, inplace=True)

In [4]:
specialty_categorical_cols = ['Grouping', 'Classification', 'Specialization']
specialty_df_dummies = convert_categorical_to_dummy(specialty_df, specialty_categorical_cols)

In [5]:
specialty_df_dummies.shape

(854, 730)

In [6]:
wordnet = WordNetLemmatizer()

In [7]:
stopwords = set(corpus.stopwords.words('english') + list(string.punctuation))

In [8]:
def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize and stem/lemmatize the document.
    '''
    return [wordnet.lemmatize(word) for word in word_tokenize(re.sub('[^a-z\s]', '', doc.lower()))]

In [9]:
vectorizer = TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize)

In [10]:
vectors = vectorizer.fit_transform(specialty_df_dummies['Definition'].fillna('')).toarray()

In [11]:
words = vectorizer.get_feature_names()

In [12]:
def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[-1:-n-1:-1]]

In [13]:
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
get_top_values(avg, 10, words)

['marriage',
 'come',
 'coding',
 'definition',
 'dance',
 'music',
 'obesity',
 'jx',
 'inactive',
 'poetry']

In [14]:
total = np.sum(vectors, axis=0)
get_top_values(total, 10, words)

['care',
 'patient',
 'service',
 'treatment',
 'disease',
 'individual',
 'health',
 'disorder',
 'facility',
 'medicine']

In [15]:
tokenized_queries = vectorizer.transform(['primary care provider', 'sickness', 'elderly', 'urgent care', 'preventive care', 'behaviour change'])
cosine_similarities = linear_kernel(tokenized_queries, vectors)
titles = specialty_df['Grouping'].astype(str) + specialty_df['Classification'].astype(str) + specialty_df['Specialization'].astype(str)

In [16]:
for i, definition in enumerate(['primary care provider', 'sickness', 'elderly', 'urgent care', 'preventive care', 'behaviour change']):
    print(i)
    print(definition)
    print(get_top_values(cosine_similarities[i], 3, titles))

0
primary care provider
['Dental ProvidersDentistGeneral Practice', 'Managed Care OrganizationsExclusive Provider Organizationnan', 'Allopathic & Osteopathic PhysiciansPediatricsNeonatal-Perinatal Medicine']
1
sickness
['Other Service ProvidersPrevention Professionalnan', 'Allopathic & Osteopathic PhysiciansObstetrics & GynecologyHospice and Palliative Medicine', 'Allopathic & Osteopathic PhysiciansObstetrics & GynecologyFemale Pelvic Medicine and Reconstructive Surgery']
2
elderly
['Allopathic & Osteopathic PhysiciansPsychiatry & NeurologyGeriatric Psychiatry', 'Nursing & Custodial Care FacilitiesCustodial Care FacilityAdult Care Home', 'Allopathic & Osteopathic PhysiciansInternal MedicineGeriatric Medicine']
3
urgent care
['Hospital UnitsEpilepsy Unitnan', 'SuppliersNon-Pharmacy Dispensing Sitenan', 'Respiratory, Developmental, Rehabilitative and Restorative Service ProvidersRespiratory Therapist, RegisteredSNF/Subacute Care']
4
preventive care
['Allopathic & Osteopathic PhysiciansPr

In [17]:
description_vect_df = pd.DataFrame(vectors, columns=words)

In [18]:
description_vect_df.columns

Index(['abdomen', 'abdominal', 'ability', 'ablation', 'able', 'abnormal',
       'abnormality', 'abortion', 'abortionfamily', 'absence',
       ...
       'writing', 'written', 'x', 'xray', 'xrays', 'year', 'young', 'zift',
       'zone', 'zygote'],
      dtype='object', length=3329)

In [19]:
combined_df = pd.concat([specialty_df_dummies, description_vect_df], axis=1).drop(['Definition'], axis=1)

In [21]:
d = {}
for col in combined_df.columns:
    d[col] = combined_df[col].value_counts

# Let's play with Physician NPI

In [23]:
physician_cols = ['NPI', 'Entity Type Code', 'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 'Provider Business Practice Location Address Country Code (If outside US)', 'Provider Gender Code', 'Healthcare Provider Taxonomy Code_1']

In [24]:
physician_dtype = {'NPI':int, 'Entity Type Code':float, 'Provider Business Practice Location Address City Name':str, 'Provider Business Practice Location Address State Name':str, 'Provider Business Practice Location Address Country Code (If outside US)':str, 'Provider Gender Code':str, 'Healthcare Provider Taxonomy Code_1':str}

In [25]:
physician_df = pd.read_csv('../data/samples/npidata_pfile_20050523-20180408_withHeader-subsample.csv', dtype=physician_dtype, usecols=physician_cols)

In [26]:
physician_df['Entity Type Code'].replace([1,2], ['Individual', 'Organization'], inplace=True)

In [27]:
physician_df

Unnamed: 0,NPI,Entity Type Code,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Country Code (If outside US),Provider Gender Code,Healthcare Provider Taxonomy Code_1
0,1881931285,Organization,FRANKLIN,NC,US,,174400000X
1,1992931828,Organization,BALTIMORE,MD,US,,251E00000X
2,1508018706,Individual,MANHATTAN,NY,US,F,363AS0400X
3,1740225770,Individual,CHARLES TOWN,WV,US,M,207Q00000X
4,1609388560,Individual,DANBURY,CT,US,F,163WL0100X
5,1629571955,Individual,SPRINGFIELD GARDENS,NY,US,F,163W00000X
6,1356557623,Individual,ROYAL OAK,MI,US,M,207X00000X
7,1780079145,Individual,BROOKLYN,NY,US,M,390200000X
8,1669461463,Individual,FT LAUDERDALE,FL,US,M,207RC0000X
9,1053710715,Individual,GAINESVILLE,FL,US,M,183500000X


In [32]:
physician_categoricals = ['Entity Type Code', 'Provider Business Practice Location Address City Name', 'Provider Business Practice Location Address State Name', 'Provider Business Practice Location Address Country Code (If outside US)', 'Provider Gender Code']

In [33]:
physician_df_dummies = convert_categorical_to_dummy(physician_df, physician_categoricals)

In [35]:
physician_df_dummies.shape

(1000, 719)

In [29]:
city_state_country

Unnamed: 0,Provider Business Practice Location Address City Name,Provider Business Practice Location Address State Name,Provider Business Practice Location Address Country Code (If outside US)
0,FRANKLIN,NC,US
1,BALTIMORE,MD,US
2,MANHATTAN,NY,US
3,CHARLES TOWN,WV,US
4,DANBURY,CT,US
5,SPRINGFIELD GARDENS,NY,US
6,ROYAL OAK,MI,US
7,BROOKLYN,NY,US
8,FT LAUDERDALE,FL,US
9,GAINESVILLE,FL,US


In [30]:
city_state_country_dummies = convert_categorical_to_dummy(city_state_country, ['Provider Business Practice Location Address City Name','Provider Business Practice Location Address State Name','Provider Business Practice Location Address Country Code (If outside US)'])

In [36]:
city_state_country_dummies.shape

(1000, 713)