In [45]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from pipeline_nodes import convert_categorical_to_dummy

In [9]:
specialty_cols = ['Code', 'Grouping', 'Classification', 'Specialization', 'Definition']
specialty_df = pd.read_csv('https://s3-us-west-1.amazonaws.com/physician-referral-graph/nucc_taxonomy_180.csv', index_col=['Code'], dtype=str, usecols=specialty_cols)

In [42]:
specialty_df.replace('Definition to come...', np.nan, inplace=True)

In [47]:
specialty_categorical_cols = ['Grouping', 'Classification', 'Specialization']
specialty_df_dummies = convert_categorical_to_dummy(specialty_df, specialty_categorical_cols)

In [68]:
specialty_df_dummies.shape

(854, 729)

In [52]:
wordnet = WordNetLemmatizer()

In [53]:
def tokenize(doc):
    '''
    INPUT: string
    OUTPUT: list of strings

    Tokenize and stem/lemmatize the document.
    '''
    return [wordnet.lemmatize(word) for word in word_tokenize(doc.lower())]

In [54]:
tfidfvect = TfidfVectorizer(stop_words='english', tokenizer=tokenize)

In [64]:
tfidfvectorizer = tfidfvect.fit_transform(specialty_df_dummies['Definition'].fillna(' '))

In [63]:
tfidfvect.vocabulary_

{'provider': 2497,
 'trained': 3118,
 'educated': 1000,
 'performance': 2258,
 'behavior': 359,
 'health': 1394,
 'service': 2836,
 'interpersonal': 1632,
 'communication': 591,
 'analysis': 210,
 '.': 11,
 'training': 3120,
 'education': 1002,
 'specialty': 2911,
 'level': 1738,
 'usually': 3212,
 'requires': 2690,
 'master': 1834,
 '’': 3316,
 's': 2763,
 'degree': 816,
 'clinical': 541,
 'experience': 1162,
 'supervision': 3013,
 'licensure': 1745,
 'certification': 482,
 'psychoanalysis': 2508,
 'comprehensive': 619,
 ',': 8,
 'theoretical': 3079,
 'framework': 1277,
 'applied': 243,
 'treatment': 3146,
 'process': 2450,
 'consists': 655,
 'intensive': 1614,
 'verbal': 3237,
 'therapeutic': 3081,
 'relationship': 2655,
 'analyst': 211,
 'analysand': 209,
 'aim': 171,
 'symptom': 3038,
 'relief': 2660,
 'emotional': 1049,
 'growth': 1366,
 'personal': 2279,
 'integration': 1608,
 'psychoanalytic': 2509,
 'includes': 1530,
 'limited': 1760,
 'recognition': 2581,
 'unconscious': 3173,

In [67]:
tfidfvectorizer.shape

(854, 3317)

In [69]:
type(tfidfvectorizer)

scipy.sparse.csr.csr_matrix

In [72]:
tfidfvect.get_feature_names()

['%',
 '&',
 "'",
 "''",
 "'hospitalist",
 "'s",
 '(',
 ')',
 ',',
 '-',
 '-that',
 '.',
 '/',
 '/strong',
 '0',
 '1',
 '12',
 '1603',
 '1974',
 '1988',
 '1993',
 '1996',
 '2',
 '20',
 '200',
 '21',
 '24',
 '24-hour',
 '25',
 '282j00000x',
 '3',
 '4',
 '42',
 '450',
 '493.1405',
 '5',
 '50',
 '6',
 '797-compliant',
 ':',
 ';',
 '<',
 '>',
 '[',
 ']',
 '``',
 'a.d.n',
 'a10',
 'abdomen',
 'abdominal',
 'ability',
 'ablation',
 'able',
 'abnormal',
 'abnormality',
 'abortion',
 'abortion/family',
 'absence',
 'absent',
 'absorption',
 'abuse',
 'academic',
 'accept',
 'acceptance',
 'access',
 'accessible',
 'accessing',
 'accident',
 'accommodate',
 'accommodation',
 'accomplished',
 'accord',
 'accordance',
 'according',
 'accountability',
 'accountable',
 'accreditation',
 'accredited',
 'accuracy',
 'accurate',
 'accurately',
 'acetabular',
 'achievable',
 'achieve',
 'achievement',
 'acms',
 'acms-approved',
 'acnm',
 'acote',
 'acquire',
 'acquired',
 'acquiring',
 'acquisition',
 