In [3]:
import re, time
import pandas as pd
import pyspark as ps
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, ArrayType, DoubleType
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.mllib.linalg.distributed import RowMatrix

In [5]:
spark = ps.sql.SparkSession.builder \
            .appName("capstone") \
            .master("local[8]") \
            .getOrCreate()
#             .config('spark.driver.extraClassPath','postgresql-9.1-901-1.jdbc4.jar') \
            

sc = spark.sparkContext  # for the pre-2.0 sparkContext

In [6]:
#Check: Spark Context for RDDS and SQL Context for Dataframes
sc, sqlCtx

(<SparkContext master=local[4] appName=PySparkShell>,
 <pyspark.sql.context.SQLContext at 0x1149080b8>)

In [14]:
# link = 's3n://gschoolcapstone/npidata_20050523-20170813.csv'
link = '../data/npidata_20050523-20170813.csv'
df = spark.read.csv(link, header=True, inferSchema=True).limit(10)

In [17]:
df.write.csv('subset')

In [16]:
df.limit(5).toPandas()

Unnamed: 0,NPI,Entity Type Code,Replacement NPI,Employer Identification Number (EIN),Provider Organization Name (Legal Business Name),Provider Last Name (Legal Name),Provider First Name,Provider Middle Name,Provider Name Prefix Text,Provider Name Suffix Text,...,Healthcare Provider Taxonomy Group_6,Healthcare Provider Taxonomy Group_7,Healthcare Provider Taxonomy Group_8,Healthcare Provider Taxonomy Group_9,Healthcare Provider Taxonomy Group_10,Healthcare Provider Taxonomy Group_11,Healthcare Provider Taxonomy Group_12,Healthcare Provider Taxonomy Group_13,Healthcare Provider Taxonomy Group_14,Healthcare Provider Taxonomy Group_15
0,1679576722,1,,,,WIEBE,DAVID,A,,,...,,,,,,,,,,
1,1588667638,1,,,,PILCHER,WILLIAM,C,DR.,,...,,,,,,,,,,
2,1497758544,2,,<UNAVAIL>,"CUMBERLAND COUNTY HOSPITAL SYSTEM, INC",,,,,,...,,,,,,,,,,
3,1306849450,1,,,,SMITSON,HAROLD,LEROY,DR.,II,...,,,,,,,,,,
4,1215930367,1,,,,GRESSOT,LAURENT,,DR.,,...,,,,,,,,,,


In [18]:
# Rename columns in proper format
cols = df.columns
new_cols = [col.replace('(', '').replace(')', '').replace('.', '').replace(' ', '_') for col in cols]
for old, new in zip(cols, new_cols):
    df = df.withColumnRenamed(old, new)

In [None]:
# df.columns

In [23]:
# Filter where NPIs are active
df = df.filter(df.Entity_Type_Code.isNotNull())

In [28]:
# Provider_Gender_Code: M, F, null, GUTHMILLER
# df.select('Provider_Gender_Code').filter("Provider_Gender_Code == 'GUTHMILLER'").show()
df = df.withColumn('Gender', regexp_replace(df.Provider_Gender_Code, 'GUTHMILLER', 'X'))

In [None]:
df.createOrReplaceTempView("npi")
spark.sql('SELECT Gender FROM npi GROUP BY Gender').show()

In [None]:
# Fill na values
na_dict = {'Gender': 'X', 
           'Is_Sole_Proprietor': 'X', 
           'Is_Organization_Subpart': 'X',
           'Provider_Credential_Text': 'X'}
#            'Healthcare_Provider_Taxonomy_Code_1': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_2': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_3': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_4': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_5': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_6': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_7': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_8': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_9': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_10': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_11': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_12': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_13': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_14': 'NA',
#            'Healthcare_Provider_Taxonomy_Code_15': 'NA', 
df = df.na.fill(na_dict)

In [None]:
def formatting(x):
    x = x.upper()
    x = re.sub(re.compile('\.|\>|\`'), '', x)
    x = re.sub(re.compile('\,|\;|\-|\(|\)|\/'), ' ', x)
    x = re.sub(re.compile('\s+'), ' ', x)
#     x = x.replace('M D', 'MD')
    x = re.sub("M D", "MD", x)
    x = re.sub("D C", "DC", x)
    x = re.sub("P C", "PC", x)
    x = re.sub("D P M", "DPM", x)
    x = re.sub("D O", "DO", x)
    x = re.sub("O D", "OD", x)
    x = re.sub("0D", "OD", x)
    x = re.sub("PHARMD", "RPH ", x)
    x = re.sub("PHYSICIAN ASSISTANT", "PA", x)
    x = re.sub("NURSE PRACTITIONER", "NP", x)
    x = re.sub("PHYSICAL THERAPIST", "PT", x)
    x = re.sub("(BS IN PHARMACY|BS PHARMACY|DOCTOR OF PHARMACY|PHARMACIST|PHARMD)", " RPH ", x)
    x = re.sub("[\d]", "", x) # remove numbers
    x = x.strip()
    return x

format_udf = udf(formatting, StringType())
# print(formatting('hey. this is john. . . .'))

In [None]:
# df = df.withColumn('test', format_udf(col('Provider_Credential_Text')))
# df.show(5)
# test = df.withColumn('new', regexp_replace(df.Provider_Credential_Text, '\.', ''))
# test.select('Provider_Credential_Text','new').show(10)

df = df.withColumn('Credentials', format_udf(df.Provider_Credential_Text))
# df.select('Provider_Credential_Text', 'Credentials').show(5)

In [None]:
df.select('Gender','Is_Sole_Proprietor','Is_Organization_Subpart','Credentials').show(5)

### Spark pipeline to get feature vector

In [None]:
stages, feature_cols = [], []

In [None]:
for col in my_cols:
    stages.append(StringIndexer(inputCol=col, outputCol=col+'_idx', handleInvalid='error'))
    stages.append(OneHotEncoder(dropLast=True, inputCol=col+'_idx', outputCol=col+'_ohe'))
    feature_cols.append(col+'_ohe')
stages.append(VectorAssembler(inputCols=feature_cols, outputCol='features'))

In [None]:
# df = df.drop('Provider_Gender_Code_idx')
# df = df.drop('Provider_Gender_Code_ohe')

In [None]:
col = 'Provider_Gender_Code'
stridx = StringIndexer(inputCol=col, outputCol=col+'_idx', handleInvalid='error')

In [None]:
# model = stridx.fit(df)
# df = model.transform(df)
# {i: label for i, label in enumerate(model.labels)}

In [None]:
# df.select('Gender').show()

In [None]:
# df.columns

In [None]:
ohe = OneHotEncoder(dropLast=True, inputCol=col+'_idx', outputCol=col+'_ohe')

In [None]:
# df = ohe.transform(df)

In [None]:
# df.columns

In [None]:
# df.select('Gender').show()
# df.select('Gender_').show()

In [None]:
features = [col+'_ohe']
va = VectorAssembler(inputCols=features, outputCol='features')

https://spark.apache.org/docs/1.6.1/ml-guide.html#example-pipeline

In [None]:
pipeline = Pipeline(stages = [stridx, ohe, va])
model = pipeline.fit(df)
df = model.transform(df)

In [None]:
df.select('NPI', 'features').show(5)

In [None]:
# cache processed dataframe/model
df.persist() 
# df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
# df.unpersist()