In [178]:
import pandas as pd
import pyspark as ps
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, ArrayType
from pyspark.sql.functions import *
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml import Pipeline
from pyspark.mllib.linalg.distributed import RowMatrix

In [42]:
# Check that the SparkContext (used for RDDs) and the SQLContext (used for DataFrames) are running
# sc, sqlCtx

In [43]:
# Create (or reuse) the session: local master "local[8]", app name "capstone".
spark = (
    ps.sql.SparkSession.builder
    .master("local[8]")
    .appName("capstone")
    .getOrCreate()
)

# Handle to the underlying SparkContext (the pre-2.0 entry point, used for RDDs).
sc = spark.sparkContext

http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark.sql.DataFrameReader.csv

In [None]:
# schema = StructType( [
#     StructField('NPI', IntegerType(), True), 
#     StructField('Entity Type Code', IntegerType(), True),
#     StructField('Replacement NPI', IntegerType(), True)
#     ] )

In [None]:
# Full NPI dump on S3; the first row holds the column names and the
# column types are inferred from the data.
link = 's3n://gschoolcapstone/npidata_20050523-20170813.csv'
# .option("maxColumns", 309)
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load(link)
)
# .limit(100)

In [None]:
# df.dtypes
# df.printSchema()

In [None]:
# df.write.json('subset')
# df.write.format('json').save('../data/subset.json')

In [113]:
# Load the local JSON subset. This rebinds `df`, so from here on the notebook
# works on the subset rather than on the frame read from the S3 CSV.
df = spark.read.json('../data/subset.json')

In [None]:
# df.count(), len(df.columns)

In [None]:
# pdf = pd.read_csv(link, nrows=1000)
# pdf['Provider Business Mailing Address State Name'].unique()
# len(pdf.columns)

In [114]:
# Rename columns into a SQL-friendly format: strip '(', ')' and '.', and
# turn spaces into underscores.
cols = df.columns
new_cols = [
    name.replace('(', '').replace(')', '').replace('.', '').replace(' ', '_')
    for name in cols
]
# toDF() renames every column positionally in a single projection. The
# previous per-column withColumnRenamed loop added one projection per column
# to the query plan, which gets slow on a frame with hundreds of columns.
df = df.toDF(*new_cols)

http://spark.apache.org/docs/2.2.0/api/python/pyspark.sql.html#pyspark-sql-module

In [115]:
# Register the current `df` as the SQL view "npi".
# NOTE: the view is tied to the DataFrame object passed here. Cells that later
# rebind `df` (filter / replace / fill) do not change what `FROM npi` returns
# unless the view is registered again.
df.createOrReplaceTempView("npi")
# df.registerTempTable('npi')
# spark.sql('SELECT * FROM npi').show()

In [116]:
# Keep only rows that have an Entity_Type_Code (i.e. active NPIs).
# Open question: do the update files contain reactivated providers?
has_entity_type = df['Entity_Type_Code'].isNotNull()
df = df.where(has_entity_type)
# spark.sql('SELECT Entity_Type_Code FROM npi GROUP BY Entity_Type_Code').show()

In [117]:
# Provider_Gender_Code: M, F, null, GUTHMILLER
# df.select('Provider_Gender_Code').filter("Provider_Gender_Code == 'GUTHMILLER'").show()
# Map the stray 'GUTHMILLER' value to the placeholder 'X'.
df = df.replace('GUTHMILLER', 'X', subset=['Provider_Gender_Code'])

# Inspect `df` itself rather than the "npi" temp view: the view was registered
# before this replacement, so querying it keeps showing the old values (this
# is why the replacement looked like it was not being applied).
# replace() only swaps matching strings; nulls stay null here and are filled
# by the na.fill cell further down.
df.select('Provider_Gender_Code').distinct().show()

+--------------------+
|Provider_Gender_Code|
+--------------------+
|                   F|
|                null|
|                   M|
+--------------------+



In [118]:
# Fill na values
# Every column listed here gets its nulls replaced by the placeholder 'X'.
na_dict = dict.fromkeys(
    [
        'Provider_Gender_Code',
        'Is_Sole_Proprietor',
        'Is_Organization_Subpart',
        'Provider_Credential_Text',
    ],
    'X',
)
# Healthcare_Provider_Taxonomy_Code_1 ... Healthcare_Provider_Taxonomy_Code_15
# were candidates for an 'NA' fill but are currently left untouched.
df = df.na.fill(na_dict)
df.select('Provider_Credential_Text').show(10)

+------------------------+
|Provider_Credential_Text|
+------------------------+
|                    M.D.|
|                      MD|
|                       X|
|                    M.D.|
|                    M.D.|
|                       X|
|                      MD|
|                  MA-CCC|
|                      MD|
|                   M. D.|
+------------------------+
only showing top 10 rows



In [138]:
import re

In [169]:
def formatting(x):
    """Normalise a credential string by removing periods and whitespace.

    For example ``'M.D.'`` and ``'M. D.'`` both become ``'MD'``.

    Args:
        x: The raw credential text, or ``None`` for a missing value.

    Returns:
        The cleaned string, or ``None`` when ``x`` is ``None`` (a null column
        value reaches a Python UDF as ``None``; it is passed through instead
        of raising a TypeError).
    """
    if x is None:
        return None
    # One raw-string pattern covers both periods and whitespace. Once all
    # whitespace is gone no 'M D' sequence can remain, so the former
    # replace('M D', 'MD') step was a no-op and has been dropped.
    return re.sub(r'[.\s]', '', x)

# Wrap `formatting` as a Spark UDF whose declared return type is a string.
format_udf = udf(formatting, StringType())
# print(formatting('hey. this is john. . . .'))

In [170]:
# df = df.withColumn('test', format_udf(col('Provider_Credential_Text')))
# df.show(5)
# test = df.withColumn('new', regexp_replace(df.Provider_Credential_Text, '\.', ''))
# test.select('Provider_Credential_Text','new').show(10)

# Add the cleaned credential as column 'new' and show it next to the raw value.
credential_col = df['Provider_Credential_Text']
test = df.withColumn('new', format_udf(credential_col))
test.select(['Provider_Credential_Text', 'new']).show(10)

+------------------------+------+
|Provider_Credential_Text|   new|
+------------------------+------+
|                    M.D.|    MD|
|                      MD|    MD|
|                       X|     X|
|                    M.D.|    MD|
|                    M.D.|    MD|
|                       X|     X|
|                      MD|    MD|
|                  MA-CCC|MA-CCC|
|                      MD|    MD|
|                   M. D.|    MD|
+------------------------+------+
only showing top 10 rows



### Spark pipeline to get feature vector

In [None]:
# Accumulators for the pipeline stages and for the names of the encoded columns.
stages = []
feature_cols = []

In [None]:
# Build one StringIndexer -> OneHotEncoder pair per categorical column, then
# assemble all one-hot vectors into a single 'features' column.
# NOTE(review): `my_cols` is not defined in the visible part of this notebook;
# it must hold the list of categorical column names before this cell runs.
# The lists are (re)initialised here so that re-running the cell does not
# append duplicate stages.
stages, feature_cols = [], []
# The loop variable is deliberately not called `col`: that name would rebind
# the `col` function brought in by `from pyspark.sql.functions import *`.
for col_name in my_cols:
    idx_col = col_name + '_idx'
    ohe_col = col_name + '_ohe'
    stages.append(StringIndexer(inputCol=col_name, outputCol=idx_col, handleInvalid='error'))
    stages.append(OneHotEncoder(dropLast=True, inputCol=idx_col, outputCol=ohe_col))
    feature_cols.append(ohe_col)
stages.append(VectorAssembler(inputCols=feature_cols, outputCol='features'))

In [None]:
# df = df.drop('Provider_Gender_Code_idx')
# df = df.drop('Provider_Gender_Code_ohe')

In [None]:
# Single-column version of the pipeline: index the gender code as a category.
# NOTE: `col` is a plain string here and rebinds the `col` function exported by
# `from pyspark.sql.functions import *`; the encoder and assembler cells below
# read this string.
col = 'Provider_Gender_Code'
stridx = StringIndexer(inputCol=col, outputCol=col+'_idx', handleInvalid='error')

In [None]:
# model = stridx.fit(df)
# df = model.transform(df)
# {i: label for i, label in enumerate(model.labels)}

In [None]:
# df.select('Gender').show()

In [None]:
# df.columns

In [None]:
# One-hot encode the indexed column (the last category is dropped).
ohe = OneHotEncoder(
    inputCol='{}_idx'.format(col),
    outputCol='{}_ohe'.format(col),
    dropLast=True,
)

In [None]:
# df = ohe.transform(df)

In [None]:
# df.columns

In [None]:
# df.select('Gender').show()
# df.select('Gender_').show()

In [None]:
# Assemble the one-hot column(s) into the single vector column 'features'.
features = ['{}_ohe'.format(col)]
va = VectorAssembler(outputCol='features', inputCols=features)

https://spark.apache.org/docs/1.6.1/ml-guide.html#example-pipeline

In [None]:
# Chain indexer -> encoder -> assembler, fit on `df`, and add the results to `df`.
gender_stages = [stridx, ohe, va]
pipeline = Pipeline(stages=gender_stages)
model = pipeline.fit(df)
df = model.transform(df)

In [None]:
# Peek at the assembled feature vectors.
df.select(['NPI', 'features']).show(5)

In [None]:
# cache processed dataframe/model
# df.persist() 
# df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
# df.unpersist()

### MinHash LSH example
https://janzhou.org/lsh/   
https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.feature.MinHashLSH  
https://github.com/apache/spark/blob/master/examples/src/main/python/ml/min_hash_lsh_example.py   
https://github.com/evancasey/spark-knn-recommender/blob/master/algorithms/itemSimilarity.py  

In [44]:
# Toy data set: six rows, each a 6-dimensional sparse vector with three
# active positions whose value is 1.0.
active_positions = [
    [0, 1, 2],
    [2, 3, 4],
    [0, 2, 4],
    [1, 3, 5],
    [2, 3, 5],
    [1, 2, 4],
]
data = [
    (row_id, Vectors.sparse(6, positions, [1.0] * len(positions)))
    for row_id, positions in enumerate(active_positions)
]
data

[(0, SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0})),
 (1, SparseVector(6, {2: 1.0, 3: 1.0, 4: 1.0})),
 (2, SparseVector(6, {0: 1.0, 2: 1.0, 4: 1.0})),
 (3, SparseVector(6, {1: 1.0, 3: 1.0, 5: 1.0})),
 (4, SparseVector(6, {2: 1.0, 3: 1.0, 5: 1.0})),
 (5, SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}))]

In [54]:
# Turn the toy rows into a DataFrame with columns "id" and "features".
example_columns = ["id", "features"]
ex = spark.createDataFrame(data, schema=example_columns)
print(type(ex))
ex.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+---+--------------------+
| id|            features|
+---+--------------------+
|  0|(6,[0,1,2],[1.0,1...|
|  1|(6,[2,3,4],[1.0,1...|
|  2|(6,[0,2,4],[1.0,1...|
|  3|(6,[1,3,5],[1.0,1...|
|  4|(6,[2,3,5],[1.0,1...|
|  5|(6,[1,2,4],[1.0,1...|
+---+--------------------+



In [46]:
# Remove any 'hashes' column left by an earlier run before adding it again.
ex = ex.drop('hashes')
# MinHash LSH with 4 hash tables and a fixed seed.
mh = MinHashLSH(numHashTables=4, seed=123, inputCol="features", outputCol="hashes")
model = mh.fit(ex)
ex = model.transform(ex)
ex.show()

+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  0|(6,[0,1,2],[1.0,1...|[[-6.7948028E8], ...|
|  1|(6,[2,3,4],[1.0,1...|[[-1.86843801E9],...|
|  2|(6,[0,2,4],[1.0,1...|[[-3.15433227E8],...|
|  3|(6,[1,3,5],[1.0,1...|[[-1.86843801E9],...|
|  4|(6,[2,3,5],[1.0,1...|[[-1.86843801E9],...|
|  5|(6,[1,2,4],[1.0,1...|[[-6.7948028E8], ...|
+---+--------------------+--------------------+



https://spark.apache.org/docs/2.1.1/ml-features.html#approximate-nearest-neighbor-search

In [47]:
# Pick the query row and collect its (id, features) pair to the driver.
# NOTE: `id` shadows Python's built-in id(); the print cell below reads this variable.
id=4
key = ex.filter('id = {}'.format(id)).select('id', 'features').collect()
# print(type(key[0][1])) # <class 'pyspark.ml.linalg.SparseVector'>
# print(key[0][1],'\n') # (6,[2,3,5],[1.0,1.0,1.0]) 

In [48]:
neighbors = 2
# Request one extra row because the query vector is itself a row of `ex`.
nn = model.approxNearestNeighbors(ex, key[0][1], neighbors+1, distCol='JaccardDistance').select('id').collect()
# Remove the query by id instead of by position: slicing off the first row
# is only correct when the query happens to come back first, which is not
# guaranteed when another row is at the same distance.
similar_ids = [n[0] for n in nn if n[0] != id][:neighbors]
print('Top {} most similar to {}:'.format(neighbors, id), similar_ids)

Top 2 most similar to 4: [3, 1]


### Now with NPI data...

In [101]:
# Load the pre-cleaned NPI table (header row present, column types inferred).
clean_csv_path = '../data/npidata_20050523-20170813_clean_dont_overwrite.csv'
cdf = spark.read.csv(clean_csv_path, inferSchema=True, header=True)

In [102]:
# Every column except the first one is treated as a feature
# (presumably the first column is the NPI identifier — verify against the CSV).
feature_cols = cdf.columns[1:]
# feature_cols = list(feature_cols.asDict().values())[1:]

In [239]:
# RDD view of the cleaned frame, marked for caching because two RDDs are derived from it below.
rdd = cdf.rdd
rdd.cache()

MapPartitionsRDD[537] at javaToPython at NativeMethodAccessorImpl.java:0

In [241]:
# Split each row into its first field and the remaining fields.
npi = rdd.map(lambda row: row[0])
features = rdd.map(lambda row: row[1:])
features.cache()

PythonRDD[538] at RDD at PythonRDD.scala:48

In [106]:
# Pack all feature columns into the single vector column 'features'.
va = VectorAssembler(outputCol='features', inputCols=feature_cols)
cdf = va.transform(cdf)
cdf.select(["NPI", "features"]).show(5)

+----------+--------------------+
|       NPI|            features|
+----------+--------------------+
|1679576722|(938,[0,2,37,362,...|
|1588667638|(938,[0,2,5,17,31...|
|1497758544|(938,[1,7,35,662]...|
|1306849450|(938,[0,2,5,51,46...|
|1215930367|(938,[0,2,5,51,23...|
+----------+--------------------+
only showing top 5 rows



In [213]:
# Remove any 'hashes' column left by an earlier run before adding it again.
cdf = cdf.drop('hashes')
# MinHash LSH with 10 hash tables and a fixed seed.
mh = MinHashLSH(numHashTables=10, seed=123, inputCol="features", outputCol="hashes")
model = mh.fit(cdf)
cdf = model.transform(cdf)
cdf.select(["NPI", "features", "hashes"]).show(5)

+----------+--------------------+--------------------+
|       NPI|            features|              hashes|
+----------+--------------------+--------------------+
|1679576722|(938,[0,2,37,362,...|[[-1.775142435E9]...|
|1588667638|(938,[0,2,5,17,31...|[[-1.775142435E9]...|
|1497758544|(938,[1,7,35,662]...|[[-6.7948028E8], ...|
|1306849450|(938,[0,2,5,51,46...|[[-1.775142435E9]...|
|1215930367|(938,[0,2,5,51,23...|[[-1.775142435E9]...|
+----------+--------------------+--------------------+
only showing top 5 rows



In [216]:
# Pick the query provider and collect its (NPI, features) pair to the driver.
# NOTE: `npi` is rebound to an int here; earlier in the notebook the same name
# refers to an RDD built from the first field of each row.
npi=1578547329
key = cdf.filter('NPI = {}'.format(npi)).select('NPI', 'features').collect()
# print(type(key[0][1])) # <class 'pyspark.ml.linalg.SparseVector'>
# print(key[0][1],'\n') # (6,[2,3,5],[1.0,1.0,1.0]) 

In [217]:
neighbors = 10
# Request one extra row because the query provider is itself a row of `cdf`.
nn = model.approxNearestNeighbors(cdf, key[0][1], neighbors+1, distCol='JaccardDistance').select('NPI').collect()
# Remove the query by NPI instead of by position. Providers with identical
# features tie with the query, so it is not always the first row returned;
# a recorded run for 1679576722 listed that NPI among its own neighbours
# after the first row had been sliced off.
similar_npis = [n[0] for n in nn if n[0] != npi][:neighbors]
print('Top {} most similar to {}:'.format(neighbors, npi), similar_npis)

Top 10 most similar to 1578547329: [1184602823, 1962408021, 1962408898, 1083675995, 1437103702, 1740243138, 1144273475, 1437113578, 1821049750, 1922082676]


In [None]:
# 10 hash tables

# Top 10 most similar to 1679576722: [1679576722, 1770586224, 1881686475, 1548219223, 1780634568, \
#                                     1194775270, 1104876366, 1336198001, 1336199561, 1376597120]

# Top 10 most similar to 1578547329: [1184602823, 1962408021, 1962408898, 1083675995, 1437103702, \
#                                     1740243138, 1144273475, 1437113578, 1821049750, 1922082676]

http://mccormickml.com/2015/06/12/minhash-tutorial-with-python-code/  
https://databricks.com/blog/2017/05/09/detecting-abuse-scale-locality-sensitive-hashing-uber-engineering.html  

In [None]:
# joining dataframes...
# df.join(code_df)

### MLlib Similarity Matrix
https://spark.apache.org/docs/2.1.1/api/python/pyspark.mllib.html#pyspark.mllib.linalg.distributed.RowMatrix

In [218]:
# Toy example: a 2x2 matrix given as a list of rows.
toy_rows = [[1, 2], [1, 5]]
row_rdd = sc.parallelize(toy_rows)
# RowMatrix(rdd, numRows=0, numCols=0)
mat = RowMatrix(row_rdd)
# Similarities between the matrix columns; show the value of the first entry.
sims = mat.columnSimilarities()
first_entry = sims.entries.first()
first_entry.value

0.9191450300180579

In [None]:
# Sanity check on the feature RDD: show the two largest elements.
# len(features.take(1)[0]) # 938
features.top(2)

In [None]:
# Build a distributed row matrix from the feature RDD. The row and column
# counts are hard-coded; 938 matches the feature-vector width shown earlier.
# NOTE(review): columnSimilarities() compares the matrix columns (features)
# with each other, not the rows (providers) — confirm that this is the
# similarity that is actually wanted here.
mat = RowMatrix(features, numRows=5205376, numCols=938)
sims = mat.columnSimilarities()
sims.entries.first().value

### UDF with jaccard metric

In [211]:
from scipy.spatial.distance import jaccard

In [206]:
def dist(x):
    """Return ``x`` lowered by 0.2.

    This appears to be a stand-in used to try out the UDF plumbing rather
    than a real distance metric.
    """
    offset = .2
    shifted = x - offset
    return shifted

# Wrap `dist` as a Spark UDF whose declared return type is a float.
# dist_udf = udf(dist, ArrayType(IntegerType()))
dist_udf = udf(dist, FloatType())

In [209]:
# Recreate column 'new' from 'Entity_1' (dropping any copy left by an earlier run).
cdf = cdf.drop('new').withColumn('new', dist_udf('Entity_1'))
cdf.select(['new']).show(5)

+----+
| new|
+----+
| 0.8|
| 0.8|
|-0.2|
| 0.8|
| 0.8|
+----+
only showing top 5 rows

