# Imports

In [1]:
from pyspark import SparkContext as sparkc
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import DataFrameReader

# Code

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [3]:
schema = StructType([
    StructField("START", StringType(), True),
    StructField("STOP", StringType(), True),
    StructField("PATIENT", StringType(), True),
    StructField("ENCOUNTER", StringType(), True),
    StructField("CODE", IntegerType(), True),
    StructField("DESCRIPTION", StringType(), True)])

df = spark.read.csv("./Assignment1/conditions.csv",header=True,schema=schema)
# df = spark.read.csv("Assignment1/conditionsample.csv",header=True,schema=schema).select('PATIENT','CODE')

df.printSchema()
df.show()

root
 |-- START: string (nullable = true)
 |-- STOP: string (nullable = true)
 |-- PATIENT: string (nullable = true)
 |-- ENCOUNTER: string (nullable = true)
 |-- CODE: integer (nullable = true)
 |-- DESCRIPTION: string (nullable = true)

+----------+----------+--------------------+--------------------+---------+--------------------+
|     START|      STOP|             PATIENT|           ENCOUNTER|     CODE|         DESCRIPTION|
+----------+----------+--------------------+--------------------+---------+--------------------+
|2017-01-14|2017-03-30|09e4e8cb-29c2-4ef...|88e540ab-a7d7-47d...| 65363002|        Otitis media|
|2012-09-15|2012-09-16|b0a03e8c-8d0f-424...|e89414dc-d0c6-478...|241929008|Acute allergic re...|
|2018-06-17|2018-06-24|09e4e8cb-29c2-4ef...|c14325b0-f7ec-431...|444814009|Viral sinusitis (...|
|2019-04-19|2019-09-26|09e4e8cb-29c2-4ef...|71af18ee-3157-408...| 65363002|        Otitis media|
|2019-04-27|2019-05-18|09e4e8cb-29c2-4ef...|411d4eae-72d1-478...|444814009|Viral s

## PATIENT - CODE

In [4]:
patients = df.select('PATIENT', 'CODE')
patients.show()

+--------------------+---------+
|             PATIENT|     CODE|
+--------------------+---------+
|09e4e8cb-29c2-4ef...| 65363002|
|b0a03e8c-8d0f-424...|241929008|
|09e4e8cb-29c2-4ef...|444814009|
|09e4e8cb-29c2-4ef...| 65363002|
|09e4e8cb-29c2-4ef...|444814009|
|09e4e8cb-29c2-4ef...| 33737001|
|b0a03e8c-8d0f-424...|444814009|
|b0a03e8c-8d0f-424...| 10509002|
|b0a03e8c-8d0f-424...|233678006|
|b0a03e8c-8d0f-424...|195662009|
|b0a03e8c-8d0f-424...|232353008|
|b0a03e8c-8d0f-424...|195662009|
|5420ae87-24c8-4ed...|446096008|
|5420ae87-24c8-4ed...|284551006|
|5420ae87-24c8-4ed...|283371005|
|5420ae87-24c8-4ed...| 72892002|
|5420ae87-24c8-4ed...|444814009|
|5420ae87-24c8-4ed...|195662009|
|bf1f30f2-27de-4b5...|162864005|
|bf1f30f2-27de-4b5...|283385000|
+--------------------+---------+
only showing top 20 rows



## CODE - DESCRIPTION (DISTINCT)

In [5]:
diagnosis = df.select('CODE', 'DESCRIPTION').distinct()
diagnosis.show()

+---------+--------------------+
|     CODE|         DESCRIPTION|
+---------+--------------------+
|446096008|Perennial allergi...|
| 88805009|Chronic congestiv...|
| 65363002|        Otitis media|
| 79586000|     Tubal pregnancy|
| 95417003|Primary fibromyal...|
| 74400008|        Appendicitis|
|262574004|        Bullet wound|
|236077008| Protracted diarrhea|
| 44054006|            Diabetes|
|443165006|Pathological frac...|
|444814009|Viral sinusitis (...|
|399211009|History of myocar...|
|241929008|Acute allergic re...|
|110030002|Concussion injury...|
|200936003| Lupus erythematosus|
|444470001|Injury of anterio...|
|307731004|Injury of tendon ...|
|196416002|     Impacted molars|
| 43878008|Streptococcal sor...|
|424132000|Non-small cell ca...|
+---------+--------------------+
only showing top 20 rows



## Functions

In [24]:
from itertools import islice

def take(n, iterable):
    return list(islice(iterable, n))

### Definir Itemset PATIENT-CODEs

In [6]:
codeGroupedByPatients = df.rdd.map(lambda x: (x.PATIENT, x.CODE)).groupByKey().mapValues(set)
codeGroupedByPatients.take(10)
# codeGroupedByPatients.collect()

[('28a3cdb7-1db1-4148-8280-8a4e5b4f99e0',
  {19169002, 72892002, 156073000, 284551006}),
 ('3826037f-19e0-4c7b-98e5-4e9578472f67',
  {24079001, 55822004, 65966004, 162864005}),
 ('e32e0069-2d3f-4b7b-b420-3269c94723ad', {16114001, 162864005, 195662009}),
 (None, {None}),
 ('887ad9bb-bd72-44cf-8e5e-8aff7fbdeed4',
  {40275004, 44465007, 72892002, 195662009, 444814009}),
 ('8e763f75-614b-4ef7-aa86-ce459dd3142e',
  {10509002, 70704007, 128613002, 195662009, 703151001}),
 ('8b0755cd-54d4-48e2-a163-4bf04e47f2f2',
  {15777000,
   36971009,
   38822007,
   40055000,
   59621000,
   195662009,
   271737000,
   444814009,
   446096008}),
 ('2593819d-f0ff-470b-95da-656e8340255c',
  {10509002,
   19169002,
   35999006,
   72892002,
   195662009,
   198992004,
   232353008,
   398254007,
   444814009}),
 ('de087296-4f63-40b4-94f0-fc0dd91df200',
  {10509002, 162864005, 195662009, 408512008, 444814009}),
 ('4a181f3d-0937-466a-a503-d449aea0dbfa', {70704007, 75498004})]

# Frequent Items Table

In [75]:
def frequent_items_table(itemCounts):
    frequentItems = list()
    supportThreshold = 1000
    for key, value in itemCounts.items():
        if key is None:
            continue
        if value < supportThreshold:
            frequentItems.append(key)
    return frequentItems

## Confidence - Interest

In [88]:
# def confidenceInterest(pairs, occurrences):
#     supportThreshold = 1000
# #     print("PAIR ", pairs)
#     for key, value in pairs.items():
#         print("PAIR ", key, " VALUE", value)
#         confidence = supportThreshold/value
# #         for item in pair:
# #         newItem = str(key.split(', '))
#         item = str(key)
#         newItem = item.split(', ')
# #         if confidence >= 0.9 and str(pair.key() + newItem[2]) in pairs:
#         if confidence >= 0.9 and str(key + newItem[2]) in pairs.items():
#             interest = confidence - occurrences[newItem[2]]/pairs.count()
#             if interest >= 0.9:
#                 print("Confidence: ", confidence, " Interest: ", interest)

In [106]:
def confidenceInterest(pairs, newItem, occurrences):
    supportThreshold = 1000
    for key, value in pairs.items():
        confidence = supportThreshold/value
        interest = confidence - occurrences[newItem]/len(pairs)
        if confidence >= 0.9 and interest >= 0.9:
#             print("Confidence: ", confidence, " Interest: ", interest)
            result = "%s - %s --- Confidence: %f Interest: %f" % (key, newItem, confidence, interest)
        print(result)

# A-Priori

In [90]:
from collections import defaultdict

def apriori(codePatients):
#     codePatient = codePatients.collect()
    codePatient = codePatients.take(500)
    item_counts, pair_counts = defaultdict(int),  defaultdict(int)
    for basket in codePatient:
        for item in basket[1]:
                item_counts[item] += 1
    return item_counts    

In [91]:
import collections
from operator import itemgetter

def aprioriSecondPass(codePatients, freqItems):
#     codePatient = codePatients.collect()
    codePatient = codePatients.take(500)
    pair_counts = defaultdict(int)
    for basket in codePatient:
        for i in basket[1]:
            if i not in freqItems:
                continue
            for j in basket[1]:
                if j in freqItems:
                    pair_counts[i, j] += 1
#     return take(10, sorted(pair_counts.items(), key=itemgetter(1), reverse=True))
    return pair_counts

In [95]:
def aprioriThirdPass(codePatients, freqItems, pairs, occurrences):
#     codePatient = codePatients.collect()
    codePatient = codePatients.take(500)
    triple_counts = defaultdict(int)
    for basket in codePatient:
        for i in basket[1]:
            if i not in freqItems:
                continue
            for j in basket[1]:
                if j not in freqItems:
                    continue
                for z in basket[1]:
                    if z in freqItems:
                        triple_counts[i, j, z] += 1
                    confidenceInterest(pairs, z, occurrences)
    return take(10, sorted(triple_counts.items(), key=itemgetter(1), reverse=True))

In [11]:
result = apriori(codeGroupedByPatients)
print(result.items())

dict_items([(156073000, 16), (72892002, 97), (19169002, 79), (284551006, 19), (24079001, 8), (55822004, 62), (65966004, 21), (162864005, 149), (16114001, 8), (195662009, 255), (None, 1), (44465007, 41), (444814009, 313), (40275004, 6), (70704007, 17), (128613002, 21), (10509002, 182), (703151001, 21), (36971009, 23), (446096008, 10), (15777000, 147), (59621000, 130), (271737000, 159), (38822007, 10), (40055000, 109), (198992004, 9), (232353008, 7), (398254007, 7), (35999006, 10), (408512008, 3), (75498004, 26), (239872002, 9), (263102004, 16), (230690007, 22), (126906006, 20), (92691004, 20), (283385000, 13), (84757009, 12), (65363002, 53), (43878008, 63), (449868002, 11), (90560007, 10), (87433001, 11), (82423001, 20), (83664006, 3), (233604007, 3), (230265002, 5), (399211009, 11), (68496003, 43), (53741008, 31), (713197008, 15), (363406005, 5), (22298006, 11), (58150001, 17), (370143000, 4), (428251008, 26), (74400008, 26), (185086009, 3), (370247008, 12), (307731004, 5), (1734006, 1

In [12]:
freq_items = frequent_items_table(result)
print(freq_items)

[156073000, 72892002, 19169002, 284551006, 24079001, 55822004, 65966004, 162864005, 16114001, 195662009, 44465007, 444814009, 40275004, 70704007, 128613002, 10509002, 703151001, 36971009, 446096008, 15777000, 59621000, 271737000, 38822007, 40055000, 198992004, 232353008, 398254007, 35999006, 408512008, 75498004, 239872002, 263102004, 230690007, 126906006, 92691004, 283385000, 84757009, 65363002, 43878008, 449868002, 90560007, 87433001, 82423001, 83664006, 233604007, 230265002, 399211009, 68496003, 53741008, 713197008, 363406005, 22298006, 58150001, 370143000, 428251008, 74400008, 185086009, 370247008, 307731004, 1734006, 241929008, 233678006, 403190006, 192127007, 49436004, 26929004, 359817006, 443165006, 254837009, 88805009, 64859006, 422034002, 44054006, 302870006, 237602007, 367498001, 127013003, 80394007, 283371005, 431855005, 239873007, 39848009, 403192003, 196416002, 55680006, 47693006, 431856006, 5602001, 62106007, 403191005, 410429000, 429007001, 33737001, 314994000, 79586000, 

In [93]:
second_pass = aprioriSecondPass(codeGroupedByPatients, freq_items)
# print(second_pass.items())
print(second_pass)

defaultdict(<class 'int'>, {(156073000, 156073000): 16, (156073000, 72892002): 16, (156073000, 19169002): 16, (156073000, 284551006): 2, (72892002, 156073000): 16, (72892002, 72892002): 97, (72892002, 19169002): 40, (72892002, 284551006): 5, (19169002, 156073000): 16, (19169002, 72892002): 40, (19169002, 19169002): 79, (19169002, 284551006): 4, (284551006, 156073000): 2, (284551006, 72892002): 5, (284551006, 19169002): 4, (284551006, 284551006): 19, (24079001, 24079001): 8, (24079001, 55822004): 1, (24079001, 65966004): 1, (24079001, 162864005): 2, (55822004, 24079001): 1, (55822004, 55822004): 62, (55822004, 65966004): 4, (55822004, 162864005): 31, (65966004, 24079001): 1, (65966004, 55822004): 4, (65966004, 65966004): 21, (65966004, 162864005): 10, (162864005, 24079001): 2, (162864005, 55822004): 31, (162864005, 65966004): 10, (162864005, 162864005): 149, (16114001, 16114001): 8, (16114001, 195662009): 3, (16114001, 162864005): 2, (195662009, 16114001): 3, (195662009, 195662009): 255

In [107]:
third_pass = aprioriThirdPass(codeGroupedByPatients, freq_items, second_pass, result)
# print(third_pass)

(156073000, 156073000) - 156073000 --- Confidence: 62.500000 Interest: 62.496952
(156073000, 72892002) - 156073000 --- Confidence: 62.500000 Interest: 62.496952
(156073000, 19169002) - 156073000 --- Confidence: 62.500000 Interest: 62.496952
(156073000, 284551006) - 156073000 --- Confidence: 500.000000 Interest: 499.996952
(72892002, 156073000) - 156073000 --- Confidence: 62.500000 Interest: 62.496952
(72892002, 72892002) - 156073000 --- Confidence: 10.309278 Interest: 10.306230
(72892002, 19169002) - 156073000 --- Confidence: 25.000000 Interest: 24.996952
(72892002, 284551006) - 156073000 --- Confidence: 200.000000 Interest: 199.996952
(19169002, 156073000) - 156073000 --- Confidence: 62.500000 Interest: 62.496952
(19169002, 72892002) - 156073000 --- Confidence: 25.000000 Interest: 24.996952
(19169002, 19169002) - 156073000 --- Confidence: 12.658228 Interest: 12.655180
(19169002, 284551006) - 156073000 --- Confidence: 250.000000 Interest: 249.996952
(284551006, 156073000) - 156073000 -

KeyboardInterrupt: 

In [99]:
# confidenceInterest(third_pass, result)

### TESTES

In [15]:
def freq_items(itemCounts):
    frequent_items = itemCounts.list(map(lambda x: x[1] < 1000).mapValues(x[0]))
    print(frequent_items)

In [None]:
# % first pass
f o r ( eac h b a s k e t )
f o r ( eac h i t em i i n b a s k e t )
i tem_co u n t s [ i ] += 1
# % create frequent item stable
f r e q u e n t _ i t em s = f r e q u e n t _ i t em s _ t a b l e ( i tem_co u n t s )
# % second pass
f o r ( eac h i t em i i n b a s k e t )
i f i no t i n f r e q u e n t _ i t e m s : c o n t i n u e
f o r ( eac h i t em j i n b a s k e t ) %w i t h j > i
i f j i n f r e q u e n t _ i t e m s
p a i r_ c o u n t s [ i , j ] += 1

In [None]:
# import pyspark

# conf = pyspark.SparkConf()
# sc = pyspark.SparkContext(conf=conf)

In [None]:
def tuples(x):
    return set(x)

In [None]:
test = df.rdd.map(lambda x: (x.PATIENT, x.CODE)).take(2)
print(test)

# for t in test:
#     print(t)

# for row in test:
#     print("{} has {}".format(
#         row["PATIENT"],
#         row["CODE"]))

In [None]:
test2 = df.rdd.map(lambda x: (x.PATIENT, x.CODE)).groupByKey().mapValues(set)
# test2.fullOuterJoin(test2).take(5)
# print(test2)
# test2.mapValues(set)
# test2.mapValues(lambda code: code).mapValues(set).take(5)
# test2.mapValues(lambda code: code.lower()).take(4)
test2.take(5)

# pair = test2.groupByKey().take(1)
# print("%s:%s" % (test2[0], ",".join([n for n in test2[1]])))

In [None]:
'''sc = sparkc(appName="Assignemnt1-2_conditions")
textfile = sc.textFile('./Assignment1/conditionsample.csv')'''

In [None]:
'''spark.read.format('csv').options(header='true', inferSchema='true')
    .load('zipcodes.csv')'''

In [None]:
val schema = new StructType()
    .add("START",IntegerType,true)
    .add("STOP",IntegerType,true)
    .add("PATIENT",StringType,true)
    .add("ENCOUNTER",StringType,true)
    .add("CODE",StringType,true)
    .add("DESCRIPTION",StringType,true)

val df_with_schema = spark.read.format("csv")
    .option("header", "true")
    .schema(schema)
    .load("src/main/resources/zipcodes.csv")
df_with_schema.printSchema()
df_with_schema.show(false)

In [None]:
df = textfile.split(',')
type(df)

In [None]:
textfile.count()