# Workflow - Sulamérica Challenge - ML


## TRATAMENTO DE DADOS

In [1]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
allergies = pd.read_csv('data/csv/allergies.csv')
patients = pd.read_csv('data/csv/patients.csv')
careplans = pd.read_csv('data/csv/careplans.csv')
conditions = pd.read_csv('data/csv/conditions.csv')
devices = pd.read_csv('data/csv/devices.csv')
encounters = pd.read_csv('data/csv/encounters.csv')
imaging_studies = pd.read_csv('data/csv/imaging_studies.csv')
immunizations = pd.read_csv('data/csv/immunizations.csv')
medications = pd.read_csv('data/csv/medications.csv')
observations = pd.read_csv('data/csv/observations.csv')
organizations = pd.read_csv('data/csv/organizations.csv')
payers = pd.read_csv('data/csv/payers.csv')
payer_transitions = pd.read_csv('data/csv/payer_transitions.csv')
procedures = pd.read_csv('data/csv/procedures.csv')
providers = pd.read_csv('data/csv/providers.csv')
supplies = pd.read_csv('data/csv/supplies.csv')

In [3]:
patients = patients.rename(columns={'Id': 'PATIENT'})
careplans = careplans.rename(columns={'Id': 'CAREPLAN_ID'})
encounters = encounters.rename(columns={'Id': 'ENCOUNTER'})
observations = observations.rename(columns={'CODE': 'OBSERVATIONS_CODE'})
observations = observations.rename(columns={'DESCRIPTION': 'OBSERVATIONS_DESCRIPTION'})
imaging_studies = imaging_studies.rename(columns={'Id': 'IMAGING_STUDIES_ID'})

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
all_data = pd.merge(
    patients,
    encounters,
    how="left",
    on=['PATIENT'],
    copy=True,
    indicator=False,
    validate=None)

In [6]:
all_data[all_data['PATIENT'].isin(['37d96a0f-ead9-7a28-1c27-42a2da0c0c97'])].head(3)

Unnamed: 0,PATIENT,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,ENCOUNTER,START,STOP,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
668,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,94853162-008b-f579-00be-f15d22fb6f7a,1991-07-12T16:50:15Z,1991-07-12T17:05:15Z,2c5d106d-836c-35fe-96c1-36797e2319ac,8cb795bb-fc03-3237-b02f-14c01f71ef32,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,162673000,General examination of patient (procedure),129.16,129.16,0.0,,
669,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,21475337-cc5f-0450-0a7c-c01d6046cf8e,1993-10-08T16:50:15Z,1993-10-08T17:05:15Z,c44f361c-2efb-3050-8f97-0354a12e2920,d94bbaea-9355-379d-8978-e3cb17ae0d40,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,outpatient,185349003,Encounter for check up (procedure),77.49,77.49,0.0,,
670,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,3863a43e-6037-a683-3150-53c8775410be,1993-10-08T16:50:15Z,1993-10-08T17:05:15Z,226098a2-6a40-3588-b5bb-db56c3a30a04,59ae271f-d96e-3768-8b93-a3953f55451a,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,185347001,Encounter for problem,77.49,77.49,0.0,271737000.0,Anemia (disorder)


In [7]:
procedures[procedures['PATIENT'].isin(['37d96a0f-ead9-7a28-1c27-42a2da0c0c97'])].head(3)

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
756,2012-05-18T16:50:15Z,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,affa320f-bed0-abe1-799c-fe25a24f750a,430193006,Medication Reconciliation (procedure),639.85,,
757,2014-01-03T16:50:15Z,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,c836be3a-1bfe-94cf-9700-ac6543ab53c8,430193006,Medication Reconciliation (procedure),700.43,,
758,2014-05-23T16:50:15Z,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,27595d37-7cf7-4f5b-4c5b-b82209b61c16,430193006,Medication Reconciliation (procedure),939.37,,


In [8]:
all_data = pd.merge(
    all_data,
    procedures,
    how="left",
    on=['PATIENT', 'ENCOUNTER'],
    copy=True,
    indicator=False,
    validate=None)

In [9]:
all_data[all_data['PATIENT'].isin(['37d96a0f-ead9-7a28-1c27-42a2da0c0c97'])].head(3)

Unnamed: 0,PATIENT,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,MAIDEN,MARITAL,RACE,ETHNICITY,GENDER,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,ENCOUNTER,START,STOP,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE_x,DESCRIPTION_x,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE_x,REASONDESCRIPTION_x,DATE,CODE_y,DESCRIPTION_y,BASE_COST,REASONCODE_y,REASONDESCRIPTION_y
1140,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,94853162-008b-f579-00be-f15d22fb6f7a,1991-07-12T16:50:15Z,1991-07-12T17:05:15Z,2c5d106d-836c-35fe-96c1-36797e2319ac,8cb795bb-fc03-3237-b02f-14c01f71ef32,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,162673000,General examination of patient (procedure),129.16,129.16,0.0,,,,,,,,
1141,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,21475337-cc5f-0450-0a7c-c01d6046cf8e,1993-10-08T16:50:15Z,1993-10-08T17:05:15Z,c44f361c-2efb-3050-8f97-0354a12e2920,d94bbaea-9355-379d-8978-e3cb17ae0d40,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,outpatient,185349003,Encounter for check up (procedure),77.49,77.49,0.0,,,,,,,,
1142,37d96a0f-ead9-7a28-1c27-42a2da0c0c97,1972-05-12,,999-60-5458,S99954850,X44639653X,Mr.,Kirby843,Lueilwitz711,,,M,white,nonhispanic,M,Holbrook Massachusetts US,655 Reichert Estate Apt 2,Whitman,Massachusetts,Plymouth County,,42.047835,-70.955601,28973.06,294.96,3863a43e-6037-a683-3150-53c8775410be,1993-10-08T16:50:15Z,1993-10-08T17:05:15Z,226098a2-6a40-3588-b5bb-db56c3a30a04,59ae271f-d96e-3768-8b93-a3953f55451a,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,ambulatory,185347001,Encounter for problem,77.49,77.49,0.0,271737000.0,Anemia (disorder),,,,,,


In [10]:
observations.head(3)

Unnamed: 0,DATE,PATIENT,ENCOUNTER,OBSERVATIONS_CODE,OBSERVATIONS_DESCRIPTION,VALUE,UNITS,TYPE
0,2011-04-17T07:18:08Z,a3795ec8-54f3-e99e-a4b1-4c067f3141d7,cdfce94c-98f2-5331-6868-b86da30ee994,8302-2,Body Height,122.2,cm,numeric
1,2011-03-29T01:52:13Z,a074203a-4773-9330-fc6a-06307ed6b3d7,76e82eab-c1ce-1677-9f8c-0bb5c07d3328,8302-2,Body Height,166.9,cm,numeric
2,2011-04-17T07:18:08Z,a3795ec8-54f3-e99e-a4b1-4c067f3141d7,cdfce94c-98f2-5331-6868-b86da30ee994,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,2.0,{score},numeric


In [11]:
all_data = pd.merge(
    all_data,
    allergies,
    how="left",
    on=['PATIENT', 'ENCOUNTER'],
    copy=True,
    indicator=False,
    validate=None)

In [12]:
all_data = pd.merge(
    all_data,
    medications,
    how="left",
    on=['PATIENT', 'ENCOUNTER'],
    copy=True,
    indicator=False,
    validate=None)

In [13]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

In [14]:
columns_bug = []

In [15]:
for i in all_data.columns:
    try:
        all_data[i] = labelencoder.fit_transform(all_data[i].astype(str))
    except:
        if i not in columns_bug:
            all_data.drop(i, axis=1, inplace=True)
            columns_bug.append(i)
        else:
            continue

## CRIANDO O MODEL

In [16]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [18]:
# Create X (features matrix or data or feature variables)
x = all_data.drop('REASONDESCRIPTION', axis=1)

# Create Y (labels)
y = all_data['REASONDESCRIPTION']

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [20]:
clf.fit(x_train, y_train)

RandomForestClassifier()

In [21]:
clf.score(x_train, y_train)

1.0

In [22]:
clf.score(x_test, y_test)

0.9995673180933151

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [25]:
y_preds = clf.predict(x_test)

In [26]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.99      1.00      1.00       542
           2       1.00      1.00      1.00      1274
           3       1.00      1.00      1.00      1252
           4       1.00      0.88      0.93         8
           5       1.00      0.89      0.94        53
           6       1.00      1.00      1.00       860
           7       0.99      0.99      0.99       202
           8       1.00      0.93      0.97        15
           9       1.00      1.00      1.00       130
          10       1.00      1.00      1.00       386
          11       1.00      0.93      0.97        15
          12       0.99      1.00      1.00      4601
          13       1.00      1.00      1.00       756
          14       1.00      0.98      0.99       490
          15       0.91      0.95      0.93        21
          16       1.00      1.00      1.00         2
          17       1.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
accuracy_score(y_test, y_preds)

0.9995673180933151

# SALVAR MODEL

In [28]:
# SALVAR MODEL
import pickle

pickle.dump(clf, open('random_forest_model_1.pk1', 'wb'))