# Download	the	Data

In [1]:
# Importar pacote de leitura de dados
import pandas as pd
#pd.set_option("display.max_columns", 200)

## From Heart Disease

In [2]:
# Ler o Dataset
df = pd.read_csv("https://raw.githubusercontent.com/laderast/cvdNight1/master/data/fullPatientData.csv")

In [3]:
df.head()

Unnamed: 0,patientID,age,htn,treat,smoking,race,t2d,gender,numAge,bmi,tchol,sbp,cvd
0,HHUID00519967,70-90,N,N,N,White,N,M,83,17,156,113,N
1,HHUID00379006,40-55,N,N,N,White,N,M,44,23,167,126,N
2,HHUID00357476,55-70,Y,Y,N,White,N,M,62,16,214,175,N
3,HHUID00862369,40-55,N,N,N,White,N,M,42,22,158,102,N
4,HHUID00481496,55-70,N,N,N,White,N,M,63,24,244,132,N


In [4]:
df.shape

(446203, 13)

## Separate Train/Test

In [5]:
cvd_train = df.sample(n=10000, random_state=42)
cvd_test = df.sample(n=2000, random_state=0)

In [6]:
cvd_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 357685 to 41414
Data columns (total 13 columns):
patientID    10000 non-null object
age          10000 non-null object
htn          10000 non-null object
treat        10000 non-null object
smoking      10000 non-null object
race         10000 non-null object
t2d          10000 non-null object
gender       10000 non-null object
numAge       10000 non-null int64
bmi          10000 non-null int64
tchol        10000 non-null int64
sbp          10000 non-null int64
cvd          10000 non-null object
dtypes: int64(4), object(9)
memory usage: 1.1+ MB


In [7]:
cvd_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 36027 to 265658
Data columns (total 13 columns):
patientID    2000 non-null object
age          2000 non-null object
htn          2000 non-null object
treat        2000 non-null object
smoking      2000 non-null object
race         2000 non-null object
t2d          2000 non-null object
gender       2000 non-null object
numAge       2000 non-null int64
bmi          2000 non-null int64
tchol        2000 non-null int64
sbp          2000 non-null int64
cvd          2000 non-null object
dtypes: int64(4), object(9)
memory usage: 218.8+ KB


## Dataset information

- 'patientID' = Patient Identifier
- 'age' = Patient Age Category
- 'htn' = Patient Hypertension
- 'treat' = Hypertension Treatment
- 'smoking' = Smoking Status
- 'race' = Race
- 't2d' = Patient Gender
- 'gender' = Type 2 Diabetes Status
- 'numAge' = Numerical Age
- 'bmi' = Body Mass Index
- 'tchol' = Total Cholesterol
- 'sbp' = Systolic Blood Pressure
- 'cvd' = Cardiovascular Disease Status

## Classification model

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
modelo = RandomForestClassifier(n_estimators=100, n_jobs=1, random_state=0)

### Prepare data

In [10]:
cvd_train.head()

Unnamed: 0,patientID,age,htn,treat,smoking,race,t2d,gender,numAge,bmi,tchol,sbp,cvd
357685,HHUID00563164,40-55,N,N,N,Asian/PI,N,F,45,21,227,115,N
331129,HHUID00274908,20-40,N,N,N,White,N,F,36,19,193,108,N
89436,HHUID00039383,20-40,N,N,N,White,N,M,24,16,193,110,N
132346,HHUID00597428,70-90,N,N,Y,White,N,M,70,15,156,124,N
102706,HHUID00426556,40-55,N,N,N,White,N,M,49,24,210,108,N


## Transform Train Data

In [11]:
def transform_form(valor):
  if valor == 'Y':
    return 1
  else:
    return 0  

In [12]:
def transform_gender(valor):
  if valor == 'F':
    return 1
  else:
    return 0  

In [13]:
cvd_train['htn_bin'] = cvd_train['htn'].map(transform_form) 
cvd_train['treat_bin'] = cvd_train['treat'].map(transform_form) 
cvd_train['smoking_bin'] = cvd_train['smoking'].map(transform_form) 
cvd_train['t2d_bin'] = cvd_train['t2d'].map(transform_form) 
cvd_train['cvd_bin'] = cvd_train['cvd'].map(transform_form) 
cvd_train['sexo_bin'] = cvd_train['gender'].map(transform_gender) 

### After transformation

In [14]:
cvd_train.head()

Unnamed: 0,patientID,age,htn,treat,smoking,race,t2d,gender,numAge,bmi,tchol,sbp,cvd,htn_bin,treat_bin,smoking_bin,t2d_bin,cvd_bin,sexo_bin
357685,HHUID00563164,40-55,N,N,N,Asian/PI,N,F,45,21,227,115,N,0,0,0,0,0,1
331129,HHUID00274908,20-40,N,N,N,White,N,F,36,19,193,108,N,0,0,0,0,0,1
89436,HHUID00039383,20-40,N,N,N,White,N,M,24,16,193,110,N,0,0,0,0,0,0
132346,HHUID00597428,70-90,N,N,Y,White,N,M,70,15,156,124,N,0,0,1,0,0,0
102706,HHUID00426556,40-55,N,N,N,White,N,M,49,24,210,108,N,0,0,0,0,0,0


## Variables to run in model

In [15]:
variaveis = ['htn_bin', 'treat_bin', 'smoking_bin', 't2d_bin', 'sexo_bin']

In [16]:
X = cvd_train[variaveis]
y = cvd_train['cvd_bin']

In [17]:
X.head()

Unnamed: 0,htn_bin,treat_bin,smoking_bin,t2d_bin,sexo_bin
357685,0,0,0,0,1
331129,0,0,0,0,1
89436,0,0,0,0,0
132346,0,0,1,0,0
102706,0,0,0,0,0


In [18]:
y.head()

357685    0
331129    0
89436     0
132346    0
102706    0
Name: cvd_bin, dtype: int64

### Train model

In [19]:
modelo.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

## Transform Test Data

In [20]:
cvd_test['htn_bin'] = cvd_test['htn'].map(transform_form) 
cvd_test['treat_bin'] = cvd_test['treat'].map(transform_form) 
cvd_test['smoking_bin'] = cvd_test['smoking'].map(transform_form) 
cvd_test['t2d_bin'] = cvd_test['t2d'].map(transform_form) 
cvd_test['cvd_bin'] = cvd_test['cvd'].map(transform_form) 
cvd_test['sexo_bin'] = cvd_test['gender'].map(transform_gender) 

### Preview data

In [21]:
X_prev = cvd_test[variaveis]
y_prev = cvd_test['cvd_bin']

### Predict

In [22]:
p = modelo.predict(X_prev) #ou X_test

In [23]:
p

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## Predict_proba

In [24]:
pb = modelo.predict_proba(X_prev) #ou X_test

In [25]:
pb

array([[0.92148037, 0.07851963],
       [0.97779744, 0.02220256],
       [0.95413804, 0.04586196],
       ...,
       [0.97779744, 0.02220256],
       [0.77196666, 0.22803334],
       [0.86928422, 0.13071578]])

## In Excel

In [26]:
sub = pd.Series(p, index=cvd_test['patientID'], name='cvd_bin')

In [27]:
sub.to_csv('primeiro.csv', header=True)

In [28]:
#!head -n10 primeiro.csv
sub.head()

patientID
HHUID00209262    0
HHUID00618715    0
HHUID00077486    0
HHUID00791386    0
HHUID00468893    0
Name: cvd_bin, dtype: int64

### Score

In [29]:
modelo.score(X_prev, y_prev)

0.91

### Evaluate

In [30]:
# Avaliar o classificador com base no target teste
from sklearn.metrics import classification_report
print(classification_report(y_prev, p))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1821
           1       0.40      0.01      0.02       179

    accuracy                           0.91      2000
   macro avg       0.66      0.50      0.49      2000
weighted avg       0.87      0.91      0.87      2000



In [32]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_prev, p))

0.504762868949776


## Numerical Data for Regression

In [None]:
cvd_num = ['numAge', 'bmi', 'tchol', 'sbp']

In [35]:
X = cvd_train[['sbp', 'bmi', 'tchol']]
y = cvd_train[['numAge']]

## Regression Model

In [43]:
from sklearn.linear_model import LinearRegression
modelo = LinearRegression()

### Run model

In [44]:
# Train the model 
modelo.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Evaluate model

In [45]:
modelo.score(X, y)

0.10301545857749117

In [46]:
modelo.coef_

array([[ 0.2233875 , -0.04266257, -0.00062459]])

In [47]:
modelo.intercept_

array([15.15103195])

In [49]:
X_new = [[45]] 
print(modelo.predict(X_new))

ValueError: shapes (1,1) and (3,1) not aligned: 1 (dim 1) != 3 (dim 0)