# Discretization

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the dataset from the CSV next to the notebook and preview the first rows.
csv_path = "par_disease_copy.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [5]:
# Summary statistics per column; a `count` below the row total means the column has missing values.
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,4238.0,4238.0,4133.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,1.97895,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,1.019791,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


## Discretization glucose

In [6]:
# Look at the glucose distribution (quartiles in particular) before picking a cut point.
df["glucose"].describe()

count    3850.000000
mean       81.966753
std        23.959998
min        40.000000
25%        71.000000
50%        78.000000
75%        87.000000
max       394.000000
Name: glucose, dtype: float64

In [7]:
# Split glucose at its median: values below it are "low", values at or above it are "high".
# The median is computed once instead of once per condition.
glucose_median = df["glucose"].quantile(0.5)

# Rows whose glucose is NaN fail both comparisons and therefore receive the default.
# The default is spelled out as the string "0" (the same placeholder the column
# showed before) so that np.select never has to promote an integer default
# together with string choices, which newer NumPy versions refuse with a TypeError.
df["glucose_state"] = np.select(
    [df["glucose"] < glucose_median, df["glucose"] >= glucose_median],
    ["low", "high"],
    default="0",
)

In [10]:
# Count how many rows ended up in each glucose bucket.
df["glucose_state"].value_counts()

glucose_state
high    1984
low     1866
0        388
Name: count, dtype: int64

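The `0` entries come from rows where `glucose` is NaN: a NaN matches neither condition, so `np.select` falls back to its default, which shows up as `0`.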

In [13]:
# Remove, in place, every row that has a missing value in any column (dropna's defaults).
# This also removes the rows with missing glucose, i.e. the placeholder bucket counted above.
# NOTE(review): because df is mutated in place, re-running earlier cells without reloading
# the CSV will operate on the already-filtered frame.
df.dropna(inplace = True)

In [14]:
# Re-check the bucket counts now that incomplete rows have been dropped.
df["glucose_state"].value_counts()

glucose_state
high    1868
low     1788
Name: count, dtype: int64

## Discretization pressure measure

In [17]:
# Distribution of diaBP; its quartiles are used as cut points below.
df["diaBP"].describe()

count    3656.000000
mean       82.912062
std        11.974825
min        48.000000
25%        75.000000
50%        82.000000
75%        90.000000
max       142.500000
Name: diaBP, dtype: float64

In [15]:
# Distribution of sysBP; its quartiles are used as cut points below.
df["sysBP"].describe()

count    3656.000000
mean      132.368025
std        22.092444
min        83.500000
25%       117.000000
50%       128.000000
75%       144.000000
max       295.000000
Name: sysBP, dtype: float64

In [18]:
# Cut points for both pressure columns, computed once and given names.
sys_bp = df["sysBP"]
dia_bp = df["diaBP"]
sys_q25, sys_median, sys_q75 = (sys_bp.quantile(q) for q in (0.25, 0.5, 0.75))
dia_q25, dia_median, dia_q75 = (dia_bp.quantile(q) for q in (0.25, 0.5, 0.75))

# Boolean masks, in the same order as the labels they are paired with later.
# All comparisons are strict, so a value exactly equal to a cut point matches nothing.
condition = [
    (sys_bp < sys_q25) & (dia_bp < dia_q25),        # both in the bottom quarter
    (sys_bp > sys_q75) & (dia_bp > dia_q75),        # both in the top quarter
    (sys_bp > sys_median) & (dia_bp < dia_median),  # sysBP above median, diaBP below
    (sys_bp < sys_median) & (dia_bp > dia_median),  # sysBP below median, diaBP above
]

In [19]:
# Labels for the pressure masks, one per mask and in the same order
# (first word describes sysBP, second describes diaBP).
result = [
    "low & low",
    "high & high",
    "high & low",
    "low & high",
]

In [20]:
# Label each row with the first matching pressure condition.
# Rows that match none of the conditions get the default. It is written out as the
# string "0" (the same placeholder the column showed before) so np.select does not
# have to promote an integer default together with string labels, which newer
# NumPy versions refuse with a TypeError.
df["pressure_measure"] = np.select(condition, result, default="0")

In [22]:
# Count rows per pressure label, including the rows that matched no condition.
df["pressure_measure"].value_counts()

pressure_measure
0              1833
low & low       607
high & high     580
high & low      345
low & high      291
Name: count, dtype: int64

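There are a lot of zeros because the conditions don't cover the whole dataset: every row that matches none of the four combinations gets the default value, shown as `0`.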

## Discretization totChol

In [23]:
# Distribution of totChol; the 25% and 75% quartiles are the cut points used below.
df["totChol"].describe()

count    3656.000000
mean      236.873085
std        44.096223
min       113.000000
25%       206.000000
50%       234.000000
75%       263.250000
max       600.000000
Name: totChol, dtype: float64

In [24]:
def totChol_discretization(value, low_cut=None, high_cut=None):
    """Classify a single cholesterol value as "low", "medium" or "high".

    Parameters
    ----------
    value : number
        The cholesterol reading to classify.
    low_cut : number, optional
        Values strictly below this are "low". When omitted, the 25% quantile of
        ``df["totChol"]`` (the notebook-level frame) is used.
    high_cut : number, optional
        Values strictly above this are "high". When omitted, the 75% quantile of
        ``df["totChol"]`` is used.

    Returns
    -------
    str
        "low", "high", or "medium". Values equal to a cut point, and values that
        compare false against both cut points (such as NaN), are "medium".

    Notes
    -----
    Passing the cut points explicitly avoids recomputing the quantiles on every
    call and lets the function be used without the global ``df``.
    """
    # Fall back to the global frame only when the caller did not supply a threshold.
    if low_cut is None:
        low_cut = df["totChol"].quantile(0.25)
    if high_cut is None:
        high_cut = df["totChol"].quantile(0.75)

    if value < low_cut:
        return "low"
    if value > high_cut:
        return "high"
    return "medium"

In [25]:
# Classify every totChol value element-wise and store the label in a new column.
chol_values = df["totChol"]
df["Chol_state"] = chol_values.map(totChol_discretization)

In [28]:
# Count rows per cholesterol label.
df["Chol_state"].value_counts()

Chol_state
medium    1843
high       914
low        899
Name: count, dtype: int64