# Overview
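* Prepare the diabetes survey data for modelling: scale numeric columns, encode categorical columns, split into train / test / validation sets, and save the arrays as `.npy` files.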

# Dependencies

In [19]:
# data
import numpy as np
import pandas as pd
import pyarrow as pa

# preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# training
from sklearn.model_selection import train_test_split

# Get Original Data

In [2]:
# Load the raw survey table and preview its first rows.
path_parquet = "./data/diabetes.parquet"
df_orig = pd.read_parquet(path_parquet)
df_orig.head()

Unnamed: 0,income,race,state,age,sex,height,weight,general_health,doctor,medical_costs,checkup,exercise,marital,education,smoking,alcohol,diabetes,BMI,year
0,10k-15k,white,AL,65-69,female,63.0,263.0,good,yes,no,1 year,no,single,12/ged,no,0,no,obese class III,2006
1,<10k,white,AL,55-59,male,75.0,290.0,poor,yes,yes,2 years,no,married,1-8,no,0,no,obese class II,2006
2,35k-50k,black,AL,40-44,male,71.0,230.0,very good,yes,no,1 year,yes,married,12/ged,no,0,no,obese class I,2006
3,10k-15k,black,AL,35-39,male,75.0,320.0,very good,no,yes,5 years,yes,single,12/ged,no,0,no,obese class II,2006
4,50k-75k,white,AL,50-54,female,64.0,120.0,excellent,yes,no,2 years,yes,married,cg,no,2,no,normal,2006


In [3]:
# Number of rows in the raw table.
len(df_orig)

5941780

# Preprocessing
* Prepping Data for Logistic Regression<br>
https://medium.com/@veramiler/the-logistic-regression-in-python-how-to-prepare-a-data-and-find-the-best-model-a85a6563cf96
* Yes/No to 1/0<br>
https://stackoverflow.com/questions/40901770/is-there-a-simple-way-to-change-a-column-of-yes-no-to-1-0-in-a-pandas-dataframe

## Scale

In [4]:
# Min-max scale the numeric columns (MinMaxScaler's default range is [0, 1]).
# NOTE(review): the scaler is fitted on every row of df_orig, i.e. before the
# train/test/validation split further down, so the min/max it learns include
# rows that later land in the held-out sets.
cols_numeric = ['height', 'weight', 'alcohol']
df_scaled = df_orig[cols_numeric]
scaler = MinMaxScaler()
scaler.fit(df_scaled)
arr_scaled = scaler.transform(df_scaled)

## Yes = 1, No = 0

In [5]:
# Binary survey answers -> 1 for "yes", 0 for anything else
# (any value that is not exactly "yes" becomes 0).
cols_yesno = ['doctor', 'medical_costs', 'exercise', 'smoking']
arr_yesno = np.where(df_orig[cols_yesno].to_numpy() == "yes", 1, 0)

In [6]:
# Target vector: 1 where the diabetes column is "yes", 0 for any other value.
arr_diabetes_actual = np.where(df_orig['diabetes'].to_numpy() == "yes", 1, 0)

In [7]:
# Sanity check: the target should be 1-D with one entry per row of df_orig.
arr_diabetes_actual.shape

(5941780,)

In [8]:
# Peek at the encoded target (numpy abbreviates the repr of large arrays).
arr_diabetes_actual

array([0, 0, 0, ..., 0, 0, 1])

## Label Encode

In [9]:
# df_orig['sex'].unique()

In [10]:
# df_orig.groupby(['year', 'sex']).size()

In [11]:
# dicts
# Ordinal lookup tables for the ordered categorical columns. Each label list
# is written in ascending order and a label's code is its position in the list.

dict_inc = {label: code for code, label in enumerate([
    '<10k', '10k-15k', '15k-20k', '20k-25k',
    '25k-35k', '35k-50k', '50k-75k', '>75k',
])}

dict_age = {label: code for code, label in enumerate([
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80+',
])}

dict_gh = {label: code for code, label in enumerate([
    'poor', 'fair', 'good', 'very good', 'excellent',
])}

# Time since last checkup: codes start at 1 rather than 0.
dict_cu = {label: code for code, label in enumerate(
    ['1 year', '2 years', '3 years', '4 years', '5 years', '>5 years'],
    start=1,
)}
# 'never' and 'unknown' share a single code far outside the 1-6 range.
# NOTE(review): 100 looks like a sentinel; confirm it is the intended value
# for an ordinal model input.
dict_cu.update({'never': 100, 'unknown': 100})

dict_bmi = {label: code for code, label in enumerate([
    'very severely underweight', 'severely underweight', 'underweight',
    'normal', 'overweight',
    'obese class I', 'obese class II', 'obese class III',
])}

dict_sex = dict(female=0, male=1)

dict_ed = {label: code for code, label in enumerate([
    'none', '1-8', '9-11', '12/ged', 'c1-3', 'cg',
])}

In [12]:
# replace
# Column name -> ordinal lookup table used to encode that column.
dict_cols = {
    'income': dict_inc,
    'age': dict_age,
    'general_health': dict_gh,
    'checkup': dict_cu,
    'BMI': dict_bmi,
    'sex': dict_sex,
    'education': dict_ed
}

# One integer array per column, in dict_cols order. Series.map performs the
# lookup in vectorised form instead of one Python-level dict access per row.
ls_replaced = []
for col, dict_map in dict_cols.items():
    mapped = df_orig[col].map(dict_map)
    # map() yields NaN for labels missing from the table; fail loudly with a
    # KeyError, as a plain dict lookup would.
    missing = mapped.isna()
    if missing.any():
        raise KeyError(
            f"unmapped categories in column {col!r}: "
            f"{df_orig.loc[missing, col].unique().tolist()}"
        )
    ls_replaced.append(mapped.to_numpy(dtype=int))

In [13]:
# Stack the per-column code sequences and flip to (rows, columns) layout.
arr_cat = np.asarray(ls_replaced).T

## One Hot Encode

In [14]:
# Dummy-encode the unordered categorical columns; drop_first=True removes the
# first level of each column so the remaining indicators are not redundant.
cols_nominal = ['race', 'state', 'marital']
df_ohe = pd.get_dummies(df_orig[cols_nominal], drop_first=True)
arr_ohe = df_ohe.to_numpy()

## Combined

In [15]:
# Assemble the full feature matrix by joining the four 2-D feature groups
# column-wise: scaled numerics, yes/no flags, ordinal codes, dummy indicators.
feature_groups = (arr_scaled, arr_yesno, arr_cat, arr_ohe)
arr_diabetes = np.hstack(feature_groups)

In [16]:
# Sanity check: (number of rows, total number of feature columns).
arr_diabetes.shape

(5941780, 78)

# Split

In [20]:
%%time
x_train, x_tv, y_train, y_tv = train_test_split(arr_diabetes, arr_diabetes_actual, test_size=0.2, random_state=0)
x_test, x_validate, y_test, y_validate = train_test_split(x_tv, y_tv, test_size=0.5, random_state=0)

Wall time: 49.5 s


# Save

## npy

In [21]:
# Output file name -> array to be written under that name.
dict_save_npy = dict([
    ("arr_diabetes.npy", arr_diabetes),  # 3.45 GB
    ("arr_diabetes_actual.npy", arr_diabetes_actual),
    ("x_train.npy", x_train),
    ("y_train.npy", y_train),
    ("x_test.npy", x_test),
    ("y_test.npy", y_test),
    ("x_validate.npy", x_validate),
    ("y_validate.npy", y_validate),
])

In [22]:
# Write every array in dict_save_npy to output_dir in .npy format.
output_dir = "./data/"
for filename, arr in dict_save_npy.items():
    # Given a path, np.save opens and closes the file itself; the names
    # already end in ".npy", so no extra suffix is appended.
    np.save(output_dir + filename, arr)

## parquet

In [None]:
# dict_data = {}
# for col in range(arr_diabetes.shape[1]):
#     dict_data[col] = arr_diabetes[:, col]

In [None]:
# pa_table = pa.table(dict_data)
# pa.parquet.write_table(pa_table, "arr_diabetes.parquet")