# Import the libraries

In [44]:
import os
import sys
sys.path.append("../")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [45]:
# Load data

data = pd.read_csv('../data/raw/smoking_driking_dataset_Ver01.csv')

In [46]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,991346.0,47.614491,14.181339,20.0,35.0,45.0,60.0,85.0
height,991346.0,162.240625,9.282957,130.0,155.0,160.0,170.0,190.0
weight,991346.0,63.28405,12.514241,25.0,55.0,60.0,70.0,140.0
waistline,991346.0,81.233358,11.850323,8.0,74.1,81.0,87.8,999.0
sight_left,991346.0,0.980834,0.605949,0.1,0.7,1.0,1.2,9.9
sight_right,991346.0,0.978429,0.604774,0.1,0.7,1.0,1.2,9.9
hear_left,991346.0,1.031495,0.17465,1.0,1.0,1.0,1.0,2.0
hear_right,991346.0,1.030476,0.171892,1.0,1.0,1.0,1.0,2.0
SBP,991346.0,122.432498,14.543148,67.0,112.0,120.0,131.0,273.0
DBP,991346.0,76.052627,9.889365,32.0,70.0,76.0,82.0,185.0


In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991346 entries, 0 to 991345
Data columns (total 24 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   sex               991346 non-null  object 
 1   age               991346 non-null  int64  
 2   height            991346 non-null  int64  
 3   weight            991346 non-null  int64  
 4   waistline         991346 non-null  float64
 5   sight_left        991346 non-null  float64
 6   sight_right       991346 non-null  float64
 7   hear_left         991346 non-null  float64
 8   hear_right        991346 non-null  float64
 9   SBP               991346 non-null  float64
 10  DBP               991346 non-null  float64
 11  BLDS              991346 non-null  float64
 12  tot_chole         991346 non-null  float64
 13  HDL_chole         991346 non-null  float64
 14  LDL_chole         991346 non-null  float64
 15  triglyceride      991346 non-null  float64
 16  hemoglobin        99

In [48]:
#Rename columns

new_name = {
    'SBP': 'systolic_blood_pressure',
    'DBP': 'diastolic_blood_pressure',
    'BLDS': 'glucose_fasting',
    'tot_chole': 'total_cholesterol',
    'HDL_chole': 'HDL',
    'LDL_chole': 'LDL',
    'SGOT_AST': 'AST',
    'SGOT_ALT': 'ALT',
    'gamma_GTP': 'GGT',
    'DRK_YN': 'drinker',
    'SMK_stat_type_cd': 'smoker_status',
}

data.rename(columns=new_name, inplace=True)

# Inconsistency data:
## Variables with inconsistent data:
- Waistline : values inconsistent.
- Sight_left: range evaluation is 0.1 to 2.0, but there are values greater than 2.0.
- Sight_right: range evaluation is 0.1 to 2.0, but there are values greater than 2.0.
- Glucose_Fasting: range evaluation are incompatible with life.
- Cholesterol: range evaluation are incompatible.
- HDL: range evaluation are incompatible.
- LDL: range evaluation are incompatible.
- Triglycerides: range evaluation are incompatible with variables HDL, LDL and Cholesterol.
- Hemoglobin: range evaluation are incompatible with life.
- Serum_Creatinine: range evaluation are incompatible with life.
- GOT: range evaluation are incompatible with life.
- GPT: range evaluation are incompatible with life.
- GGT: range evaluation are incompatible with life.

In [49]:
# drop inconsistent data
data.drop(data[data['waistline'] >= 150].index, inplace=True)
data.drop(data[data['waistline'] == 8].index, inplace=True)
data.drop(data[data['glucose_fasting'] <= 40].index, inplace=True)
data.drop(data[~(data['HDL'] <= data['total_cholesterol'])].index, inplace=True)
data.drop(data[data['triglyceride'] <= 10].index, inplace=True)
data.drop(data[data['triglyceride'] > 500].index, inplace=True)
data.drop(data[data['total_cholesterol'] > 500].index, inplace=True)
data.drop(data[data['HDL'] <= 10].index, inplace=True)
data.drop(data[data['hemoglobin'] < 4].index, inplace=True)
data.drop(data[data['serum_creatinine'] > 10].index, inplace=True)
data.drop(data[data['AST'] < 3].index, inplace=True)
data.drop(data[data['AST'] > 500].index, inplace=True)
data.drop(data[data['ALT'] < 3].index, inplace=True)
data.drop(data[data['ALT'] > 500].index, inplace=True)
data.drop(data[data['GGT'] < 3].index, inplace=True)
data.drop(data[data['GGT'] > 500].index, inplace=True)

# Drop columns

data.drop('LDL', axis=1, inplace=True)
data.drop('sight_left', axis=1, inplace=True)
data.drop('sight_right', axis=1, inplace=True)
data.drop('hear_left', axis=1, inplace=True)
data.drop('hear_right', axis=1, inplace=True)

## Explanation of the data cleaning process:
- The data eliminated were those whose values were disproportionate to the range of the variable. The intention was to maintain pathological values that are considered outliers so that the model would be able to generalize the data.
- The LDL variable was eliminated to create a new variable that is composed of the difference between the values of Cholesterol and HDL.

## Create new variables:

In [50]:
# Add  new variable NO_HDL 
data['NO_HDL'] = data['total_cholesterol'] - data['HDL']

# add new variable for AST_ALT
data['AST_ALT'] = data['AST'] / data['ALT']

# add new variable BMI
data['BMI'] = data['weight'] / ((data['height'] / 100) ** 2)

# add new variable for hypertension
data['hypertension'] = data.apply(lambda x: 1 if (x['systolic_blood_pressure'] > 130) & (x['diastolic_blood_pressure'] > 80) else 0, axis=1)

# add new variable for diabetes
data['diabetes'] = data.apply(lambda x: 1 if x['glucose_fasting'] >= 126 else 0, axis=1)

## Transform variables:

In [51]:
# transform drinker and sex to binary 

data['drinker'] = data['drinker'].apply(lambda x: 1 if x == 'Y' else 0)
data['sex'] = data['sex'].apply(lambda x: 1 if x == 'Male' else 0)
data['urine_protein'] = data['urine_protein'].apply(lambda x: 1 if x > 1.0 else 0)

Now the dataset no have inconsistent data.

In [52]:
data.columns

Index(['sex', 'age', 'height', 'weight', 'waistline',
       'systolic_blood_pressure', 'diastolic_blood_pressure',
       'glucose_fasting', 'total_cholesterol', 'HDL', 'triglyceride',
       'hemoglobin', 'urine_protein', 'serum_creatinine', 'AST', 'ALT', 'GGT',
       'smoker_status', 'drinker', 'NO_HDL', 'AST_ALT', 'BMI', 'hypertension',
       'diabetes'],
      dtype='object')

In [53]:
data.dtypes

sex                           int64
age                           int64
height                        int64
weight                        int64
waistline                   float64
systolic_blood_pressure     float64
diastolic_blood_pressure    float64
glucose_fasting             float64
total_cholesterol           float64
HDL                         float64
triglyceride                float64
hemoglobin                  float64
urine_protein                 int64
serum_creatinine            float64
AST                         float64
ALT                         float64
GGT                         float64
smoker_status               float64
drinker                       int64
NO_HDL                      float64
AST_ALT                     float64
BMI                         float64
hypertension                  int64
diabetes                      int64
dtype: object

Now the variables are with the correct data type.

In [54]:
# Detect missing values
data.isna().sum()

sex                         0
age                         0
height                      0
weight                      0
waistline                   0
systolic_blood_pressure     0
diastolic_blood_pressure    0
glucose_fasting             0
total_cholesterol           0
HDL                         0
triglyceride                0
hemoglobin                  0
urine_protein               0
serum_creatinine            0
AST                         0
ALT                         0
GGT                         0
smoker_status               0
drinker                     0
NO_HDL                      0
AST_ALT                     0
BMI                         0
hypertension                0
diabetes                    0
dtype: int64

In [55]:
# Detect duplicated values
data.duplicated().sum()
# Drop duplicated values
data.drop_duplicates(inplace=True)
# Verify if duplicated values were dropped
data.duplicated().sum()

0

In [56]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sex,979089.0,0.527085,0.499266,0.0,0.0,1.0,1.0,1.0
age,979089.0,47.637125,14.208497,20.0,35.0,45.0,60.0,85.0
height,979089.0,162.185925,9.283383,130.0,155.0,160.0,170.0,190.0
weight,979089.0,63.181509,12.470856,25.0,55.0,60.0,70.0,140.0
waistline,979089.0,81.10743,9.580558,27.0,74.0,81.0,87.4,149.1
systolic_blood_pressure,979089.0,122.346252,14.516173,67.0,112.0,120.0,131.0,273.0
diastolic_blood_pressure,979089.0,75.985745,9.863042,32.0,70.0,76.0,81.0,185.0
glucose_fasting,979089.0,100.184705,23.61051,42.0,88.0,96.0,105.0,852.0
total_cholesterol,979089.0,195.100876,37.672464,54.0,169.0,193.0,219.0,492.0
HDL,979089.0,57.063122,14.804796,11.0,46.0,55.0,66.0,209.0


In [57]:
# Scale data
from scripts.manipulation.scale_data import scale_data

data_scaled = scale_data(data, method='standard')

In [58]:
# Use a logit model to evaluate the importance of each variable with statsmodels
import statsmodels.api as sm
from statsmodels.formula.api import logit

# Separate the target variable from the features
y = data_scaled['drinker']
X = data_scaled.drop(['drinker'], axis=1)

# Create the model
model = sm.Logit(y, X)

# Fit the model
result = model.fit()

# Print the summary
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.545228
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                drinker   No. Observations:               979089
Model:                          Logit   Df Residuals:                   979067
Method:                           MLE   Df Model:                           21
Date:                Thu, 19 Oct 2023   Pseudo R-squ.:                  0.2134
Time:                        09:48:02   Log-Likelihood:            -5.3383e+05
converged:                       True   LL-Null:                   -6.7863e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
sex                          0.3249      0.005     67.406      0.000       0.315

In [59]:
from scripts.analysis.multicollinearity import detect_multicollinearity

detect_multicollinearity(data=data_scaled)


Unnamed: 0,Feature,VIF,Multicollinearity
0,sex,3.618564,No
1,age,1.858164,No
2,height,48.193009,Yes
3,weight,134.857496,Yes
4,waistline,4.250579,No
5,systolic_blood_pressure,2.707209,No
6,diastolic_blood_pressure,2.56277,No
7,glucose_fasting,2.233924,No
8,total_cholesterol,inf,Yes
9,HDL,inf,Yes


In [60]:
# drop variables with multicollinearity

data_scaled.drop(['weight', 'height', 'total_cholesterol', ], axis=1, inplace=True)

In [61]:
detect_multicollinearity(data=data_scaled)

Unnamed: 0,Feature,VIF,Multicollinearity
0,sex,3.27994,No
1,age,1.396328,No
2,waistline,3.665523,No
3,systolic_blood_pressure,2.703443,No
4,diastolic_blood_pressure,2.557353,No
5,glucose_fasting,2.223903,No
6,HDL,1.404363,No
7,triglyceride,1.61575,No
8,hemoglobin,1.802993,No
9,urine_protein,1.063324,No


In [62]:
# Now we will save the data in a new csv file

data_scaled.to_csv('../data/processed/clean.csv', index=False)