In [None]:
# Import necessary modules
import data_preprocessor as dp
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# ---------------------------------------------------------------------------
# Step 1 - read the raw CSV into a DataFrame
# ---------------------------------------------------------------------------
messy_data = pd.read_csv('../Data/messy_data.csv')

# First look at the raw frame: leading rows, per-column dtype / non-null
# summary, and descriptive statistics.
print(messy_data.head())
messy_data.info()
print(messy_data.describe())

# All cleaning below operates on a copy; `messy_data` itself is not modified
# by the in-place drop further down.
clean_data = messy_data.copy()

# ---------------------------------------------------------------------------
# Step 2 - preprocessing
# ---------------------------------------------------------------------------

# Per-column fraction of missing entries (missing count / number of rows).
missing_data = clean_data.isna().sum().div(len(clean_data))
print(missing_data)

# Any column whose missing fraction is strictly above 0.6 is dropped.
drop_cols = missing_data[missing_data.gt(0.6)].index
clean_data.drop(columns=drop_cols, inplace=True)

print(clean_data.shape)

# The remaining steps delegate to the project's `data_preprocessor` helpers.
# NOTE(review): their exact semantics are not visible here; the names suggest
# mean imputation, duplicate removal, normalization and redundant-feature
# removal - confirm against data_preprocessor.
clean_data = dp.impute_missing_values(clean_data, strategy='mean')
clean_data.info()

clean_data = dp.remove_duplicates(clean_data)
print(clean_data.shape)

clean_data = dp.normalize_data(clean_data)
print(clean_data.head())

clean_data = dp.remove_redundant_features(clean_data)
print(clean_data.shape)


# ---------------------------------------------------------------------------
# Step 3 - persist the cleaned dataset (currently disabled; uncomment to write)
# ---------------------------------------------------------------------------
# clean_data.to_csv('../Data/clean_data.csv', index=False)

# ---------------------------------------------------------------------------
# Step 4 - hand the cleaned frame to the project's model routine
# ---------------------------------------------------------------------------
dp.simple_model(clean_data)

   target               a         b   c                  d          e  \
0     0.0  lv hypertrophy  0.531368  63       fixed defect  Cleveland   
1     1.0  lv hypertrophy -1.056253  67             normal  Cleveland   
2     1.0  lv hypertrophy  0.438407  67  reversable defect  Cleveland   
3     0.0          normal -1.413430  37             normal  Cleveland   
4     0.0  lv hypertrophy -1.347216  41             normal  Cleveland   

          f                g      h      i  ...          r       s    t  \
0  0.285812   typical angina  233.0   True  ...  60.627689    Male  2.3   
1  1.660947     asymptomatic  286.0  False  ...  65.225088    Male  1.5   
2 -0.305235     asymptomatic  229.0  False  ...  66.956699    Male  2.6   
3  0.887054      non-anginal  250.0  False  ...  37.160246    Male  3.5   
4 -0.558037  atypical angina  204.0  False  ...  41.359067  Female  1.4   

           u      v         w    x          y   z      {  
0  62.306825  145.0 -1.007833  0.0  61.730527 NaN  