In [19]:
# %matplotlib inline
import pandas  as pd  
import numpy as np
import re
import math
import os; 
  
from health.utils import list_files,slugify, DTYPE_DICT

%reload_ext autoreload
%autoreload 2

# Column name held as a notebook-level constant. It is not referenced by the
# cells visible in this section -- presumably the label column; TODO confirm.
TARGET='Depression'




Cleans up the columns of the raw data to generate the processed data.
Iterates through all folders in the raw folder and generates the corresponding cleaned files in the `processed/v1` folder.

Actions done
1. Cleans up CGPA to buckets
2. Cleans up Sleep Duration
3. Cleans up values for City, Degree, Dietary Habits, Profession

In [20]:

def cleanup_CGPA(val):
    """Bucket a raw CGPA value by flooring it to one decimal place.

    Args:
        val: Raw CGPA as a number or numeric string. May be missing
            (``None``, ``NaN``, or an empty/blank string).

    Returns:
        The CGPA floored to one decimal place (e.g. ``"8.97"`` -> ``8.9``),
        or ``-1`` when the value is missing or cannot be read as a finite
        number.
    """
    # Missing values must be detected *before* the conversion: float("") and
    # float(None) raise, so an emptiness test after float() can never succeed.
    if val is None or (isinstance(val, str) and val.strip() == ""):
        return -1
    try:
        num_val = float(val)
    except (TypeError, ValueError):
        # Not interpretable as a number: treat it like a missing value.
        return -1
    # NaN marks a missing cell; +/-inf cannot be floored (OverflowError).
    if not math.isfinite(num_val):
        return -1
    return math.floor(num_val * 10) / 10


Converts free-form sleep-duration values to a number of hours, defaulting to 6 hours of sleep when a value cannot be interpreted.

In [21]:

# Matches a leading "<number><optional spaces><one non-digit separator><number>"
# range such as "5-6 hours". Written without the possessive "{1}+" quantifier,
# which only compiles on Python 3.11+ and raises "multiple repeat" on older
# interpreters; this spelling accepts the same strings on every version.
two_digit_pattern = r"(\d+)\s*[^\d](\d+)"
single_number_pattern = r".*([\d]{2})"

# Fallback number of hours used whenever a sleep value cannot be interpreted.
DEFAULT_SLEEP_HOURS = 6

# Fixed phrases that map directly to a number of hours.
_SLEEP_PHRASE_HOURS = {
    "More than 8 hours": 8,
    "Less than 5 hours": 4.5,
    "Moderate": 6,
}


def cleanup_sleep(sleep):
    """Convert a free-text sleep-duration answer to a number of hours.

    Args:
        sleep: Raw answer, e.g. ``"More than 8 hours"`` or ``"5-6 hours"``.
            Non-string values (``NaN``, ``None``) are treated as unreadable.

    Returns:
        ``8``, ``4.5`` or ``6`` for the known phrases; the midpoint of a
        leading ``"a-b"`` range when that midpoint is below 10; otherwise the
        default of 6 hours.
    """
    # Missing cells arrive as NaN/None rather than str, and re.match would
    # raise TypeError on them.
    if not isinstance(sleep, str):
        return DEFAULT_SLEEP_HOURS

    if sleep in _SLEEP_PHRASE_HOURS:
        return _SLEEP_PHRASE_HOURS[sleep]

    # re.match anchors at the start of the string, so only a leading range counts.
    range_match = re.match(two_digit_pattern, sleep)
    if range_match is None:
        return DEFAULT_SLEEP_HOURS

    low, high = range_match.groups()
    midpoint = (int(low) + int(high)) / 2
    # Midpoints of 10 hours or more are treated as bad data.
    if midpoint < 10:
        return midpoint
    return DEFAULT_SLEEP_HOURS


Cleans up values using an allowed list to remove erroneous data; everything not found in the list is bucketed to "Other" or a default value (the last entry of the allowed list).

In [22]:


def cleanup_with_lables(name, allowed=()):
    """Restrict a categorical value to an allowed list of labels.

    Args:
        name: Raw value to check.
        allowed: Sequence (list, tuple or array) of accepted labels. Its last
            entry is used as the fallback bucket for unknown values.

    Returns:
        ``''`` when ``name`` is the empty string; ``name`` itself when it is
        in ``allowed``; otherwise ``allowed[-1]``. When ``allowed`` is empty
        there is no fallback bucket, so ``name`` is returned unchanged.
    """
    if name == '':
        return ''
    # Use len() rather than truthiness: a multi-element NumPy array raises
    # on bool(). Without this guard an empty list fails on allowed[-1].
    if len(allowed) == 0:
        return name
    if name in allowed:
        return name
    return allowed[-1]
     


In [33]:
## Check unique labels on each cols
# Loads every raw CSV, combines them into one frame and writes one value-count
# report per column so the distinct raw labels can be inspected.
raw_files = list_files("../data/raw")

df_list = []
for f in raw_files:
    # Test the extension rather than a substring, so paths that merely
    # contain ".csv" (e.g. "x.csv.bak") are not read.
    if not f.endswith(".csv"):
        continue

    d = pd.read_csv(f, dtype=DTYPE_DICT)
    df_list.append(d)

# Combine the list of dataframes
df = pd.concat(df_list)
cols = df.columns.to_list()
cols.remove("id")
cols.remove("Name")

# to_csv does not create missing directories, so make sure the report folder exists.
os.makedirs("../reports/raw", exist_ok=True)
for col in cols:
    sr = df[col].value_counts()
    sr.to_csv(f'../reports/raw/vals_{slugify(col)}.csv')



In [35]:
def cleanup(val, col, labels=[]):
    """Route one raw cell value to the cleaner that applies to its column.

    Args:
        val: Raw cell value.
        col: Name of the column the value belongs to.
        labels: Allowed labels for the column; only consulted for columns
            other than "CGPA" and "Sleep Duration".

    Returns:
        The cleaned value: the CGPA bucket for "CGPA", the sleep hours for
        "Sleep Duration", the label-restricted value when ``labels`` is
        non-empty, and ``val`` unchanged otherwise.
    """
    if col == "CGPA":
        return cleanup_CGPA(val)
    if col == "Sleep Duration":
        return cleanup_sleep(val)

    # len() is used (not truthiness) because labels may be a NumPy array.
    has_labels = len(labels) > 0
    return cleanup_with_lables(val, labels) if has_labels else val

 

def clean_data(input, column_names):
    """Build a cleaned copy of ``input`` restricted to ``id`` and ``column_names``.

    For each column, an optional label file ``../health/labels/<slug>.csv``
    (column ``name``) supplies the allowed values; every cell is then passed
    through ``cleanup``.

    Args:
        input: Raw DataFrame; must contain an ``id`` column. (The parameter
            name shadows the builtin but is kept for caller compatibility.)
        column_names: Names of the columns to clean.

    Returns:
        A new DataFrame with ``id`` plus one cleaned column per entry of
        ``column_names``. Cleaning is best-effort: a column whose cleaning
        raises is reported and left out of the result.
    """
    result = pd.DataFrame()
    result["id"] = input["id"]
    for col in column_names:
        try:
            labels = []
            label_file = f"../health/labels/{slugify(col)}.csv"
            if os.path.isfile(label_file):
                labels = pd.read_csv(filepath_or_buffer=label_file)["name"].unique()
                print(f"reading {label_file} : ", len(labels))
            # The lambda is consumed immediately by apply(), so capturing the
            # loop variables col/labels here is safe.
            result[col] = input[col].apply(lambda x: cleanup(x, col, labels=labels))
        except Exception as err:
            # Keep going with the remaining columns, but say which column was
            # dropped so that the gap in the output can be diagnosed.
            print(f"error occurred while cleaning column '{col}' (column skipped):", err)
    return result



def clean_all_files():
    """Clean every raw CSV under ``../data/raw`` into ``processed/v1``.

    For each raw ``.csv`` file the target path is obtained by replacing
    ``/raw/`` with ``/processed/v1/``. Files whose target already exists are
    skipped; otherwise the file is read, every column except ``id`` and
    ``Name`` is cleaned with ``clean_data`` and the result is written to the
    target without the index.
    """
    raw_files = list_files("../data/raw")

    for f in raw_files:
        # Test the extension rather than a substring, so paths that merely
        # contain ".csv" (e.g. "x.csv.bak") are not processed.
        if not f.endswith(".csv"):
            continue

        # If the path has no "/raw/" segment the target equals the source,
        # which exists, so the check below also prevents overwriting raw data.
        target = f.replace("/raw/", "/processed/v1/")
        if os.path.isfile(target):
            print(f"File '{target}' already exists")
            continue

        print(f"processing '{f}' ")
        df = pd.read_csv(f, dtype=DTYPE_DICT)

        # Identifier columns are not cleaned; filtering (instead of
        # list.remove) also tolerates files that lack one of them.
        column_names = [c for c in df.keys().to_list() if c not in ("id", "Name")]

        result = clean_data(df, column_names)

        # to_csv does not create missing directories.
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        result.to_csv(target, index=False)
        print(f"Completed '{f}' ")

# Run the cleaning pass over the raw folder; files whose processed copy
# already exists are skipped, so re-running this cell only handles new files.
clean_all_files()

processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/kaggle-test.csv' 
reading ../health/labels/city.csv :  31
reading ../health/labels/profession.csv :  37
reading ../health/labels/dietary-habits.csv :  3
reading ../health/labels/degree.csv :  28
Completed '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/kaggle-test.csv' 
processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/test.csv' 
reading ../health/labels/city.csv :  31
reading ../health/labels/profession.csv :  37
reading ../health/labels/dietary-habits.csv :  3
reading ../health/labels/degree.csv :  28
Completed '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/test.csv' 
processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-4.csv' 
reading ../health/labels/city.csv :  31
reading ../health/labels/profession.csv :  37
reading ../health/labels/dietary-habits.csv :  3
reading ../health/labels/degree.csv :  28
Completed