In [2]:
# %matplotlib inline
import pandas  as pd  
import numpy as np
import re
import math
import os;
import sys;
  
from health.utils import list_files,slugify, ALLOWED_COLS

%reload_ext autoreload
%autoreload 2


# Column name held as the target; it matches the last entry of the column
# reference below. It is not referenced by the other cells visible here.
TARGET='Depression'

# Reference list of the dataset's column names. Because this bare string is
# the last expression in the cell, the notebook also echoes it as cell output.
"""
Column Names
,id,
Name,
Gender,
Age,
City,
Working Professional or Student,
Profession,
Academic Pressure,
Work Pressure,
CGPA,
Study Satisfaction,
Job Satisfaction,
Sleep Duration,
Dietary Habits,
Degree,
Have you ever had suicidal thoughts ?,
Work/Study Hours,
Financial Stress,
Family History of Mental Illness,
Depression

"""




[32m2024-11-18 20:29:24.916[0m | [1mINFO    [0m | [36mhealth.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/ceegees/workspace/iisc-learning/are-you-depressed[0m


'\nColumn Names\n,id,\nName,\nGender,\nAge,\nCity,\nWorking Professional or Student,\nProfession,\nAcademic Pressure,\nWork Pressure,\nCGPA,\nStudy Satisfaction,\nJob Satisfaction,\nSleep Duration,\nDietary Habits,\nDegree,\nHave you ever had suicidal thoughts ?,\nWork/Study Hours,\nFinancial Stress,\nFamily History of Mental Illness,\nDepression\n\n'

Cleans up the columns of the raw data to generate the processed data.
Iterates through all CSV files in the raw folder and generates the corresponding cleaned files in the processed folder.

Actions done
1. Cleans up CGPA to buckets
2. Cleans up Sleep Duration
3. Cleans up values for City, Degree, Dietary Habits, Profession

In [3]:

def cleanup_CGPA(val):
    """Bucket a raw CGPA value by flooring it to one decimal place.

    Args:
        val: Raw CGPA as read from the CSV: a numeric string, a number,
            ``None``, an empty/blank string, or NaN for a missing cell.

    Returns:
        The CGPA floored to one decimal (e.g. ``"8.97"`` -> ``8.9``), or ``-1``
        when the value is missing or cannot be read as a finite number.
    """
    # Blank cells must be caught *before* float(): float("") raises ValueError,
    # so a check placed after the conversion can never be reached.
    if val is None or (isinstance(val, str) and not val.strip()):
        return -1
    try:
        num_val = float(val)
    except (TypeError, ValueError):
        # Non-numeric junk is treated like a missing value instead of aborting
        # the clean-up of the whole column.
        return -1
    # NaN marks a missing cell; +/-inf cannot be floored (OverflowError).
    if not math.isfinite(num_val):
        return -1
    return math.floor(num_val * 10) / 10


Converts free-form sleep-duration values to a number of hours, defaulting to 6 hours of sleep when a value cannot be interpreted.

In [4]:

# "<digits><optional whitespace><one non-digit><digits>", e.g. "7-8 hours".
# Written without the possessive "{1}+" quantifier, which only compiles on
# Python 3.11+; the plain form matches exactly the same strings.
two_digit_pattern = r"(\d+)\s*\D(\d+)"
# Not used by cleanup_sleep; kept so any other cell referencing it still works.
single_number_pattern = r".*([\d]{2})"

# Fallback used whenever a value cannot be interpreted.
_DEFAULT_SLEEP_HOURS = 6

# Exact textual labels and the number of hours they are mapped to.
_SLEEP_LABEL_HOURS = {
    "More than 8 hours": 8,
    "Less than 5 hours": 4.5,
    "Moderate": 6,
}


def cleanup_sleep(sleep):
    """Convert a free-form sleep-duration value into a number of hours.

    Args:
        sleep: Raw "Sleep Duration" cell. Usually a string; a missing cell
            arrives as a non-string (NaN/None).

    Returns:
        * the mapped value for the known labels ("More than 8 hours" -> 8,
          "Less than 5 hours" -> 4.5, "Moderate" -> 6);
        * the midpoint of a leading range such as "7-8 hours" (-> 7.5) when
          that midpoint is below 10;
        * 6 for everything else, including missing or non-string values.
    """
    # Missing cells are not strings; re.match would raise TypeError on them.
    if not isinstance(sleep, str):
        return _DEFAULT_SLEEP_HOURS

    if sleep in _SLEEP_LABEL_HOURS:
        return _SLEEP_LABEL_HOURS[sleep]

    # One regex evaluation is enough: the match object already carries both
    # captured numbers.
    range_match = re.match(two_digit_pattern, sleep)
    if range_match:
        low, high = range_match.groups()
        midpoint = (int(low) + int(high)) / 2
        # Midpoints of 10 or more are treated as unusable and fall through to
        # the default.
        if midpoint < 10:
            return midpoint

    return _DEFAULT_SLEEP_HOURS


Cleans up a column against a list of allowed values to remove erroneous data; every value not found in the list is bucketed to "other" or to a default value.

In [5]:


def cleanup_with_list(name, allowed=None):
    """Keep ``name`` if it is an allowed value, otherwise return the default.

    Args:
        name: Raw cell value to validate.
        allowed: Table with a ``name`` column (the accepted values) and a
            ``count`` column; the first row whose ``count`` is 0 supplies the
            replacement for values that are not accepted. When omitted or
            empty there is nothing to validate against and ``name`` is
            returned unchanged.

    Returns:
        ``name`` when it appears in ``allowed["name"]``, else the ``name`` of
        the first row with ``count == 0``.

    Raises:
        IndexError: ``name`` is not allowed and the table has no
            ``count == 0`` row to fall back on.
    """
    # The previous mutable `[]` default could not be indexed by "name" and
    # crashed; "no table" now means "pass the value through".
    if allowed is None or len(allowed) == 0:
        return name

    # A boolean reduction avoids materialising a filtered DataFrame per call.
    if (allowed["name"] == name).any():
        return name

    fallback = allowed.loc[allowed["count"] == 0, "name"]
    if fallback.empty:
        raise IndexError(
            f"value {name!r} is not allowed and the allowed-values table "
            "has no default row (count == 0)"
        )
    return fallback.iloc[:1].to_list()[0]


In [8]:
def cleanup(val, col, allowed=[]):
    """Route one cell value to the cleaner that matches its column.

    Args:
        val: Raw cell value.
        col: Name of the column the value belongs to.
        allowed: Optional allowed-values table for the column; only consulted
            when it is non-empty and the column has no dedicated cleaner.

    Returns:
        The cleaned value, or ``val`` untouched when no cleaner applies.
    """
    # Columns with a dedicated cleaner take precedence over any allow-list.
    if col == "CGPA":
        return cleanup_CGPA(val)
    if col == "Sleep Duration":
        return cleanup_sleep(val)

    has_allow_list = len(allowed) > 0
    return cleanup_with_list(val, allowed) if has_allow_list else val


def clean_data(input, column_names, interim_dir="../data/interim"):
    """Build a cleaned copy of ``input`` containing ``id`` plus the given columns.

    For each column, an allowed-values table is read from
    ``<interim_dir>/<slugified column name>.csv`` when that file exists, and
    every cell is passed through ``cleanup``.

    Args:
        input: Source DataFrame; must contain an ``id`` column.
        column_names: Columns to clean and copy into the result.
        interim_dir: Directory holding the per-column allowed-values CSV
            files. Defaults to the location that was previously hard-coded.

    Returns:
        A new DataFrame with ``id`` and one cleaned column per entry of
        ``column_names``. A column whose clean-up raises is reported and left
        out of the result (best effort); the remaining columns are still
        processed.
    """
    result = pd.DataFrame()
    result["id"] = input["id"]
    for col in column_names:
        try:
            allowed = []
            allowed_file = os.path.join(interim_dir, f"{slugify(col)}.csv")
            if os.path.isfile(allowed_file):
                print(f"reading {allowed_file}")
                allowed = pd.read_csv(filepath_or_buffer=allowed_file)
            result[col] = input[col].apply(lambda x: cleanup(x, col, allowed=allowed))
        except Exception as err:
            # Deliberately best-effort: keep going, but name the column so the
            # failure can be traced.
            print(f"error occurred while cleaning column '{col}': {err}")
    return result



def clean_all_files():
    """Clean every raw CSV and write the result to the processed folder.

    Each path returned by ``list_files("../data/raw")`` that contains ".csv"
    is mapped to its target by replacing "/raw/" with "/processed/" in the
    path. Files whose target already exists are skipped. Otherwise the file
    is read, every column except ``id`` and ``Name`` is passed to
    ``clean_data``, and the result is written to the target path.
    """
    raw_files = list_files("../data/raw")

    for f in raw_files:
        if ".csv" not in f:
            continue

        target = f.replace("/raw/", "/processed/")
        if os.path.isfile(target):
            # Report the file that actually exists (the target, not the input).
            print(f"Skipping '{f}': '{target}' already exists")
            continue
        print(f"processing '{f}' ")
        df = pd.read_csv(
            f, dtype={"Depression": str, "CGPA": str, "Sleep Duration": str}
        )
        # Filter instead of list.remove(): remove() raises ValueError when a
        # file lacks one of these columns.
        column_names = [c for c in df.columns if c not in ("id", "Name")]
        result = clean_data(df, column_names)

        # to_csv does not create missing directories.
        target_dir = os.path.dirname(target)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        result.to_csv(target)
        print(f"Completed '{f}' ")

# Run the clean-up over the raw folder; raw files whose processed counterpart
# already exists are skipped, so re-running this cell only handles new files.
clean_all_files()

File /Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/kaggle-test.csv already exists
File /Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/test.csv already exists
File /Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-4.csv already exists
File /Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-0.csv already exists
File /Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-1.csv already exists
processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-3.csv' 
reading ../data/interim/city.csv
reading ../data/interim/profession.csv
reading ../data/interim/dietary-habits.csv
reading ../data/interim/degree.csv
Completed '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-3.csv' 
processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/train-2.csv' 
reading ../data/interim/city.csv
reading ../data/interim/profession.csv
reading ../d