In [21]:
# %matplotlib inline
import pandas  as pd  
import numpy as np
import re
import math
import os;
import sys;
 
sys.path.insert(0, os.path.abspath('../'))
from health.utils import list_files,slugify

%reload_ext autoreload
%autoreload 2


# Presumably the name of the prediction target column; it is not referenced
# by any of the cells visible here -- TODO confirm where it is used.
TARGET='Depression'

"""
Column Names
,id,
Name,
Gender,
Age,
City,
Working Professional or Student,
Profession,
Academic Pressure,
Work Pressure,
CGPA,
Study Satisfaction,
Job Satisfaction,
Sleep Duration,
Dietary Habits,
Degree,
Have you ever had suicidal thoughts ?,
Work/Study Hours,
Financial Stress,
Family History of Mental Illness,
Depression

"""

# Columns loaded from every raw CSV (passed as `usecols` to `pd.read_csv`).
# 'Name' is commented out, so it is never loaded.
# The trailing comments record the author's data-quality notes per column.
allowed_cols = [
    'id',
    # 'Name',
    'Gender', # Cleaned
    'Age', # Cleaned
    'City', # Cleaned
    'Working Professional or Student', # Data is clean
    'Profession', # Minor errors left unfixed: too few rows affected to matter statistically
    'Academic Pressure', # Data is clean
    'Work Pressure', # Data is clean
    'CGPA', # Cleaned
    'Study Satisfaction', # Data is clean
    'Job Satisfaction', # Data is clean
    'Sleep Duration', # Cleaned
    'Dietary Habits', # Minor issues, not statistically relevant
    'Degree',# Minor issues, not statistically relevant
    'Have you ever had suicidal thoughts ?', # Data is clean
    'Work/Study Hours', # Data is clean
    'Financial Stress', # Data is clean
    'Family History of Mental Illness', # Data is clean
    'Depression',
]



In [22]:
"""
Following are utility functions created for listing files in a directory and generate a usable key when random values are present in column names

"""
def slugify(s):
    """Turn a column name into a lowercase, hyphen-separated key.

    The text is lower-cased and stripped of surrounding whitespace, then every
    run of characters outside ``[A-Za-z0-9]`` is collapsed into a single ``-``.
    Punctuation at the edges is not trimmed, so a name ending in `` ?``
    produces a slug with a trailing hyphen.

    Note: a `slugify` is also imported from `health.utils` at the top of the
    notebook; this definition replaces it once the cell has run.
    """
    normalised = s.lower().strip()
    return re.sub(r'[^A-Za-z0-9]+', '-', normalised)

def list_files(directory):
    """Return the absolute paths of the regular files directly inside `directory`.

    Sub-directories are skipped (no recursion). The result follows the order
    in which `os.listdir` yields the entries.

    Note: a `list_files` is also imported from `health.utils` at the top of the
    notebook; this definition replaces it once the cell has run.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [os.path.abspath(path) for path in candidates if os.path.isfile(path)]
# print(f)

Cleans up the columns of the raw data to generate processed data.
Iterates through all files in the raw folder and generates the corresponding cleaned files in the processed folder.

Actions done
1. Cleans up CGPA to buckets
2. Cleans up Sleep Duration
3. Cleans up values for City, Degree, Dietary Habits, Profession

In [23]:

def cleanup_CGPA(val):
    """Bucket a raw CGPA value by truncating it to one decimal place.

    Args:
        val: Raw CGPA value -- a numeric string, a number, ``None``,
            an empty/blank string, or NaN.

    Returns:
        ``-1`` when the value is missing (``None``, blank string or NaN);
        otherwise the CGPA floored to one decimal place
        (e.g. ``"8.97"`` -> ``8.9``).

    Raises:
        ValueError: if `val` is a non-blank string that is not a number.
    """
    # Missing markers have to be handled before float(): float("") raises
    # ValueError and float(None) raises TypeError, so checking afterwards
    # (as the previous version did) could never reach the empty-string case.
    if val is None or (isinstance(val, str) and val.strip() == ""):
        return -1
    num_val = float(val)
    if math.isnan(num_val):
        return -1
    return math.floor(num_val * 10) / 10


Cleans up free-text sleep values into a number of hours, defaulting to 6 hours of sleep when the value is not recognised

In [24]:

# "<digits><optional spaces><one non-digit><digits>", e.g. "5-6 hours".
# Written without the former `{1}` / `{1}+` repeats: `{1}+` is a possessive
# quantifier that only compiles on Python 3.11+ ("multiple repeat" error on
# older versions) and, for a single character class, matches exactly the same
# text as the plain form used here.
two_digit_pattern = r"(\d+)\s*[^\d](\d+)"
# Not referenced by the cells visible here; kept unchanged.
single_number_pattern = r".*([\d]{2})"

def cleanup_sleep(sleep):
    """Convert a free-text sleep-duration answer into a number of hours.

    Args:
        sleep: Raw answer, e.g. ``"More than 8 hours"``, ``"5-6 hours"``.
            Non-string values (``None``, NaN from a missing CSV cell) are
            accepted and treated as unrecognised.

    Returns:
        * ``8`` for ``"More than 8 hours"``
        * ``4.5`` for ``"Less than 5 hours"``
        * ``6`` for ``"Moderate"``
        * the midpoint of an ``"<a>-<b>"`` style range at the start of the
          text, when that midpoint is below 10
        * ``6`` (the default) for everything else
    """
    default_hours = 6

    # re.match raises TypeError on non-strings (e.g. float NaN for a missing
    # cell); treat those as unrecognised instead of failing the whole column.
    if not isinstance(sleep, str):
        return default_hours

    known_answers = {
        "More than 8 hours": 8,
        "Less than 5 hours": 4.5,
        "Moderate": 6,
    }
    if sleep in known_answers:
        return known_answers[sleep]

    range_match = re.match(two_digit_pattern, sleep)
    if range_match:
        low, high = range_match.groups()
        midpoint = (int(low) + int(high)) / 2
        # Midpoints of 10 hours or more are treated as data-entry noise.
        if midpoint < 10:
            return midpoint

    return default_hours


Cleans up a column against an allowed list of values to remove erroneous data; everything not found is bucketed to "other" or a default value

In [25]:


def cleanup_with_list(name, allowed=None):
    """Keep `name` if it is in the allow-list, otherwise return the default bucket.

    Args:
        name: Raw value to validate.
        allowed: DataFrame with a ``name`` column (the accepted values) and a
            ``count`` column. The first row whose ``count`` is ``0`` supplies
            the fallback value. ``None`` or an empty container means "no
            allow-list", in which case `name` is returned unchanged.

    Returns:
        `name` when it appears in ``allowed["name"]`` (or when there is no
        allow-list); otherwise the ``name`` of the first row with
        ``count == 0``.

    Raises:
        IndexError: if `name` is not allowed and the allow-list has no row
            with ``count == 0`` to fall back to.
    """
    # The previous default of `[]` could never work (a list cannot be indexed
    # by "name"); treat a missing/empty allow-list as "accept everything".
    if allowed is None or len(allowed) == 0:
        return name

    if (allowed["name"] == name).any():
        return name

    fallback = allowed.loc[allowed["count"] == 0, "name"].head(1).to_list()
    if not fallback:
        # Same exception type as before, but with an actionable message.
        raise IndexError(
            f"value {name!r} is not in the allowed list and the list has no "
            "default row (count == 0)"
        )
    return fallback[0]


In [26]:
def cleanup(val, col, allowed=[]):
    """Clean a single cell value according to the column it belongs to.

    Args:
        val: Raw cell value.
        col: Original (un-slugified) column name.
        allowed: Optional allow-list forwarded to `cleanup_with_list`; it is
            only consulted when it is non-empty.

    Returns:
        The result of `cleanup_CGPA` for the "CGPA" column, of `cleanup_sleep`
        for "Sleep Duration", of `cleanup_with_list` when an allow-list is
        supplied, and `val` unchanged otherwise.
    """
    # Column-specific cleaners take precedence over any allow-list.
    if col == "CGPA":
        return cleanup_CGPA(val)
    if col == "Sleep Duration":
        return cleanup_sleep(val)

    # No allow-list available: pass the value through untouched.
    if len(allowed) == 0:
        return val
    return cleanup_with_list(val, allowed)


def clean_data(input, column_names, interim_dir="../data/interim"):
    """Build a cleaned copy of `input` with slugified column names.

    For every column in `column_names` the slug of the column name is computed
    with `slugify`. If ``<interim_dir>/<slug>.csv`` exists it is read as that
    column's allow-list, and each cell is then passed through `cleanup`.

    Args:
        input: Source DataFrame; must contain an ``id`` column.
        column_names: Names of the columns of `input` to clean.
        interim_dir: Directory holding the optional per-column allow-list
            CSV files. Defaults to the previously hard-coded location.

    Returns:
        A new DataFrame holding ``id`` plus one cleaned column per entry of
        `column_names`, keyed by the slugified name. A column whose cleaning
        raises is reported on stdout and left out of the result (best-effort).
    """
    result = pd.DataFrame()
    result["id"] = input["id"]
    for col in column_names:
        try:
            sl = slugify(col)
            allowed = []
            allowed_file = f"{interim_dir}/{sl}.csv"
            if os.path.isfile(allowed_file):
                print(f"reading {allowed_file}")
                allowed = pd.read_csv(filepath_or_buffer=allowed_file)
            result[sl] = input[col].apply(lambda x: cleanup(x, col, allowed=allowed))
        except Exception as err:
            # Deliberately best-effort: one bad column must not abort the run.
            # Name the column so a missing output column can be diagnosed.
            print(f"error occurred while cleaning column '{col}':", err)
    return result


def create_reports(input, column_names):
    """Write a value-count report for each requested column.

    For every name in `column_names`, the unsorted value counts of
    ``input[name]`` are saved to ``../reports/<name>.csv``. A failure on one
    column is printed and the remaining columns are still processed.

    Args:
        input: DataFrame to summarise.
        column_names: Iterable of column names to report on.
    """
    for column in column_names:
        try:
            input[column].value_counts(sort=False).to_csv(f"../reports/{column}.csv")
        except Exception as exc:
            print("error occured while stats", exc)


def clean_all_files(raw_dir="../data/raw", processed_dir="../data/processed"):
    """Clean every CSV file in `raw_dir` and write the result to `processed_dir`.

    Each ``*.csv`` file is loaded with the columns listed in `allowed_cols`,
    cleaned with `clean_data`, summarised with `create_reports`, and saved
    under the same file name in `processed_dir`. Progress is printed per file.

    Args:
        raw_dir: Directory containing the raw CSV files. Defaults to the
            previously hard-coded location.
        processed_dir: Directory receiving the cleaned CSV files. Defaults to
            the sibling ``processed`` directory that the former
            ``"/raw/" -> "/processed/"`` path rewrite pointed at.
    """
    processed_root = os.path.abspath(processed_dir)

    for f in list_files(raw_dir):
        # Check the extension itself: a substring test on the absolute path
        # also accepts e.g. "x.csv.bak" or anything below a "*.csv*" directory.
        if not f.endswith(".csv"):
            continue
        print(f"processing '{f}' ")
        df = pd.read_csv(
            f, usecols=allowed_cols, dtype={"Depression": str, "CGPA": str, "Sleep Duration": str}
        )
        column_names = df.keys().to_list()
        # `clean_data` copies "id" itself, so it must not be cleaned as a column.
        column_names.remove("id")
        result = clean_data(df, column_names)
        create_reports(result, column_names=result.columns)
        # Build the target path explicitly: str.replace("/raw/", ...) rewrites
        # every "/raw/" in the path and is a no-op with non-POSIX separators,
        # in which case the raw file itself would be overwritten.
        result.to_csv(os.path.join(processed_root, os.path.basename(f)))
        print(f"Completed '{f}' ")

clean_all_files()

processing '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/test.csv' 
reading ../data/interim/city.csv
reading ../data/interim/profession.csv
reading ../data/interim/dietary-habits.csv
reading ../data/interim/degree.csv
Completed '/Users/ceegees/workspace/iisc-learning/are-you-depressed/data/raw/test.csv' 
