In [1]:
import os
# Kaggle credentials must never be committed inside a notebook. Supply them
# from outside instead: export KAGGLE_USERNAME and KAGGLE_KEY before starting
# Jupyter, or use the Kaggle API's own credential file.
# NOTE(review): an API key was previously hardcoded in this cell and has
# therefore been exposed — it should be revoked and regenerated.
_missing_kaggle_vars = [
    name for name in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.environ.get(name)
]
if _missing_kaggle_vars:
    # Only report the problem here; authentication itself happens later and
    # may still succeed through the Kaggle credential file.
    print(
        "Kaggle credentials not found in the environment (missing: "
        + ", ".join(_missing_kaggle_vars)
        + "). Set them outside the notebook or provide a Kaggle credential file."
    )

In [2]:
import pandas as pd
import niimpy
from niimpy.exploration.eda import countplot
from niimpy.preprocessing import survey
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
# Create the Kaggle API client and authenticate it. Credential lookup is done
# by the kaggle package itself (not visible in this notebook).
api = KaggleApi()
api.authenticate()

In [3]:
# Download the StudentLife dataset archive into the working directory and
# open it. The archive handle is intentionally left open: it is bound to the
# notebook-level name `archive` so other cells can read further members.
api.dataset_download_files('dartweichen/student-life', path=".")
archive = zipfile.ZipFile('student-life.zip', 'r')

# Collect one activity frame per participant. Candidate ids u00..u59 are
# probed; ids that have no activity file in the archive are skipped with an
# explicit membership test rather than a bare `except` that would also hide
# parse errors and keyboard interrupts.
archive_members = set(archive.namelist())
activity_frames = []
for user_number in range(60):
    user = f"u{user_number:02}"
    member = f"dataset/sensing/activity/activity_{user}.csv"
    if member not in archive_members:
        continue
    # Close each member handle as soon as its CSV has been parsed.
    with archive.open(member) as csvfile:
        user_activity = pd.read_csv(csvfile)
    user_activity["user"] = user
    activity_frames.append(user_activity)
activity_data = pd.concat(activity_frames)

# Copy the inference column (its header carries a leading space) into a new
# "activity" column with object dtype.
activity_data["activity"] = activity_data[" activity inference"].astype(object)
# Index the rows by the timestamp column, interpreted as seconds since epoch.
activity_data = activity_data.set_index('timestamp')
activity_data.index = pd.to_datetime(activity_data.index, unit='s')

In [4]:
# Aggregate the activity data with niimpy at a "1H" frequency (presumably
# hourly bins per user — confirm against niimpy.util.aggregate), then move the
# "user" index level back into an ordinary column.
activity_data = niimpy.util.aggregate(activity_data, "1H").reset_index("user")

In [5]:
# Load the PHQ-9 survey responses from the dataset archive; the member handle
# is closed as soon as the CSV has been parsed.
with archive.open("dataset/survey/PHQ-9.csv") as csvfile:
    survey_data = pd.read_csv(csvfile)
# Rename the participant id column so it is called "user".
survey_data = survey_data.rename(columns={'uid': 'user'})

In [6]:
# Maps each PHQ-9 question text to a short question id ("PHQ9_1".."PHQ9_9").
# The keys are compared verbatim against the survey column headers, so they
# must stay character-for-character identical to the CSV (including spelling
# and trailing punctuation).
PHQ9_MAP = {
    'Little interest or pleasure in doing things': "PHQ9_1",
    'Feeling down, depressed, hopeless.': "PHQ9_2",
    'Trouble falling or staying asleep, or sleeping too much.': "PHQ9_3",
    'Feeling tired or having little energy': "PHQ9_4",
    'Poor appetite or overeating': "PHQ9_5",
    'Feeling bad about yourself or that you are a failure or have let yourself or your family down': "PHQ9_6",
    'Trouble concentrating on things, such as reading the newspaper or watching television': "PHQ9_7",
    'Moving or speaking so slowly that other people could have noticed. Or the opposite being so figety or restless that you have been moving around a lot more than usual': "PHQ9_8",
    'Thoughts that you would be better off dead, or of hurting yourself': "PHQ9_9",
}
# Maps each textual answer option to its numeric score (0-3).
PHQ9_ANSWER_MAP = {
    "Not at all": 0,
    "Several days": 1,
    "More than half the days": 2,
    "Nearly every day": 3
}
# Keep only the survey columns whose header is a known PHQ-9 question.
selected_cols = [col for col in survey_data.columns if col in PHQ9_MAP]

# Reshape to long format: one row per (user, type, question) with the raw
# textual answer.
transformed_df = survey_data.melt(
    id_vars=['user', 'type'],
    value_vars=selected_cols,
    var_name='question',
    value_name='raw_answer',
)

# Replace the question text with its short id, then convert the textual
# answers to numbers using the PHQ9 answer map.
transformed_df['id'] = transformed_df['question'].replace(PHQ9_MAP)
transformed_df['answer'] = survey.survey_convert_to_numerical_answer(
    transformed_df,
    answer_col='raw_answer',
    question_id='id',
    id_map={"PHQ9": PHQ9_ANSWER_MAP},
    use_prefix=True,
)

# The dataframe should usually be indexed by a datetime object, but the
# "type" string also works here and indicates time in a sense.
transformed_df = transformed_df.set_index("type")
scores = survey.survey_sum_scores(transformed_df, "PHQ9")

In [7]:
def PHQ9_sum_to_group(sum):
    """Map a PHQ-9 sum score to its severity group label.

    Parameters
    ----------
    sum : number or missing value
        The PHQ-9 sum score. (The parameter name shadows the builtin
        ``sum``; it is kept for backward compatibility with callers.)

    Returns
    -------
    str or None
        "minimal" (< 5), "mild" (< 10), "moderate" (< 15),
        "moderately severe" (< 20) or "severe" (>= 20). ``None`` is
        returned when the score is missing (NaN / None / pd.NA).
    """
    # A missing score fails every `<` comparison, so without this guard it
    # would fall through to the final branch and be labelled "severe".
    if pd.isna(sum):
        return None
    if sum < 5:
        return "minimal"
    if sum < 10:
        return "mild"
    if sum < 15:
        return "moderate"
    if sum < 20:
        return "moderately severe"
    return "severe"
# Turn the index of the score table back into columns so "type" can be
# filtered on.
scores = scores.reset_index()
# Keep only the rows whose type is "pre". An explicit copy is taken so the
# "group" column below is written to an independent frame rather than to a
# slice of the unfiltered one (avoids pandas' SettingWithCopyWarning).
scores = scores[scores["type"] == "pre"].copy()
# Label every score with its PHQ-9 severity group.
scores["group"] = scores["score"].apply(PHQ9_sum_to_group)

In [8]:
# Preview the first rows of the aggregated activity data.
activity_data.head()

Unnamed: 0_level_0,user,activity inference,activity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-03-27 04:00:00,u00,0.0,0.0
2013-03-27 05:00:00,u00,0.0,0.0
2013-03-27 06:00:00,u00,0.060519,0.0
2013-03-27 07:00:00,u00,0.0,0.0
2013-03-27 08:00:00,u00,0.0,0.0


In [9]:
# Attach each user's PHQ-9 severity group to the activity rows. The inner
# join keeps only users present in both tables; the timestamp is moved to a
# column for the merge and restored as the index afterwards.
activity_data = pd.merge(
    activity_data.reset_index(),
    scores[["user", "group"]],
    on="user",
    how="inner",
).set_index("timestamp")
activity_data.head()

Unnamed: 0_level_0,user,activity inference,activity,group
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-03-27 04:00:00,u00,0.0,0.0,minimal
2013-03-27 05:00:00,u00,0.0,0.0,minimal
2013-03-27 06:00:00,u00,0.060519,0.0,minimal
2013-03-27 07:00:00,u00,0.0,0.0,minimal
2013-03-27 08:00:00,u00,0.0,0.0,minimal


In [10]:
# Draw a count plot of the " activity inference" column with niimpy's
# countplot helper.
# NOTE(review): the title says "Group level" while aggregation='user' is
# passed — confirm whether aggregation='group' was intended.
fig = countplot.countplot(
    activity_data,
    fig_title='Group level activity score distributions',
    plot_type='count',
    points='outliers',
    aggregation='user',
    user=None,
    column=' activity inference',
    binning=False,
)

fig.show()

In [12]:
# Preview the first rows of the activity data, now including the "group" column.
activity_data.head()

Unnamed: 0_level_0,user,activity inference,activity,group
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-03-27 04:00:00,u00,0.0,0.0,minimal
2013-03-27 05:00:00,u00,0.0,0.0,minimal
2013-03-27 06:00:00,u00,0.060519,0.0,minimal
2013-03-27 07:00:00,u00,0.0,0.0,minimal
2013-03-27 08:00:00,u00,0.0,0.0,minimal


In [None]:
# NOTE(review): niimpy is already imported in the import cell near the top of
# the notebook, so this re-import is redundant. This cell has no execution
# count and looks unrelated to the analysis above — confirm it belongs here.
import niimpy
# Presumably reads sleep-duration records from an mHealth-formatted JSON file;
# the path is relative, so the file must exist in the working directory.
niimpy.reading.read.read_mhealth_sleep_duration("mhealth_test.json")


{'sleep_duration': {'value': 7.5, 'unit': 'h'}, 'effective_time_frame': {'time_interval': {'start_date_time': '2016-02-05T20:35:00Z', 'end_date_time': '2016-02-06T06:35:00Z'}}}
