# Assignment1

## TASK 1: DATA PREPARATION

### TASK 1A: EXPLORATORY DATA ANALYSIS

#### 1. Library imports and global variable definitions

In [None]:
# Mount Google Drive when running inside Google Colab. Outside Colab the
# google.colab package is not installed, so skip the mount instead of
# aborting the notebook with ModuleNotFoundError.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab; there is nothing to mount
else:
    drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Raw input CSV in long format; the code below reads its 'id', 'time',
# 'variable' and 'value' columns.
file_path = 'dataset/dataset_mood_smartphone.csv'
# Destination for the reshaped table (one column per variable, keyed by id and time).
save_path = 'dataset/dataset_mood_smartphone_reshape.csv'

In [None]:
# Columns drawn against mood by plot_scatter and included (together with
# 'mood') in the correlation matrix of plot_heatmap. Commented-out entries
# are currently excluded from both.
continuous_variables = [
    'circumplex.arousal',
    'circumplex.valence',
    'activity',
    # 'screen',
    # 'appCat.builtin',
    # 'appCat.communication',
    'appCat.entertainment',
    # 'appCat.finance',
    # 'appCat.game',
    # 'appCat.office',
    # 'appCat.other',
    'appCat.social',
    # 'appCat.travel',
    # 'appCat.unknown',
    'appCat.utilities',
    'appCat.weather'
]

# Columns used as the x-axis grouping of the mood boxplots in plot_boxplot.
categorical_variables = ['call', 'sms']

#### 2. Reading and Preparing the Data

In [None]:
def read_csv(file_path):
    """Load the CSV at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(file_path)

def time_to_datetime(data):
    """Truncate the 'time' column of ``data`` to calendar dates.

    The column is parsed with ``pd.to_datetime`` and then overwritten, in
    place, with the matching ``datetime.date`` objects.

    Returns:
        The same ``data`` object that was passed in (now modified).
    """
    parsed = pd.to_datetime(data['time'])
    data['time'] = parsed.dt.date
    return data

def reshape_data_with_id(data):
    """Pivot long-format data into one row per ('id', 'time') pair.

    Every distinct entry of 'variable' becomes its own column filled from
    'value'; duplicate entries are combined by ``pivot_table``'s default
    aggregation (the mean). The 'time' column is used as given -- no date
    conversion happens here.

    Returns:
        A DataFrame with 'id' and 'time' as ordinary columns followed by one
        column per variable.
    """
    frame = pd.DataFrame(data)
    wide = frame.pivot_table(values='value', index=['id', 'time'], columns='variable')
    return wide.reset_index()

def reshape_data(data):
    """Pivot long-format data into one row per date, pooling all ids.

    The 'time' column is first truncated to dates via ``time_to_datetime``
    (which modifies ``data`` in place). Every distinct entry of 'variable'
    then becomes a column filled from 'value'; rows sharing a date are
    combined by ``pivot_table``'s default aggregation (the mean).

    Returns:
        A DataFrame with 'time' as an ordinary column followed by one column
        per variable.
    """
    dated = time_to_datetime(data)
    frame = pd.DataFrame(dated)
    wide = frame.pivot_table(values='value', index='time', columns='variable')
    return wide.reset_index()

def calculate_daily_mood_mean(data):
    """Average 'value' for each ('id', 'time', 'variable') combination.

    Despite the name, every variable is averaged, not only mood. The result
    is a per-day mean only when 'time' already holds dates.

    Returns:
        A long-format DataFrame with the columns 'id', 'time', 'variable'
        and 'value' (the group mean).
    """
    frame = pd.DataFrame(data)
    grouped = frame.groupby(['id', 'time', 'variable'])
    means = grouped['value'].mean()
    return means.reset_index()

def save_to_csv(data, file_path):
    """Write ``data`` to ``file_path`` as CSV, leaving out the index."""
    data.to_csv(path_or_buf=file_path, index=False)

In [None]:
# Load the raw data, truncate timestamps to dates, then average every
# variable per id and date.
data = calculate_daily_mood_mean(time_to_datetime(read_csv(file_path)))

# Wide table keyed by date only (ids pooled together).
data_without_id = reshape_data(data)
# Wide table keyed by id and date.
data_with_id = reshape_data_with_id(data)

# Save the processed data (the per-id table).
save_to_csv(data_with_id, save_path)

#### 3. Data Analysis and Visualization

**Defining Functions**

In [None]:
# Time Series Plot for mood over time
def plot_time_series(data, x, y):
    """Draw a line plot of column ``y`` against column ``x`` of ``data``.

    The title and axis labels are fixed to a mood-over-time chart, whichever
    columns are passed in. The figure is shown immediately.
    """
    plt.figure(figsize=(15, 5))
    sns.lineplot(x=x, y=y, data=data)

    # Fixed captions for the chart.
    plt.xlabel('Time')
    plt.ylabel('Mood Score')
    plt.title('Mood Over Time')

    # Slanted tick labels keep the x-axis readable.
    plt.xticks(rotation=45)
    plt.show()

# Bar chart for mood vs. call
def plot_bar_chart(data, x, y):
    """Draw a bar plot of column ``y`` grouped by column ``x`` of ``data``.

    The title and axis labels are fixed to a mood-versus-call chart,
    whichever columns are passed in. The figure is shown immediately.
    """
    sns.barplot(x=x, y=y, data=data)

    # Fixed captions for the chart.
    plt.xlabel('Call Made (0 = No, 1 = Yes)')
    plt.ylabel('Average Mood Score')
    plt.title('Mood and Call Made')
    plt.show()

# boxplot for classification variables
def plot_boxplot(data):
    """Draw one mood boxplot per entry of ``categorical_variables``.

    The panels sit side by side in a single row; each one groups the 'mood'
    column of ``data`` by one categorical column. The figure is shown
    immediately.

    Args:
        data: DataFrame containing 'mood' and every column named in
            ``categorical_variables``.
    """
    # squeeze=False always returns a 2-D array of axes, so the loop also
    # works when categorical_variables holds a single entry (plt.subplots
    # would otherwise return a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(
        nrows=1,
        ncols=len(categorical_variables),
        figsize=(10, 5),
        squeeze=False,
    )

    for ax, var in zip(axes[0], categorical_variables):
        sns.boxplot(x=data[var], y=data['mood'], ax=ax)
        ax.set_title(f'Mood by {var}')
        ax.set_xlabel(var)
        ax.set_ylabel('Mood')

    fig.tight_layout()
    plt.show()

# scatter plot for continuous variables
def plot_scatter(data):
    """Show, for each continuous variable, mood and that variable over time.

    One figure is produced per entry of ``continuous_variables``: 'mood' is
    drawn as a red line on the left y-axis and the variable as blue scatter
    points on a second y-axis sharing the same 'time' x-axis.
    """
    mood_color = 'tab:red'
    var_color = 'tab:blue'

    for var in continuous_variables:
        fig, mood_ax = plt.subplots()

        # Left axis: mood as a line.
        mood_ax.set_xlabel('time')
        mood_ax.set_ylabel('mood', color=mood_color)
        mood_ax.plot(data['time'], data['mood'], color=mood_color)
        mood_ax.tick_params(axis='y', labelcolor=mood_color)

        # Right axis (shared x): the current variable as scatter points.
        var_ax = mood_ax.twinx()
        var_ax.set_ylabel(var, color=var_color)
        var_ax.scatter(data['time'], data[var], color=var_color)
        var_ax.tick_params(axis='y', labelcolor=var_color)

        fig.tight_layout()
        plt.show()

# heatmap for correlation matrix
def plot_heatmap(data):
    """Show an annotated correlation heatmap of mood and the continuous variables.

    The correlation matrix is computed over the columns listed in
    ``continuous_variables`` plus 'mood'. The figure is shown immediately.

    Args:
        data: DataFrame containing 'mood' and every column named in
            ``continuous_variables``.
    """
    correlation_matrix = data[continuous_variables + ['mood']].corr()

    # Create the figure before drawing: calling plt.figure() after the
    # heatmap opens a new, empty figure, so the requested size would never
    # apply to the heatmap and a blank figure would be shown as well.
    plt.figure(figsize=(16, 8))
    sns.heatmap(correlation_matrix, annot=True)
    plt.title('Correlation Matrix')
    plt.show()

# Mood line with each continuous variable as scatter points, using the table
# reshaped by date only.
plot_scatter(data_without_id)
# Correlation heatmap computed on the table reshaped by id and date.
plot_heatmap(data_with_id)

### TASK 1B: DATA CLEANING
**In this step, we choose two approaches to impute missing values that are logical for such time series:**


1.   Approach1(TODO)
2.   Approach2(TODO)



#### 1. Splitting each day into three time periods

In [None]:
# Load the raw long-format data from the CSV file
data_resample = read_csv(file_path)

# Parse 'time' as full timestamps and extract the hour of day into 'hour'
data_resample['time'] = pd.to_datetime(data_resample['time'])
data_resample['hour'] = data_resample['time'].dt.hour

# Three left-closed 8-hour bins: [0, 8), [8, 16) and [16, 24)
bins = [0, 8, 16, 24]
labels = ['0-8', '8-16', '16-24']

# Categorize 'hour' into the three periods (right=False keeps hour 0 in '0-8')
data_resample['time_period'] = pd.cut(data_resample['hour'], bins=bins, labels=labels, right=False)

# Truncate 'time' to the calendar date now that the hour has been extracted;
# time_to_datetime modifies data_resample in place
time_to_datetime(data_resample)

# Average 'value' per id, date, time period and variable.
# 'time_period' is categorical, so observed=True restricts the result to the
# combinations that actually occur instead of expanding to every possible
# combination of the grouping keys (and avoids the pandas FutureWarning about
# the changing default).
data_grouped = (
    data_resample
    .groupby(['id', 'time', 'time_period', 'variable'], observed=True)['value']
    .mean()
    .reset_index()
)

# Reshape with pivot_table so that each 'variable' becomes a column
data = data_grouped.pivot_table(
    index=['id', 'time', 'time_period'],
    columns='variable',
    values='value',
    observed=True,
).reset_index()

# Save the reshaped data to a new CSV file
save_to_csv(data, 'dataset/dataset_3_period.csv')

#### 2. Remove the rows that contain only call and sms values

In [None]:
# Key columns identifying a row, and the two contact columns to check
key_columns = ['id', 'time', 'time_period']
contact_columns = ['sms', 'call']
columns_to_check = key_columns + contact_columns

# Every remaining column is a measurement other than sms/call
other_columns = [col for col in data.columns if col not in columns_to_check]

has_keys = data[key_columns].notna().all(axis=1)
has_contact = data[contact_columns].notna().any(axis=1)
has_other = data[other_columns].notna().any(axis=1)

# A row is dropped only when sms/call are its sole non-missing measurements.
# Previously every row with any sms/call value was dropped, which also
# discarded rows that carried other measurements.
only_contact = has_keys & has_contact & ~has_other

# Keep everything else
mask = ~only_contact
data = data[mask]

# Save the result to a new csv
data.to_csv('dataset/cleaned_data.csv', index=False)

#### 3. Observe that NA values occur only in valence and arousal.

In [None]:
# Load the raw long-format data
df = pd.read_csv('dataset/dataset_mood_smartphone.csv')
# df = pd.read_csv('Assignment1/dataset/cleaned_data.csv')

# Distinct variable names, in order of first appearance
variables = df['variable'].unique()

# One figure per variable: distribution of 'value' for every id
for var in variables:
    # rows belonging to the current variable
    subset = df.loc[df['variable'] == var]

    plt.figure(figsize=(10, 6))

    # id on the x-axis, value on the y-axis, outliers drawn
    sns.boxplot(data=subset, x='id', y='value', showfliers=True)

    # captions
    plt.xlabel('Patient ID')
    plt.ylabel('Value')
    plt.title(f'Boxplot for Variable: {var}')

    plt.show()

### TASK 1C: FEATURE ENGINEERING

In [None]:
# NOTE(review): bare literal with no effect; looks like a placeholder for the
# feature-engineering code that is not present in this cell yet -- confirm.
1111
