In [None]:
import pandas as pd
import numpy as np
import ast
from functools import reduce
from sklearn.ensemble import IsolationForest
import json

In [None]:
'''
Given the source/checkin_checkout_history_updated.csv file, identify the users and the corresponding weeks in which each user has been acting differently/unusually.
steps:
1. process the data to group by (user, week) and get {total visits, total length of visits, total calories burnt}
    - can use map reduce to do this, or pandas <(user, week), (total visits, total length of visits, total calories burnt)>
2. run isolation forest on the data to identify the outliers for anomaly detection
'''

In [None]:
# Load the raw check-in/check-out log and derive the per-session columns used below.
df = pd.read_csv('../data/_raw/checkin_checkout_history_updated.csv')
df['checkin_time'] = pd.to_datetime(df['checkin_time'])
df['checkout_time'] = pd.to_datetime(df['checkout_time'])
df['session_seconds'] = (df['checkout_time'] - df['checkin_time']).dt.total_seconds()
df['week'] = df['checkin_time'].dt.isocalendar().week
# ISO-8601 assigns the first days of January to the last week of the previous year,
# which can be week 52 *or* week 53. Fold both into week 0 so they sort before week 1.
df.loc[(df['checkin_time'].dt.month == 1) & (df['week'] >= 52), 'week'] = 0
# NOTE(review): late-December dates can likewise land in ISO week 1 of the next year;
# that case is not handled here - confirm whether the data contains such rows.
# Drop the columns the weekly aggregation does not need (new frame instead of inplace).
df = df.drop(columns=['gym_id', 'checkin_time', 'checkout_time', 'workout_type'])
# lets us know the most recent week we have data from, useful for computing average weekly activity
CONSTANT_LAST_WEEK = df['week'].max()
CONSTANT_FIRST_WEEK = df['week'].min()
df

In [None]:
# The next 2 functions are the basic map-reduce to transform the data into grouping by user/week
# and counting how many times they visited in a week + total time + total calories burned

# map each entry to key=id+week and value=numVisits,length,calories
def mapFunc_groupByUserWeek(row):
    """Map one (index, record) item to a ("<user_id>-<week>", (1, seconds, calories)) pair.

    `row` is a 2-tuple as yielded by `DataFrame.iterrows()`; the index is ignored.
    The record must expose 'user_id', 'calories_burned', 'session_seconds' and 'week'.
    The leading 1 in the value counts this record as a single visit.
    """
    _index, record = row
    uid = record['user_id']
    burned = int(record['calories_burned'])
    duration = int(record['session_seconds'])
    week_no = int(record['week'])
    composite_key = "-".join((str(uid), str(week_no)))
    return (composite_key, (1, duration, burned))

# reduce by summing, to calculate users gym activity by week
def reduceFunc_groupByUserWeek(acc, pair):
    """Fold one (key, (visits, seconds, calories)) pair into the running totals in `acc`.

    A key seen for the first time is stored with its three values cast to int;
    a key already present has the incoming values added element-wise.
    `acc` is mutated and returned so the function can be used with `reduce`.
    """
    key, stats = pair
    if key not in acc:
        # first sighting of this user/week: normalise the three counters to ints
        acc[key] = (int(stats[0]), int(stats[1]), int(stats[2]))
        return acc
    acc[key] = tuple(previous + incoming for previous, incoming in zip(acc[key], stats))
    return acc


# if a user doesn't go in a whole week, we still need an empty entry for that week for our anomaly detection
# so I'm going to find the first week each user started going to the gym so we can add every week after

# map each entry to key=id and value=week
def mapFunc_addEmptyWeeks(row):
    """Map one (index, record) item to a (user_id, week) pair, with week cast to int."""
    _index, record = row
    return (record['user_id'], int(record['week']))

# reduce by taking the first week they started going to the gym
def reduceFunc_findFirstWeek(acc, pair):
    """Fold a (user, week) pair into `acc`, keeping the smallest week seen per user.

    `acc` is mutated and returned so the function can be used with `reduce`.
    """
    # TODO: find first and last week; make the values a tuple of (first, last)
    user, week = pair
    acc[user] = min(acc[user], week) if user in acc else week
    return acc

def reduceFunc_findWeekRange(acc, pair):
    """Fold a (user, week) pair into `acc`, tracking (lowest week, highest week) per user.

    `acc` is mutated and returned so the function can be used with `reduce`.
    """
    user, week = pair
    if user not in acc:
        # a single observation is both the lower and the upper bound
        acc[user] = (week, week)
        return acc
    lowest, highest = acc[user][0], acc[user][1]
    acc[user] = (min(lowest, week), max(highest, week))
    return acc

# this reduce function will add all the empty weeks 
# and the output will be the starting dictionary inputted to reduceFunc_groupByUserWeek
def reduceFunc_addEmptyWeeks(acc, pair):
    """Insert a zeroed (0, 0, 0) entry for every week in a user's inclusive week range.

    Parameters
    ----------
    acc : dict
        Accumulator keyed by "<user>-<week>"; mutated and returned for use with `reduce`.
    pair : tuple
        (user, (first_week, last_week)).

    Returns
    -------
    dict
        `acc` with one (0, 0, 0) entry per week from first_week to last_week inclusive.
    """
    user, week_range = pair
    bot, top = week_range
    # str() so non-string ids (e.g. ints) build the same "<user>-<week>" key
    # that mapFunc_groupByUserWeek produces, instead of raising TypeError.
    user_key = str(user)
    for week in range(int(bot), int(top) + 1):
        acc[user_key + '-' + str(week)] = (0, 0, 0)
    return acc

In [None]:
# Build the zero-filled scaffold first: one (0, 0, 0) entry for every week between
# each user's first and last observed week.
mappedData_addEmptyWeeks = [mapFunc_addEmptyWeeks(row) for row in df.iterrows()]

userlyWeekRange = reduce(reduceFunc_findWeekRange, mappedData_addEmptyWeeks, {})
emptyWeeks = reduce(reduceFunc_addEmptyWeeks, userlyWeekRange.items(), {})


In [None]:
# Add the real activity on top of the zero-filled "empty weeks" scaffold built above.
mappedData_groupByUserWeek = list(map(mapFunc_groupByUserWeek, df.iterrows()))

# Seed the reduce with a *copy* of emptyWeeks: the reducer mutates its accumulator,
# so passing emptyWeeks itself would make every re-run of this cell add the visits
# again on top of the previous totals. The values are tuples, so a shallow copy is enough.
weekly_userly_data = reduce(reduceFunc_groupByUserWeek, mappedData_groupByUserWeek, dict(emptyWeeks))
weekly_userly_data_dict = dict(weekly_userly_data)

In [None]:
# csv_df = pd.DataFrame.from_dict(reduced_data, orient='index', columns=['total_sessions', 'total_session_seconds', 'total_calories'])
# # csv_df = csv_df.reset_index()

# csv_df.to_csv('processed_data.csv', index=True)
# csv_df

# storing both dict version and other version NOTE: do not need to store in files as we can just use the variables
# with open("groupByUserWeek.json", "w") as file:
#     json.dump(weekly_userly_data, file)

# with open("groupByUserWeek_dict.json", "w") as file:
#     json.dump(weekly_userly_data_dict, file)

In [None]:
# Flatten the "<user_id>-<week>" keyed dict into a DataFrame indexed by (user_id, week),
# ready for the isolation forest.

# rsplit on the LAST '-' only: the week is always the final component of the key,
# so user ids that themselves contain '-' are kept intact.
transformed_data = [
    (*key.rsplit('-', 1), *values) for key, values in weekly_userly_data.items()
]

cleaned_dataframe = pd.DataFrame(
    transformed_data,
    columns=['user_id', 'week', 'total_sessions', 'total_session_seconds', 'total_calories'],
).set_index(['user_id', 'week'])

cleaned_dataframe

In [None]:
# Fit the isolation forest on the three weekly totals and attach its per-row label
# as an 'anomaly' column on a copy, leaving cleaned_dataframe itself untouched.
# (Per the scikit-learn docs, fit_predict labels outliers -1 and inliers 1.)
model = IsolationForest(contamination=0.05, random_state=42)
cleaned_dataframe_with_anomoly = cleaned_dataframe.assign(
    anomaly=model.fit_predict(cleaned_dataframe)
)

In [None]:
# after performing anomaly detection we want to see how relevant each anomaly is
# going to use a modified TF.IDF where each document is a user's list of anomaly weeks
# and each word is a week, but instead of counting how often the week appears, we're going to use distance from the mean
# this will allow us to see how extreme an anomaly is, and whether it's common among all users
# perhaps everyone's gym usage went down because there was a holiday

def mapFunc_findUserMean(row):
    """Re-key one ("<user>-<week>", (sessions, seconds, calories)) item by user.

    Parameters
    ----------
    row : tuple
        (key, value) where key is "<user>-<week>" and value has at least three elements.

    Returns
    -------
    tuple
        (user, (week, sessions, seconds, calories)) with week cast to int.
    """
    key_old, value_old = row
    # Split on the LAST '-' only: the week is the final component, so user ids that
    # contain '-' themselves no longer break the two-way unpacking.
    user, week = key_old.rsplit('-', 1)
    value_new = (int(week), value_old[0], value_old[1], value_old[2])
    return (user, value_new)

def reduceFunc_sumStats_minWeek(acc, pair):
    """Fold one (user, (week, sessions, seconds, calories)) pair into per-user totals.

    Each accumulator value is a 5-tuple
    (first_week, last_week, total_sessions, total_seconds, total_calories).

    Parameters
    ----------
    acc : dict
        Accumulator keyed by user; mutated and returned for use with `reduce`.
    pair : tuple
        (user, (week, sessions, seconds, calories)).

    Returns
    -------
    dict
        `acc` updated with this pair.
    """
    key, value = pair
    if key in acc:
        first_week, last_week, total_sessions, total_seconds, total_calories = acc[key]
        acc[key] = (
            min(first_week, value[0]),
            # compare against the running max week (slot 1), not the calorie total (slot 4)
            max(last_week, value[0]),
            total_sessions + value[1],
            total_seconds + value[2],
            total_calories + value[3],
        )
    else:
        # first observation: the week is both the min and the max so far
        acc[key] = (value[0], value[0], value[1], value[2], value[3])

    return acc
        
def reduceFunc_averageStats(acc, pair):
    """Turn one user's totals into per-week averages and store them in `acc`.

    `pair` is (user, (first_week, last_week, sessions, seconds, calories)).
    The divisor is the inclusive week span last_week - first_week + 1.
    `acc` is mutated and returned so the function can be used with `reduce`.
    """
    user, totals = pair
    week_span = totals[1] - totals[0] + 1
    acc[user] = tuple(total / week_span for total in (totals[2], totals[3], totals[4]))
    return acc


In [None]:
# Re-key the weekly entries by user, total them per user, then divide by each
# user's week span to get per-week averages.
mappedData_findUserMean = [
    mapFunc_findUserMean(item) for item in weekly_userly_data_dict.items()
]

weekly_userly_total = reduce(reduceFunc_sumStats_minWeek, mappedData_findUserMean, {})
weekly_userly_mean = reduce(reduceFunc_averageStats, weekly_userly_total.items(), {})

weekly_userly_mean_dict = dict(weekly_userly_mean)

#print(weekly_userly_mean_dict)

In [None]:
# Persist the per-user weekly averages; both variables are written to their own file.
for target_name, payload in (
    ("averageByUser.json", weekly_userly_mean),
    ("averageByUser_dict.json", weekly_userly_mean_dict),
):
    with open(target_name, "w") as file:
        json.dump(payload, file)


In [None]:
# Okay for TF.IDF the TF part is supposed to be a number between 0-1 but distance from mean might not be
# So I think I'm going to find the max distance from the minimum box containing the data and compute the 
# max distance from any vertex to mean and divide the actual distance by that

# then we need to actually compute IDF for each week 
# which doesn't need to be repeated for each user since it's always the same

# then we need to compute TF' for each user/week
    