In [None]:
import pandas as pd
import numpy as np
import ast
from functools import reduce
from sklearn.ensemble import IsolationForest
import json

In [14]:
'''
given the source/checkin_checkout_history_updated.csv file, identify the users and their corresponding weeks in which they have been acting differently/unusually
steps:
1. process the data to group by (user, week) and get {total visits, total length of visits, total calories burnt}
    - can use map reduce to do this, or pandas <(user, week), (total visits, total length of visits, total calories burnt)>
2. run Isolation Forest on the data to identify the outliers for anomaly detection
'''

'\ngiven the source/checkin_checkout_history_updated.csv file, identify the users and its corresponding weeks where it has been acting differently/unusual\nsteps:\n1. process the data to group by (user, week) and get {total visits, total length of visits, total calories burnt}\n    - can use map reduce to do this, or pandas <(user, week), (total visits, total length of visits, total calories burnt)>\n2. run isoloation forest on the data to identify the outliers for anomoly detection\n'

In [None]:
# Load the raw check-in/check-out log and derive the per-session features
# (session length in seconds, ISO week number) used by the cells below.
df = pd.read_csv('../source/checkin_checkout_history_updated.csv')
df['checkin_time'] = pd.to_datetime(df['checkin_time'])
df['checkout_time'] = pd.to_datetime(df['checkout_time'])
df['session_seconds'] = (df['checkout_time'] - df['checkin_time']).dt.total_seconds()
df['week'] = df['checkin_time'].dt.isocalendar().week
# ISO numbering assigns the first days of January to the last week of the
# previous ISO year, which can be week 52 *or* week 53; fold both into week 0
# so they sort before week 1 instead of looking like the end of the year.
df.loc[(df['checkin_time'].dt.month == 1) & (df['week'] >= 52), 'week'] = 0
# reassign rather than drop(inplace=True) so the step reads as a plain transformation
df = df.drop(columns=['gym_id', 'checkin_time', 'checkout_time', 'workout_type'])
# lets us know the most recent week we have data from, useful for computing average weekly activity
# (cast to a plain int: the week column is an unsigned dtype, and later cells subtract from this value)
CONSTANT_LAST_WEEK = int(df['week'].max())
df

Unnamed: 0,user_id,calories_burned,session_seconds,week
0,3291,462,2340.0,36
1,1944,1278,9360.0,15
2,958,858,5100.0,23
3,811,1134,10200.0,21
4,4923,1049,6120.0,8
...,...,...,...,...
299995,3995,288,2640.0,31
299996,206,1935,10200.0,26
299997,4983,1312,4380.0,14
299998,1028,787,3420.0,9


In [88]:
# fixed: same load as above, but user ids like "<prefix>_<n>" are reduced to the integer n
df = (
    pd.read_csv('../source/checkin_checkout_history_updated.csv')
    .assign(
        # keep only the numeric part that follows the first underscore
        user_id=lambda frame: frame['user_id'].str.split('_').str[1].astype(int),
        checkin_time=lambda frame: pd.to_datetime(frame['checkin_time']),
        checkout_time=lambda frame: pd.to_datetime(frame['checkout_time']),
        # later keyword arguments see the columns converted just above
        session_seconds=lambda frame: (frame['checkout_time'] - frame['checkin_time']).dt.total_seconds(),
        week=lambda frame: frame['checkin_time'].dt.isocalendar().week,
    )
    .drop(columns=['gym_id', 'checkin_time', 'checkout_time', 'workout_type'])
)
df

Unnamed: 0,user_id,calories_burned,session_seconds,week
0,3291,462,2340.0,36
1,1944,1278,9360.0,15
2,958,858,5100.0,23
3,811,1134,10200.0,21
4,4923,1049,6120.0,8
...,...,...,...,...
299995,3995,288,2640.0,31
299996,206,1935,10200.0,26
299997,4983,1312,4380.0,14
299998,1028,787,3420.0,9


In [None]:
# The next 2 functions are the basic map-reduce to transform the data into grouping by user/week
# and counting how many times they visited in a week + total time + total calories burned

# map each entry to key=id+week and value=numVisits,length,calories
def mapFunc_groupByUserWeek(row):
    """Map one check-in record to a ``(key, value)`` pair for the weekly roll-up.

    Args:
        row: An ``(index, record)`` pair as yielded by ``DataFrame.iterrows()``.
            ``record`` must support lookup of ``'user_id'``, ``'calories_burned'``,
            ``'session_seconds'`` and ``'week'``.

    Returns:
        ``("<user_id>,<week>", (1, seconds, calories))``. The leading ``1`` counts
        this single visit; seconds and calories are truncated to ``int``.
    """
    _, record = row
    visit = (1, int(record['session_seconds']), int(record['calories_burned']))
    # The separator has to be ',' so these keys line up with the zero-filled
    # entries created by reduceFunc_addEmptyWeeks and can be split again by
    # mapFunc_findUserMean; a different separator yields keys that never merge.
    key = ",".join((str(record['user_id']), str(int(record['week']))))
    return (key, visit)

# reduce by summing, to calculate users gym activity by week
def reduceFunc_groupByUserWeek(acc, pair):
    """Fold one ``(key, (visits, seconds, calories))`` pair into the running totals.

    ``acc`` is updated in place and also returned, so the function can be used
    directly as the callable passed to ``functools.reduce``.
    """
    key, value = pair
    if key not in acc:
        # first record seen for this key: store the triple, coerced to ints
        acc[key] = (int(value[0]), int(value[1]), int(value[2]))
        return acc
    # key already present: add the new triple element-wise onto the stored one
    acc[key] = tuple(stored + incoming for stored, incoming in zip(acc[key], value))
    return acc


# if a user doesn't go in a whole week, we still need an empty entry for that week for our anomaly detection
# so I'm going to find the first week each user started going to the gym so we can add every week after

# map each entry to key=id and value=week
def mapFunc_addEmptyWeeks(row):
    """Turn an ``(index, record)`` pair into ``(user_id, week)`` with the week as an int."""
    (_index, record) = row
    return record['user_id'], int(record['week'])

# reduce by taking the first week they started going to the gym
def reduceFunc_findFirstWeek(acc, pair):
    """Keep, per user, the smallest week number seen so far.

    ``acc`` maps user -> earliest week; it is updated in place and returned.
    """
    user, week = pair
    acc[user] = min(acc[user], week) if user in acc else week
    return acc

# this reduce function will add all the empty weeks 
# and the output will be the starting dictionary inputted to reduceFunc_groupByUserWeek
def reduceFunc_addEmptyWeeks(acc, pair, last_week=None):
    """Insert a zeroed ``(visits, seconds, calories)`` entry for every week of a user.

    Args:
        acc: Dict being built; keys are ``"<user>,<week>"`` strings. Updated in
            place. Any existing entry for a generated key is overwritten.
        pair: ``(user, firstWeek)`` - the user id and the first week to fill.
        last_week: Last week (inclusive) to fill. Defaults to the notebook-level
            ``CONSTANT_LAST_WEEK`` so the function still works as a two-argument
            callable for ``functools.reduce``.

    Returns:
        ``acc``. Nothing is added when ``firstWeek`` is greater than ``last_week``.
    """
    user, firstWeek = pair
    if last_week is None:
        last_week = CONSTANT_LAST_WEEK
    # user ids may be ints (after the numeric conversion of user_id) or strings;
    # str() keeps the key construction from raising TypeError on int + str
    user_label = str(user)
    for week in range(int(firstWeek), int(last_week) + 1):
        acc[user_label + ',' + str(week)] = (0, 0, 0)
    return acc

In [None]:
# Build the zero-filled (user, week) skeleton first:
#   1. map every row to (user_id, week)
#   2. reduce to each user's first week
#   3. expand each user into one empty entry per week from that first week onwards
mappedData_addEmptyWeeks = [mapFunc_addEmptyWeeks(row) for row in df.iterrows()]

userlyFirstWeek = reduce(reduceFunc_findFirstWeek, mappedData_addEmptyWeeks, dict())
emptyWeeks = reduce(reduceFunc_addEmptyWeeks, userlyFirstWeek.items(), dict())

In [None]:
# now adding all the actual information to the "empty weeks" dict returned from above cell
mappedData_groupByUserWeek = list(map(mapFunc_groupByUserWeek, df.iterrows()))

# reduceFunc_groupByUserWeek mutates its accumulator, so seed the reduce with a
# *copy* of emptyWeeks: otherwise emptyWeeks itself collects the totals and
# re-running this cell adds every visit a second time.
weekly_userly_data = reduce(reduceFunc_groupByUserWeek, mappedData_groupByUserWeek, dict(emptyWeeks))
weekly_userly_data_dict = dict(weekly_userly_data)

In [None]:
# csv_df = pd.DataFrame.from_dict(reduced_data, orient='index', columns=['total_sessions', 'total_session_seconds', 'total_calories'])
# # csv_df = csv_df.reset_index()

# csv_df.to_csv('processed_data.csv', index=True)
# csv_df

# write the weekly roll-up to disk twice: the object returned by reduce()
# and the dict() copy made from it
for out_path, payload in (
    ("groupByUserWeek.json", weekly_userly_data),
    ("groupByUserWeek_dict.json", weekly_userly_data_dict),
):
    with open(out_path, "w") as file:
        json.dump(payload, file)

Unnamed: 0,total_sessions,total_session_seconds,total_calories
3291-36,2,8520,1312
1944-15,3,18000,2725
958-23,3,22020,2194
811-21,1,10200,1134
4923-8,2,9900,2799
...,...,...,...
3421-4,1,9720,555
2864-21,1,5580,292
2131-11,1,7200,1066
997-34,1,4560,421


In [None]:
# bruh = map(rowToTuple, df.iterrows())
# lol = reduce(reduceTuple, bruh, {})

In [None]:
# one row per (user_id, week): summed calories, number of sessions, summed session length
aggregated = df.groupby(by=['user_id', 'week']).agg(
    **{
        'total_calories_burned': pd.NamedAgg(column='calories_burned', aggfunc='sum'),
        'total_sessions': pd.NamedAgg(column='user_id', aggfunc='count'),
        'total_session_seconds': pd.NamedAgg(column='session_seconds', aggfunc='sum'),
    }
)
aggregated.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_calories_burned,total_sessions,total_session_seconds
user_id,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user_1,1,3816,4,26040.0
user_1,2,975,1,6840.0
user_1,3,1715,3,21360.0
user_1,5,403,1,3060.0
user_1,6,1243,1,5580.0
...,...,...,...,...
user_1000,6,764,1,6960.0
user_1000,9,913,2,10440.0
user_1000,11,653,1,3180.0
user_1000,14,2464,3,26520.0


In [None]:
# Fit only on the three activity features. Passing the whole frame makes the
# cell non-idempotent: on a re-run the previously written 'anomaly' column
# would be fed back into the model as a fourth feature.
anomaly_features = ['total_calories_burned', 'total_sessions', 'total_session_seconds']
model = IsolationForest(contamination=0.05, random_state=42)
# fit_predict labels each (user, week) row: -1 = outlier, 1 = inlier
aggregated['anomaly'] = model.fit_predict(aggregated[anomaly_features])

Unnamed: 0_level_0,Unnamed: 1_level_0,total_calories_burned,total_sessions,total_session_seconds,anomaly
user_id,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
user_1,11,5769,6,47760.0,-1
user_1,20,5191,5,40680.0,-1
user_1,21,5463,4,21780.0,-1
user_1,32,3325,5,42960.0,-1
user_10,9,5182,4,23460.0,-1
...,...,...,...,...,...
user_994,37,5376,4,24000.0,-1
user_995,8,2948,5,24360.0,-1
user_995,13,3444,4,33180.0,-1
user_995,35,2971,4,35220.0,-1


In [None]:


# Stand-alone pandas version of the weekly anomaly detection.
# It works from the columns the loading cell leaves on `df`
# (user_id, week, session_seconds, calories_burned). checkin_time/checkout_time
# are dropped there and no 'user' / 'calories_burnt' columns exist, so the visit
# length is derived from session_seconds and the real column names are used.

# Step 1: Group by user and week (df itself is left untouched)
weekly_stats = (
    df.assign(visit_length=df['session_seconds'] / 60)  # in minutes
    .groupby(['user_id', 'week'])
    .agg(
        total_visits=('visit_length', 'size'),  # rows per (user, week)
        total_length_of_visits=('visit_length', 'sum'),
        total_calories_burnt=('calories_burned', 'sum'),
    )
    .reset_index()
)

# Step 2: Apply Isolation Forest for anomaly detection
X = weekly_stats[['total_visits', 'total_length_of_visits', 'total_calories_burnt']]

# Initialize IsolationForest model
model = IsolationForest(contamination=0.05, random_state=42)
weekly_stats['anomaly'] = model.fit_predict(X)

# Label outliers (anomalies) as 'unusual' when the model assigns -1
weekly_stats['is_unusual'] = weekly_stats['anomaly'] == -1

# Filter unusual activity
unusual_activity = weekly_stats[weekly_stats['is_unusual']]

# Output the results
print("Unusual activities detected:")
print(unusual_activity[['user_id', 'week', 'total_visits', 'total_length_of_visits', 'total_calories_burnt']])


ModuleNotFoundError: No module named 'sklearn'

In [None]:
# after performing anomaly detection we want to see how relevant each anomaly is
# going to use a modified TF-IDF where each document is a user's list of anomaly weeks
# and each word is a week, but instead of how often the week appears, we're going to use its distance from the mean
# this will allow us to see how extreme an anomaly is, and whether it's common among all users
# (perhaps everyone's gym usage went down because there was a holiday)

def mapFunc_findUserMean(row):
    """Re-key a ``("<user>,<week>", (visits, seconds, calories))`` item by user.

    Returns ``(user, (week, visits, seconds, calories))`` with the week parsed
    to an int; ``user`` stays the string taken from the key.
    """
    composite_key, totals = row
    user, week = composite_key.split(',')
    return user, (int(week), totals[0], totals[1], totals[2])

def reduceFunc_sumStats_minWeek(acc, pair):
    """Accumulate per-user totals while tracking the earliest week.

    ``pair`` is ``(user, (week, visits, seconds, calories))``. For each user,
    ``acc`` holds ``(min week, summed visits, summed seconds, summed calories)``.
    ``acc`` is updated in place and returned.
    """
    user, stats = pair
    if user not in acc:
        # first entry for this user: store its four fields as they are
        acc[user] = (stats[0], stats[1], stats[2], stats[3])
        return acc

    seen = acc[user]
    # position 0 is the week (keep the minimum); positions 1-3 are running sums
    earliest = min(seen[0], stats[0])
    acc[user] = (earliest,) + tuple(seen[i] + stats[i] for i in (1, 2, 3))
    return acc
        
def reduceFunc_averageStats(acc, pair, last_week=None):
    """Convert a user's totals into per-week averages.

    Args:
        acc: Dict of user -> ``(avg visits, avg seconds, avg calories)``;
            updated in place.
        pair: ``(user, (first_week, total_visits, total_seconds, total_calories))``.
        last_week: Last week (inclusive) of the observation window. Defaults to
            the notebook-level ``CONSTANT_LAST_WEEK`` so the function still works
            as a two-argument callable for ``functools.reduce``.

    Returns:
        ``acc``.
    """
    key, value = pair
    if last_week is None:
        last_week = CONSTANT_LAST_WEEK
    # Work in plain ints: the week values may be unsigned numpy scalars, where a
    # subtraction below zero would wrap around instead of going negative.
    # Clamp to one week so a first week later than last_week can neither divide
    # by zero nor flip the sign of the averages.
    numWeeks = max(int(last_week) - int(value[0]) + 1, 1)
    acc[key] = (value[1] / numWeeks, value[2] / numWeeks, value[3] / numWeeks)
    return acc


In [None]:
# re-key every (user, week) entry by user, total the stats per user,
# then divide by the number of weeks since that user's first week
mappedData_findUserMean = [mapFunc_findUserMean(item) for item in weekly_userly_data_dict.items()]

weekly_userly_total = reduce(reduceFunc_sumStats_minWeek, mappedData_findUserMean, dict())
weekly_userly_mean = reduce(reduceFunc_averageStats, weekly_userly_total.items(), dict())

weekly_userly_mean_dict = dict(weekly_userly_mean)

In [None]:
# write the per-user averages to disk twice: the object returned by reduce()
# and the dict() copy made from it
for out_path, payload in (
    ("averageByUser.json", weekly_userly_mean),
    ("averageByUser_dict.json", weekly_userly_mean_dict),
):
    with open(out_path, "w") as file:
        json.dump(payload, file)


In [None]:
# Okay, for TF-IDF the TF part is supposed to be a number between 0 and 1, but distance from the mean might not be.
# So I think I'm going to take the minimum bounding box containing the data, compute the
# max distance from any of its vertices to the mean, and divide the actual distance by that.

# then we need to actually compute the IDF for each week,
# which doesn't need to be repeated for each user since it's always the same

# then we need to compute TF' for each user/week
    