In [81]:
import pandas as pd
import numpy as np
import ast
from functools import reduce
from sklearn.ensemble import IsolationForest

In [14]:
'''
given the source/checkin_checkout_history_updated.csv file, identify the users and its corresponding weeks where it has been acting differently/unusual
steps:
1. process the data to group by (user, week) and get {total visits, total length of visits, total calories burnt}
    - can use map reduce to do this, or pandas <(user, week), (total visits, total length of visits, total calories burnt)>
2. run isoloation forest on the data to identify the outliers for anomoly detection
'''

'\ngiven the source/checkin_checkout_history_updated.csv file, identify the users and its corresponding weeks where it has been acting differently/unusual\nsteps:\n1. process the data to group by (user, week) and get {total visits, total length of visits, total calories burnt}\n    - can use map reduce to do this, or pandas <(user, week), (total visits, total length of visits, total calories burnt)>\n2. run isoloation forest on the data to identify the outliers for anomoly detection\n'

In [None]:
df = pd.read_csv('../source/checkin_checkout_history_updated.csv')
df['checkin_time'] = pd.to_datetime(df['checkin_time'])
df['checkout_time'] = pd.to_datetime(df['checkout_time'])
df['session_seconds'] = (df['checkout_time'] - df['checkin_time']).dt.total_seconds()
df['week'] = df['checkin_time'].dt.isocalendar().week
df.drop(['gym_id', 'checkin_time', 'checkout_time', 'workout_type'], axis=1, inplace=True)
df

Unnamed: 0,user_id,calories_burned,session_seconds,week
0,3291,462,2340.0,36
1,1944,1278,9360.0,15
2,958,858,5100.0,23
3,811,1134,10200.0,21
4,4923,1049,6120.0,8
...,...,...,...,...
299995,3995,288,2640.0,31
299996,206,1935,10200.0,26
299997,4983,1312,4380.0,14
299998,1028,787,3420.0,9


In [88]:
# fixed
df = pd.read_csv('../source/checkin_checkout_history_updated.csv')
df['user_id'] = df['user_id'].str.split('_').str[1].astype(int)
df['checkin_time'] = pd.to_datetime(df['checkin_time'])
df['checkout_time'] = pd.to_datetime(df['checkout_time'])
df['session_seconds'] = (df['checkout_time'] - df['checkin_time']).dt.total_seconds()
df['week'] = df['checkin_time'].dt.isocalendar().week
df.drop(['gym_id', 'checkin_time', 'checkout_time', 'workout_type'], axis=1, inplace=True)
df

Unnamed: 0,user_id,calories_burned,session_seconds,week
0,3291,462,2340.0,36
1,1944,1278,9360.0,15
2,958,858,5100.0,23
3,811,1134,10200.0,21
4,4923,1049,6120.0,8
...,...,...,...,...
299995,3995,288,2640.0,31
299996,206,1935,10200.0,26
299997,4983,1312,4380.0,14
299998,1028,787,3420.0,9


In [None]:
def rowToTuple(row):
    _, row_data = row
    user_id = row_data['user_id']
    calories = row_data['calories_burned']
    seconds = row_data['session_seconds']
    week = row_data['week']
    return (str(user_id) + "-" + str(week), (1, seconds, calories))

def reduceTuple(accumulator, pair):
    key, value = pair
    if key in accumulator:
        accumulator[key] = tuple(map(lambda x, y: x + y, accumulator[key], value))
        #accumulator[key] = (accumulator[key][0] + int(value[0]), accumulator[key][1] + int(value[1]), accumulator[key][2] + int(value[2]))
    else:
        accumulator[key] = (int(value[0]), int(value[1]), int(value[2]))
    return accumulator   


In [106]:
mapped_data = map(rowToTuple, df.iterrows())
reduced_data = reduce(reduceTuple, mapped_data, {})
reduced_data

{'3291-36': (2, 8520, 1312),
 '1944-15': (3, 18000, 2725),
 '958-23': (3, 22020, 2194),
 '811-21': (1, 10200, 1134),
 '4923-8': (2, 9900, 2799),
 '1534-5': (3, 22380, 3256),
 '4431-23': (3, 14940, 3635),
 '410-12': (1, 7140, 751),
 '54-14': (2, 13740, 1289),
 '4797-7': (3, 18840, 2776),
 '1099-37': (2, 11040, 907),
 '2397-30': (1, 7320, 454),
 '975-18': (4, 29340, 3977),
 '4489-14': (3, 15000, 2039),
 '4650-18': (2, 8220, 2082),
 '2895-6': (1, 6360, 737),
 '3280-19': (1, 8160, 556),
 '4922-22': (2, 9660, 2646),
 '4840-12': (4, 22200, 4037),
 '2225-14': (3, 21780, 2105),
 '2559-8': (1, 9660, 730),
 '3691-41': (1, 9600, 1636),
 '139-4': (2, 7620, 1017),
 '3174-5': (2, 18420, 2364),
 '2521-28': (1, 9300, 1192),
 '4638-24': (5, 39780, 3981),
 '2242-3': (3, 15180, 3323),
 '3828-6': (4, 18120, 3292),
 '1338-32': (2, 18840, 660),
 '4090-18': (5, 25680, 3829),
 '2853-18': (2, 15300, 1197),
 '2295-29': (2, 15660, 1402),
 '2678-34': (2, 6540, 1516),
 '73-3': (1, 2100, 667),
 '2879-14': (2, 9480,

In [107]:
csv_df = pd.DataFrame.from_dict(reduced_data, orient='index', columns=['total_sessions', 'total_session_seconds', 'total_calories'])
# csv_df = csv_df.reset_index()

csv_df.to_csv('processed_data.csv', index=True)
csv_df

Unnamed: 0,total_sessions,total_session_seconds,total_calories
3291-36,2,8520,1312
1944-15,3,18000,2725
958-23,3,22020,2194
811-21,1,10200,1134
4923-8,2,9900,2799
...,...,...,...
3421-4,1,9720,555
2864-21,1,5580,292
2131-11,1,7200,1066
997-34,1,4560,421


In [57]:
bruh = map(rowToTuple, df.iterrows())
# lol = reduce(reduceTuple, bruh, {})

In [None]:
# user_id, week, total_visits, total_length, total_calories
aggregated = df.groupby(['user_id', 'week']).agg(
    total_calories_burned=('calories_burned', 'sum'),
    total_sessions=('user_id', 'count'),
    total_session_seconds=('session_seconds', 'sum')
)
aggregated.head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_calories_burned,total_sessions,total_session_seconds
user_id,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user_1,1,3816,4,26040.0
user_1,2,975,1,6840.0
user_1,3,1715,3,21360.0
user_1,5,403,1,3060.0
user_1,6,1243,1,5580.0
...,...,...,...,...
user_1000,6,764,1,6960.0
user_1000,9,913,2,10440.0
user_1000,11,653,1,3180.0
user_1000,14,2464,3,26520.0


In [None]:
model = IsolationForest(contamination=0.05, random_state=42)
aggregated['anomaly'] = model.fit_predict(aggregated)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_calories_burned,total_sessions,total_session_seconds,anomaly
user_id,week,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
user_1,11,5769,6,47760.0,-1
user_1,20,5191,5,40680.0,-1
user_1,21,5463,4,21780.0,-1
user_1,32,3325,5,42960.0,-1
user_10,9,5182,4,23460.0,-1
...,...,...,...,...,...
user_994,37,5376,4,24000.0,-1
user_995,8,2948,5,24360.0,-1
user_995,13,3444,4,33180.0,-1
user_995,35,2971,4,35220.0,-1


In [None]:


df['checkin_time'] = pd.to_datetime(df['checkin_time'])
df['checkout_time'] = pd.to_datetime(df['checkout_time'])

# Extract additional info (assuming columns for 'user', 'calories_burnt' exist)
df['visit_length'] = (df['checkout_time'] - df['checkin_time']).dt.total_seconds() / 60  # in minutes
df['week'] = df['checkin_time'].dt.isocalendar().week  # Week of the year

# Step 1: Group by user and week
weekly_stats = df.groupby(['user', 'week']).agg(
    total_visits=('checkin_time', 'count'),
    total_length_of_visits=('visit_length', 'sum'),
    total_calories_burnt=('calories_burnt', 'sum')
).reset_index()

# Step 2: Apply Isolation Forest for anomaly detection
X = weekly_stats[['total_visits', 'total_length_of_visits', 'total_calories_burnt']]

# Initialize IsolationForest model
model = IsolationForest(contamination=0.05, random_state=42)
weekly_stats['anomaly'] = model.fit_predict(X)

# Label outliers (anomalies) as 'unusual' when the model assigns -1
weekly_stats['is_unusual'] = weekly_stats['anomaly'] == -1

# Filter unusual activity
unusual_activity = weekly_stats[weekly_stats['is_unusual']]

# Output the results
print("Unusual activities detected:")
print(unusual_activity[['user', 'week', 'total_visits', 'total_length_of_visits', 'total_calories_burnt']])


ModuleNotFoundError: No module named 'sklearn'