In [472]:
import pandas as pd
import pickle

In [473]:
################################################
################################################
################################################
################################################
# USER DATA #
################################################
################################################
################################################
################################################

In [474]:
## Original JSON files are not formatted ###
## Formatted with process_date.py ##
## Loading newly formatted data ##
with open("/Users/evro/Documents/code/python/fetch/data/cleaned/cleaned_users.json") as f:
    users_data = pd.read_json(f)
users = pd.DataFrame(users_data)

In [475]:
## Reviewing data
users.head()

Unnamed: 0,_id,active,created_date,last_login,role,sign_up_source,state
0,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
1,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
2,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI
3,{'$oid': '5ff1e1eacfcf6c399c274ae6'},True,{'$date': 1609687530554},{'$date': 1609687530597},consumer,Email,WI
4,{'$oid': '5ff1e194b6a9d73a3a9f1052'},True,{'$date': 1609687444800},{'$date': 1609687537858},consumer,Email,WI


In [476]:
users.dtypes

_id               object
active              bool
created_date      object
last_login        object
role              object
sign_up_source    object
state             object
dtype: object

In [477]:
######################## USER DATA NORMALIZE & CLEANING ########################

In [478]:
users.rename(columns={'_id': 'user_id'}, inplace=True)

In [479]:
# Convert user id to string
users["user_id"] = users["user_id"].apply(lambda x: x.get("$oid", str(x)))

In [480]:
# Convert createdDate to string
users["created_date"] = users["created_date"].apply(lambda x: x.get("$date", str(x)))

In [481]:
# Tried running a conversion for lastLogin but received errors, so I'm investigating to see data types for last login
users["last_login"].apply(type).value_counts()

last_login
<class 'dict'>     433
<class 'float'>     62
Name: count, dtype: int64

In [482]:
# Let's see which are floats in the lastLogin column
users[users["last_login"].apply(lambda x: isinstance(x, float))]

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
97,5ff616a68f142f11dd189163,True,1609963174996,,consumer,Email,KY
143,5ffe115404929101d0aaebb2,True,1610486100208,,consumer,Email,AL
148,5ffe115404929101d0aaebb2,True,1610486100208,,consumer,Email,AL
170,5e27526d0bdb6a138c32b556,True,1579635309795,,consumer,Google,WI
180,6002475cfb296c121a81b98d,True,1610762076571,,consumer,Email,WI
...,...,...,...,...,...,...,...
381,60186237c8b50e11d8454d5f,True,1612210743551,,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1612210743551,,consumer,Email,
389,60217fa799409b11fcf899fe,True,1612808103714,,consumer,Email,WI
420,5fb0a078be5fc9775c1f3945,True,1605410936818,,consumer,Google,AL


In [483]:
# Get the value of "$date" if lastLogin is a dict otherwise return value
users["last_login"] = users["last_login"].apply(lambda x: x.get("$date") if isinstance(x, dict) else x)

In [484]:
# review data after clean
users.head()

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
0,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609688000000.0,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609688000000.0,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609688000000.0,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1609687530554,1609688000000.0,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,1609687444800,1609688000000.0,consumer,Email,WI


In [485]:
# Convert created_date and last_login to dates
users["created_date"] = pd.to_datetime(users["created_date"], errors="coerce")
users["last_login"] = pd.to_datetime(users ["last_login"], errors="coerce")

In [486]:
## Review after cleaned dates
users.head()

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
0,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI


In [487]:
# Check data types
users.dtypes

user_id                   object
active                      bool
created_date      datetime64[ns]
last_login        datetime64[ns]
role                      object
sign_up_source            object
state                     object
dtype: object

In [488]:
# Convert objects to strings
users = users.astype({col: "string" for col in users.select_dtypes(include=["object"]).columns})

In [489]:
users["user_id"] = users["user_id"].astype("string")

In [490]:
users.dtypes

user_id           string[python]
active                      bool
created_date      datetime64[ns]
last_login        datetime64[ns]
role              string[python]
sign_up_source    string[python]
state             string[python]
dtype: object

In [491]:
users.head()
# Why are there bullets in the active bool?

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
0,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
1,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
2,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
4,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI


In [492]:
######################## END USER DATA CLEANING/NORMALIZATION ########################

In [493]:
######################## VALIDATE USER DATA ########################

In [494]:
# Are there null values?
users.isnull().sum()

user_id            0
active             0
created_date       0
last_login        62
role               0
sign_up_source    48
state             56
dtype: int64

In [495]:
# Ensure only True/False values exist.
users['active'].value_counts()

active
True     494
False      1
Name: count, dtype: int64

In [496]:
# Which roles are there?
users["role"].value_counts()

role
consumer       413
fetch-staff     82
Name: count, dtype: Int64

In [497]:
# Which users do not have a state?
users[users["state"].isnull()]

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
344,60145ff384231211ce796d51,True,1970-01-01 00:26:51.948019722,NaT,consumer,Email,
350,60145ff384231211ce796d51,True,1970-01-01 00:26:51.948019722,NaT,consumer,Email,
375,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
376,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
378,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
381,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
422,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,
423,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,
424,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,


In [498]:
# Ensure only True/False values exist
users['active'].value_counts()

active
True     494
False      1
Name: count, dtype: int64

In [499]:
######################## END VALIDATE USER DATA ########################

In [500]:
######################## DETECT USER ISSUES ########################

In [501]:
# Check for duplicate user records where all column values are identical.
users[users.duplicated()].sort_values(by="user_id")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
494,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
476,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
477,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
478,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
479,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
...,...,...,...,...,...,...,...
365,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
373,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
374,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
387,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI


In [502]:
# How many dups do we have of users and last login?
users.duplicated(subset=["user_id", "last_login"]).sum()

283

In [503]:
# Check for duplicate user records based on user ID and last login timestamp.
# This helps identify multiple entries for the same user with the same last login time. Data dups?
users[users.duplicated(subset=["user_id", "last_login"], keep=False)].sort_values("last_login")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
18,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
13,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
11,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
0,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
...,...,...,...,...,...,...,...
378,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
381,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
420,5fb0a078be5fc9775c1f3945,True,1970-01-01 00:26:45.410936818,NaT,consumer,Google,AL


In [504]:
# Find users with same last login. May hint towards possible abuse or bots.
users[users.duplicated(subset=["last_login"], keep=False)].sort_values("last_login")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
18,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
3,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
13,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
11,5ff1e1eacfcf6c399c274ae6,True,1970-01-01 00:26:49.687530554,1970-01-01 00:26:49.687530597,consumer,Email,WI
0,5ff1e194b6a9d73a3a9f1052,True,1970-01-01 00:26:49.687444800,1970-01-01 00:26:49.687537858,consumer,Email,WI
...,...,...,...,...,...,...,...
381,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
389,60217fa799409b11fcf899fe,True,1970-01-01 00:26:52.808103714,NaT,consumer,Email,WI
420,5fb0a078be5fc9775c1f3945,True,1970-01-01 00:26:45.410936818,NaT,consumer,Google,AL


In [505]:
# Let's find user id dups with dup dates
users[users.duplicated(subset=["user_id"], keep=False)].sort_values("user_id")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
494,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
475,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
476,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
477,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
478,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
...,...,...,...,...,...,...,...
365,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
387,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
385,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
394,60229990b57b8a12187fe9e0,True,1970-01-01 00:26:52.880272581,1970-01-01 00:26:52.880272626,consumer,Email,WI


In [506]:
# Can we detect abnormal activity? If multiple users have same state, source, and id it could indicate abuse.
users.groupby(["state", "sign_up_source"])["user_id"].count().sort_values(ascending=False)

state  sign_up_source
WI     Email             376
NH     Email              20
AL     Email               9
OH     Email               5
AL     Google              3
IL     Email               2
CO     Email               1
KY     Email               1
SC     Email               1
WI     Google              1
Name: user_id, dtype: int64

In [507]:
# Show states that are not valid
valid_states = {'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
                'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
                'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
                'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
                'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'}

users[~users['state'].isin(valid_states)]

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
344,60145ff384231211ce796d51,True,1970-01-01 00:26:51.948019722,NaT,consumer,Email,
350,60145ff384231211ce796d51,True,1970-01-01 00:26:51.948019722,NaT,consumer,Email,
375,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
376,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
378,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
381,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
422,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,
423,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,
424,5a43c08fe4b014fd6b6a0612,True,1970-01-01 00:25:14.389647059,1970-01-01 00:26:53.146957155,consumer,,


In [508]:
# Which users do not have Google or Email as source?
expected_sources = {"Google", "Email"}
users[~users["sign_up_source"].isin(expected_sources)]

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
388,55308179e4b0eabd8f99caa2,True,1970-01-01 00:23:49.242233186,1970-01-01 00:25:25.713820003,consumer,,WI
395,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
396,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
397,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
398,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
399,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
400,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
401,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
402,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI
403,59c124bae4b0299e55b0f330,True,1970-01-01 00:25:05.830074302,1970-01-01 00:26:52.802578117,fetch-staff,,WI


In [509]:
# Which users do not have a last login?
users[users["last_login"].isnull()]

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
97,5ff616a68f142f11dd189163,True,1970-01-01 00:26:49.963174996,NaT,consumer,Email,KY
143,5ffe115404929101d0aaebb2,True,1970-01-01 00:26:50.486100208,NaT,consumer,Email,AL
148,5ffe115404929101d0aaebb2,True,1970-01-01 00:26:50.486100208,NaT,consumer,Email,AL
170,5e27526d0bdb6a138c32b556,True,1970-01-01 00:26:19.635309795,NaT,consumer,Google,WI
180,6002475cfb296c121a81b98d,True,1970-01-01 00:26:50.762076571,NaT,consumer,Email,WI
...,...,...,...,...,...,...,...
381,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
382,60186237c8b50e11d8454d5f,True,1970-01-01 00:26:52.210743551,NaT,consumer,Email,
389,60217fa799409b11fcf899fe,True,1970-01-01 00:26:52.808103714,NaT,consumer,Email,WI
420,5fb0a078be5fc9775c1f3945,True,1970-01-01 00:26:45.410936818,NaT,consumer,Google,AL


In [510]:
# Detect duplicate user records where both user ID and sign-up source are the same.
# This could indicate users signing up multiple times with the same source.
users[users.duplicated(subset=["user_id", "sign_up_source"], keep=False)].sort_values("user_id")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
494,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
475,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
476,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
477,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
478,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
...,...,...,...,...,...,...,...
365,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
387,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
385,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
394,60229990b57b8a12187fe9e0,True,1970-01-01 00:26:52.880272581,1970-01-01 00:26:52.880272626,consumer,Email,WI


In [511]:
# Identify users who have the same account creation date.
# This could detect bulk sign-ups or potential bots if many users were created at the exact same time.
users[users.duplicated(subset=["created_date"], keep=False)].sort_values("created_date")

Unnamed: 0,user_id,active,created_date,last_login,role,sign_up_source,state
494,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
475,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
476,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
477,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
478,54943462e4b07e684157a532,True,1970-01-01 00:23:38.998882381,1970-01-01 00:26:54.963143204,fetch-staff,,
...,...,...,...,...,...,...,...
365,60189c94c8b50e11d8454f6b,True,1970-01-01 00:26:52.225684020,1970-01-01 00:26:52.225684073,consumer,Email,WI
387,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
385,601c2c05969c0b11f7d0b097,True,1970-01-01 00:26:52.459013700,1970-01-01 00:26:52.459230228,consumer,Email,WI
394,60229990b57b8a12187fe9e0,True,1970-01-01 00:26:52.880272581,1970-01-01 00:26:52.880272626,consumer,Email,WI


In [512]:
import os

output_dir = "/Users/evro/Documents/code/python/fetch/data/validated"
os.makedirs(output_dir, exist_ok=True)

# Save the pickle file in the specified directory
output_file_pkl = os.path.join(output_dir, "users.pkl")
with open(output_file_pkl, "wb") as f:
    pickle.dump(users, f)