<a href="https://colab.research.google.com/github/ferygood/LLM_behavior_prediction/blob/main/01_create_simulation_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simulate user data

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

In [6]:
# Simulate web visit data: 1000 visits, one per minute starting 2023-01-01 00:00.
# user_id is drawn from 1..999 (randint's upper bound is exclusive), so the same
# user can appear in several rows.
web_visit_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=1000),
    # 'min' is the supported spelling of the one-minute frequency; the old 'T'
    # alias is deprecated and emits a FutureWarning on recent pandas.
    'visit_time': pd.date_range(start='2023-01-01', periods=1000, freq='min'),
    'page_url': np.random.choice(['home', 'product', 'cart', 'checkout'], size=1000),
    'referrer_url': np.random.choice(['google', 'facebook', 'twitter', 'direct'], size=1000)
})

web_visit_data.head()

Unnamed: 0,user_id,visit_time,page_url,referrer_url
0,408,2023-01-01 00:00:00,product,direct
1,107,2023-01-01 00:01:00,product,facebook
2,684,2023-01-01 00:02:00,home,facebook
3,493,2023-01-01 00:03:00,home,direct
4,129,2023-01-01 00:04:00,product,google


In [9]:
# Simulate purchase data: 500 purchases, one every two minutes starting
# 2023-01-01 00:00. user_id is drawn from 1..999 and product_id from 1..99
# (randint's upper bound is exclusive); amount is uniform on [10, 500).
purchase_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=500),
    # '2min' is the supported spelling of the two-minute frequency; the old '2T'
    # alias is deprecated and emits a FutureWarning on recent pandas.
    'purchase_time': pd.date_range(start='2023-01-01', periods=500, freq='2min'),
    'product_id': np.random.randint(1, 100, size=500),
    'amount': np.random.uniform(10, 500, size=500)
})

purchase_data.head()

Unnamed: 0,user_id,purchase_time,product_id,amount
0,132,2023-01-01 00:00:00,1,305.985839
1,783,2023-01-01 00:02:00,92,465.180504
2,957,2023-01-01 00:04:00,1,231.165823
3,761,2023-01-01 00:06:00,94,481.479275
4,575,2023-01-01 00:08:00,65,414.658857


In [10]:
# Simulate social media interaction data: 300 interactions, one every five
# minutes starting 2023-01-01 00:00. user_id is drawn from 1..999 (randint's
# upper bound is exclusive).
social_interaction_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=300),
    # '5min' is the supported spelling of the five-minute frequency; the old '5T'
    # alias is deprecated and emits a FutureWarning on recent pandas.
    'interaction_time': pd.date_range(start='2023-01-01', periods=300, freq='5min'),
    'platform': np.random.choice(['facebook', 'twitter', 'instagram'], size=300),
    'action': np.random.choice(['like', 'share', 'comment'], size=300)
})

social_interaction_data.head()

Unnamed: 0,user_id,interaction_time,platform,action
0,384,2023-01-01 00:00:00,instagram,like
1,342,2023-01-01 00:05:00,twitter,like
2,490,2023-01-01 00:10:00,twitter,comment
3,602,2023-01-01 00:15:00,instagram,like
4,323,2023-01-01 00:20:00,twitter,like


We now have three simulated datasets: `web_visit_data`, `purchase_data`, and `social_interaction_data`. The next step is to clean and pre-process the data.

In [12]:
# Remove fully duplicated rows from each table.
web_visit_data.drop_duplicates(inplace=True)
purchase_data.drop_duplicates(inplace=True)
social_interaction_data.drop_duplicates(inplace=True)

# Fill missing values.
# Only the text columns receive the 'unknown' placeholder: writing a string into
# a timestamp column would make the pd.to_datetime calls below raise, and writing
# it into user_id would turn a numeric column into mixed-type objects.
web_text_columns = ['page_url', 'referrer_url']
web_visit_data[web_text_columns] = web_visit_data[web_text_columns].fillna('unknown')

# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form warns on pandas 2.2 and leaves the
# frame unmodified once Copy-on-Write is active.
purchase_data['amount'] = purchase_data['amount'].fillna(purchase_data['amount'].mean())

social_text_columns = ['platform', 'action']
social_interaction_data[social_text_columns] = (
    social_interaction_data[social_text_columns].fillna('unknown')
)

# Make sure the timestamp columns are datetime64 so the .dt accessor works later.
web_visit_data['visit_time'] = pd.to_datetime(web_visit_data['visit_time'])
purchase_data['purchase_time'] = pd.to_datetime(purchase_data['purchase_time'])
social_interaction_data['interaction_time'] = pd.to_datetime(social_interaction_data['interaction_time'])

In [13]:
# Derive a calendar-date column and an hour-of-day column from each table's
# timestamp column. Each tuple is (frame, timestamp column, date column, hour column).
for frame, time_col, date_col, hour_col in (
    (web_visit_data, 'visit_time', 'visit_date', 'visit_hour'),
    (purchase_data, 'purchase_time', 'purchase_date', 'purchase_hour'),
    (social_interaction_data, 'interaction_time', 'interaction_date', 'interaction_hour'),
):
    timestamps = frame[time_col].dt
    frame[date_col] = timestamps.date
    frame[hour_col] = timestamps.hour

# Print the first rows of each processed table.
for label, frame in (
    ("Web visit data:\n", web_visit_data),
    ("Purchase data:\n", purchase_data),
    ("Social interaction data:\n", social_interaction_data),
):
    print(label, frame.head())

Web visit data:
    user_id          visit_time page_url referrer_url  visit_date  visit_hour
0      408 2023-01-01 00:00:00  product       direct  2023-01-01           0
1      107 2023-01-01 00:01:00  product     facebook  2023-01-01           0
2      684 2023-01-01 00:02:00     home     facebook  2023-01-01           0
3      493 2023-01-01 00:03:00     home       direct  2023-01-01           0
4      129 2023-01-01 00:04:00  product       google  2023-01-01           0
Purchase data:
    user_id       purchase_time  product_id      amount purchase_date  \
0      132 2023-01-01 00:00:00           1  305.985839    2023-01-01   
1      783 2023-01-01 00:02:00          92  465.180504    2023-01-01   
2      957 2023-01-01 00:04:00           1  231.165823    2023-01-01   
3      761 2023-01-01 00:06:00          94  481.479275    2023-01-01   
4      575 2023-01-01 00:08:00          65  414.658857    2023-01-01   

   purchase_hour  
0              0  
1              0  
2              

In [None]:
# Save the data as csv files for the next step.
# to_csv does not create missing directories, so make sure the output folder
# exists first; otherwise this cell fails on a fresh checkout or Colab runtime.
Path('data').mkdir(parents=True, exist_ok=True)
web_visit_data.to_csv('data/web_visit_data.csv', index=False)
purchase_data.to_csv('data/purchase_data.csv', index=False)
social_interaction_data.to_csv('data/social_interaction_data.csv', index=False)