# Random Dataset for a Recommender System

### Random Users

In [64]:
def generate_random_date_of_birth(start_year, end_year):
    """Return a random calendar date formatted as 'YYYY-MM-DD'.

    The date is drawn uniformly (by day) from the inclusive range
    January 1 of ``start_year`` through December 31 of ``end_year``.
    """
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)

    # Number of whole days separating both ends of the range.
    span_in_days = (last_day - first_day).days

    # Shift the first day by a random offset that stays inside the range.
    offset = timedelta(days=random.randint(0, span_in_days))

    return (first_day + offset).strftime('%Y-%m-%d')

def generate_random_phone_number():
    """Return a random 10-character phone number string starting with '0'.

    The nine digits after the leading zero come from a random integer in
    the inclusive range 100000000..999999999.
    """
    subscriber_part = random.randint(100000000, 999999999)
    return '0' + str(subscriber_part)

num_users = 1988  # Number of user sample data

# Build a pool of exactly `num_users` distinct phone numbers to use as user
# ids. Random generation can produce the same number twice, so keep drawing
# until enough unique values have been collected (insertion order is kept).
users = []
seen_ids = set()
while len(users) < num_users:
    candidate_id = generate_random_phone_number()
    if candidate_id not in seen_ids:
        seen_ids.add(candidate_id)
        users.append(candidate_id)

# Emit exactly one row per user id. Sampling ids with `random.choice` here
# would repeat some ids (with conflicting gender/birthday values) and never
# use others, leaving far fewer than `num_users` distinct users in the file.
data = [
    {
        '_id': user_id,
        'gender': random.choice(['Male', 'Female']),
        'birthday': generate_random_date_of_birth(1970, 2015),
    }
    for user_id in users
]

# Create DataFrame and save to CSV file
df = pd.DataFrame(data)
df.to_csv('data_user.csv', index=False, encoding='utf-8-sig')

print("Data created and saved to csv file")

Data created and saved to csv file


### Generate Dataset

In [65]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import json

amazon_dataset = pd.read_csv('./amazon_reviews_dataset.csv', nrows=2000)
# '_id' holds phone numbers with a leading zero. Without an explicit string
# dtype pandas parses the column as integers and silently drops that zero.
# The file is written with 'utf-8-sig', so it is read back the same way.
user_data = pd.read_csv(
    './data_user.csv',
    dtype={'_id': str},
    encoding='utf-8-sig',
).drop_duplicates(subset='_id')
item_data = pd.read_json('./data_item.json').drop_duplicates(subset='_id')

### User handling
# Assign every distinct review author one randomly chosen generated user id
# (ids are drawn with replacement, so several authors may share one id).
user_ids_available = user_data['_id'].tolist()
unique_users = amazon_dataset['user'].unique()
user_mapping = dict(zip(
    unique_users,
    np.random.choice(user_ids_available, size=len(unique_users)).tolist(),
))
amazon_dataset['userID_new'] = amazon_dataset['user'].map(user_mapping)

# Attach the extra per-user attributes for the assigned user id
df_user_info_mapping = user_data.set_index('_id').to_dict(orient='index')
for user_field in ('birthday', 'gender'):
    # `field=user_field` binds the current value; a bare closure would see
    # only the last loop value if evaluated later.
    amazon_dataset[user_field] = amazon_dataset['userID_new'].map(
        lambda uid, field=user_field: df_user_info_mapping.get(uid, {}).get(field, np.nan)
    )

### Item handling
# Same approach for items: one randomly chosen catalogue id per review item.
item_ids_available = item_data['_id'].tolist()
unique_items = amazon_dataset['item'].unique()
# print(unique_items.size)
# print(unique_users.size)
item_mapping = dict(zip(
    unique_items,
    np.random.choice(item_ids_available, size=len(unique_items)).tolist(),
))
amazon_dataset['itemID_new'] = amazon_dataset['item'].map(item_mapping)

# Attach the extra per-item attributes for the assigned item id
df_item_info_mapping = item_data.set_index('_id').to_dict(orient='index')
for target_column, item_field in (
    ('itemName', 'name'),
    ('itemPrice', 'price'),
    ('categoryID', 'category_id'),
):
    amazon_dataset[target_column] = amazon_dataset['itemID_new'].map(
        lambda iid, field=item_field: df_item_info_mapping.get(iid, {}).get(field, np.nan)
    )

# Choose the essential columns (dict order is the output column order) and
# give them clearer names via an explicit old-name -> new-name map.
result_columns = {
    'userID_new': 'userID',
    'birthday': 'birthday',
    'gender': 'gender',
    'itemID_new': 'itemID',
    'itemName': 'itemName',
    'itemPrice': 'itemPrice',
    'categoryID': 'categoryID',
    'rating': 'rating',
    'timestamp': 'timestamp',
}
df_result = amazon_dataset[list(result_columns)].rename(columns=result_columns)
df_result.to_csv('dataset.csv', index=False, encoding='utf-8-sig')


60
1988


### Draft

In [51]:
### Export data from database
# db.getCollection('menus').find(
#   {},
#   { _id: 1, category_id: 1, name: 1, price: 1 }
# );

### Old approach: fully random dataset (kept for reference)

# def generate_random_timestamp(start_date, end_date):
#     start_timestamp = int(start_date.timestamp())
#     end_timestamp = int(end_date.timestamp())
#     random_timestamp = random.randint(start_timestamp, end_timestamp)
#     return datetime.fromtimestamp(random_timestamp)

# Create sample data
# data = []
# for _ in range(num_users * 40):
#     userId = random.choice(users)
#     gender = random.choice(['Male', 'Female'])
#     birth_date = generate_random_date_of_birth(1970, 2015)
#     item = random.choice(items)
#     quantity = random.randint(1, 5)
#     timestamp = generate_random_timestamp(datetime(2023, 1, 1), datetime(2024, 7, 20))
#     data.append({
#         'userID': userId,
#         'itemID': item['_id'],
#         'gender': gender,
#         'birthday': birth_date,
#         'itemName': item['name'],
#         'itemPrice': item['price'],
#         'rating': quantity,
#         'categoryID': item['category_id'],
#         'timestamp': timestamp
#     })

# # Create DataFrame and save to CSV file
# df = pd.DataFrame(data)
# df.to_csv('dataset.csv', index=False, encoding='utf-8-sig')

# print("Dataset created and saved to csv file")