In [1]:
import os
import csv
import random
from datetime import datetime, timedelta

# Configuration
START_DATE = datetime(2023, 9, 1)  # Starting date for logs (midnight of the first day)
DAYS = 7  # Number of consecutive days of logs
LOGS_PER_DAY = 30  # Number of log entries per day
RAW_DATA_DIR = "raw_data"  # Output root, relative to the working directory
SEED = 42  # Fixed seed so the synthetic data is identical on every run

# Seed the module-level generator used by generate_user_logs; without this a
# fresh kernel run writes different CSV contents each time.
random.seed(SEED)

# Possible values for random selection
ACTIONS = ["play", "pause", "skip", "forward"]
DEVICES = ["mobile", "desktop", "tablet"]
REGIONS = ["US", "EU", "APAC"]

# Content Metadata: one dict per content item. The key order here matches the
# column order of content_metadata.csv written by main().
# NOTE(review): "length" is presumably a duration in seconds -- confirm.
CONTENT_METADATA = [
    {"content_id": 1000, "title": "Summer Vibes", "category": "Pop", "length": 180, "artist": "DJ Alpha"},
    {"content_id": 1001, "title": "Rock Anthem", "category": "Rock", "length": 240, "artist": "The Beats"},
    {"content_id": 1002, "title": "News Update", "category": "News", "length": 300, "artist": "Daily News"},
    {"content_id": 1003, "title": "Jazz Classics", "category": "Jazz", "length": 200, "artist": "Smooth Sounds"},
    {"content_id": 1004, "title": "Podcast Ep.1", "category": "Podcast", "length": 600, "artist": "Tech Talk"},
]

def generate_user_logs(date):
    """Generate synthetic user activity log rows for one day.

    Args:
        date: ``datetime`` that the random second offsets are added to.
            Callers in this notebook pass midnight of the target day.

    Returns:
        A list of ``LOGS_PER_DAY`` rows, each of the form
        ``[user_id, content_id, action, timestamp, device, region, session_id]``
        where ``timestamp`` is a ``"%Y-%m-%d %H:%M:%S"`` string.
    """
    # Loop-invariant: the selectable content ids are the same for every row.
    content_ids = [c["content_id"] for c in CONTENT_METADATA]

    log_entries = []
    for _ in range(LOGS_PER_DAY):
        user_id = random.randint(100, 200)
        content_id = random.choice(content_ids)
        action = random.choice(ACTIONS)
        # randint includes both endpoints, so the largest valid offset is
        # 86399 (23:59:59); 86400 would land on 00:00:00 of the next day.
        timestamp = date + timedelta(seconds=random.randint(0, 86399))
        device = random.choice(DEVICES)
        region = random.choice(REGIONS)
        session_id = f"S{random.randint(1000, 9999)}"

        log_entries.append([user_id, content_id, action, timestamp.strftime("%Y-%m-%d %H:%M:%S"), device, region, session_id])

    return log_entries

def save_csv(file_path, data, headers):
    """Write a header row followed by data rows to a CSV file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of rows, each row an iterable of cell values.
        headers: Iterable of column names written as the first row.
    """
    # newline="" leaves line endings to the csv writer. The explicit UTF-8
    # encoding keeps the output identical across platforms instead of
    # depending on the locale's default encoding.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(data)

def main():
    """Write one user_logs.csv per day plus content_metadata.csv under RAW_DATA_DIR.

    Creates ``RAW_DATA_DIR/<YYYY-MM-DD>/user_logs.csv`` for each of the
    ``DAYS`` days starting at ``START_DATE`` and
    ``RAW_DATA_DIR/content_metadata.csv``, printing each path as it is written.
    """
    os.makedirs(RAW_DATA_DIR, exist_ok=True)

    log_headers = ["user_id", "content_id", "action", "timestamp", "device", "region", "session_id"]

    # Generate user activity logs for each day
    for i in range(DAYS):
        date = START_DATE + timedelta(days=i)
        folder_path = os.path.join(RAW_DATA_DIR, date.strftime("%Y-%m-%d"))
        os.makedirs(folder_path, exist_ok=True)

        log_file = os.path.join(folder_path, "user_logs.csv")
        logs = generate_user_logs(date)
        save_csv(log_file, logs, log_headers)
        print(f"Generated logs: {log_file}")

    # Save content metadata (static)
    metadata_headers = ["content_id", "title", "category", "length", "artist"]
    metadata_file = os.path.join(RAW_DATA_DIR, "content_metadata.csv")
    # Look each value up by header name so columns stay aligned with the
    # header row even if a metadata dict lists its keys in a different order.
    metadata_rows = [[item[column] for column in metadata_headers] for item in CONTENT_METADATA]
    save_csv(metadata_file, metadata_rows, metadata_headers)
    print(f"Generated content metadata: {metadata_file}")

if __name__ == "__main__":
    main()


Generated logs: raw_data/2023-09-01/user_logs.csv
Generated logs: raw_data/2023-09-02/user_logs.csv
Generated logs: raw_data/2023-09-03/user_logs.csv
Generated logs: raw_data/2023-09-04/user_logs.csv
Generated logs: raw_data/2023-09-05/user_logs.csv
Generated logs: raw_data/2023-09-06/user_logs.csv
Generated logs: raw_data/2023-09-07/user_logs.csv
Generated content metadata: raw_data/content_metadata.csv


In [2]:
# Mount Google Drive at /content/drive so later cells can write under it.
# google.colab is specific to the Colab runtime, so this cell is expected to
# fail when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import csv
import random
from datetime import datetime, timedelta

# Root folder on Google Drive that this cell writes into (change this if needed).
# The path sits under /content/drive, so it assumes Drive is already mounted there.
DRIVE_PATH = "/content/drive/MyDrive/Data_Storage_Assignment"
# Create the folder up front; exist_ok=True keeps re-running the cell from raising.
os.makedirs(DRIVE_PATH, exist_ok=True)

# Configuration
START_DATE = datetime(2023, 9, 1)  # Starting date for logs (midnight of the first day)
DAYS = 7  # Number of consecutive days of logs
LOGS_PER_DAY = 30  # Number of log entries per day
SEED = 42  # Fixed seed so the synthetic data is identical on every run

# Seed the module-level generator used by generate_user_logs; without this a
# fresh kernel run writes different CSV contents each time.
random.seed(SEED)

# Possible values for random selection
ACTIONS = ["play", "pause", "skip", "forward"]
DEVICES = ["mobile", "desktop", "tablet"]
REGIONS = ["US", "EU", "APAC"]

# Content Metadata: one dict per content item. The key order here matches the
# column order of content_metadata.csv written by main().
# NOTE(review): "length" is presumably a duration in seconds -- confirm.
CONTENT_METADATA = [
    {"content_id": 1000, "title": "Summer Vibes", "category": "Pop", "length": 180, "artist": "DJ Alpha"},
    {"content_id": 1001, "title": "Rock Anthem", "category": "Rock", "length": 240, "artist": "The Beats"},
    {"content_id": 1002, "title": "News Update", "category": "News", "length": 300, "artist": "Daily News"},
    {"content_id": 1003, "title": "Jazz Classics", "category": "Jazz", "length": 200, "artist": "Smooth Sounds"},
    {"content_id": 1004, "title": "Podcast Ep.1", "category": "Podcast", "length": 600, "artist": "Tech Talk"},
]

def generate_user_logs(date):
    """Generate synthetic user activity log rows for one day.

    Args:
        date: ``datetime`` that the random second offsets are added to.
            Callers in this notebook pass midnight of the target day.

    Returns:
        A list of ``LOGS_PER_DAY`` rows, each of the form
        ``[user_id, content_id, action, timestamp, device, region, session_id]``
        where ``timestamp`` is a ``"%Y-%m-%d %H:%M:%S"`` string.
    """
    # Loop-invariant: the selectable content ids are the same for every row.
    content_ids = [c["content_id"] for c in CONTENT_METADATA]

    log_entries = []
    for _ in range(LOGS_PER_DAY):
        user_id = random.randint(100, 200)
        content_id = random.choice(content_ids)
        action = random.choice(ACTIONS)
        # randint includes both endpoints, so the largest valid offset is
        # 86399 (23:59:59); 86400 would land on 00:00:00 of the next day.
        timestamp = date + timedelta(seconds=random.randint(0, 86399))
        device = random.choice(DEVICES)
        region = random.choice(REGIONS)
        session_id = f"S{random.randint(1000, 9999)}"

        log_entries.append([user_id, content_id, action, timestamp.strftime("%Y-%m-%d %H:%M:%S"), device, region, session_id])

    return log_entries

def save_csv(file_path, data, headers):
    """Write a header row followed by data rows to a CSV file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        data: Iterable of rows, each row an iterable of cell values.
        headers: Iterable of column names written as the first row.
    """
    # newline="" leaves line endings to the csv writer. The explicit UTF-8
    # encoding keeps the output identical across platforms instead of
    # depending on the locale's default encoding.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(data)

def main():
    """Write one user_logs.csv per day plus content_metadata.csv under DRIVE_PATH.

    Creates ``DRIVE_PATH/raw_data/<YYYY-MM-DD>/user_logs.csv`` for each of the
    ``DAYS`` days starting at ``START_DATE`` and
    ``DRIVE_PATH/content_metadata.csv``, printing each path as it is written.
    """
    log_headers = ["user_id", "content_id", "action", "timestamp", "device", "region", "session_id"]

    # Generate user activity logs for each day
    for i in range(DAYS):
        date = START_DATE + timedelta(days=i)
        folder_path = os.path.join(DRIVE_PATH, "raw_data", date.strftime("%Y-%m-%d"))
        os.makedirs(folder_path, exist_ok=True)

        log_file = os.path.join(folder_path, "user_logs.csv")
        logs = generate_user_logs(date)
        save_csv(log_file, logs, log_headers)
        print(f"Generated logs: {log_file}")

    # Save content metadata (static)
    # NOTE(review): this file lands directly in DRIVE_PATH, not under
    # DRIVE_PATH/raw_data next to the daily logs -- confirm that is intended.
    metadata_headers = ["content_id", "title", "category", "length", "artist"]
    metadata_file = os.path.join(DRIVE_PATH, "content_metadata.csv")
    # Look each value up by header name so columns stay aligned with the
    # header row even if a metadata dict lists its keys in a different order.
    metadata_rows = [[item[column] for column in metadata_headers] for item in CONTENT_METADATA]
    save_csv(metadata_file, metadata_rows, metadata_headers)
    print(f"Generated content metadata: {metadata_file}")

if __name__ == "__main__":
    main()


Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-01/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-02/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-03/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-04/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-05/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-06/user_logs.csv
Generated logs: /content/drive/MyDrive/Data_Storage_Assignment/raw_data/2023-09-07/user_logs.csv
Generated content metadata: /content/drive/MyDrive/Data_Storage_Assignment/content_metadata.csv
