# In [5]:
import csv
import random

# Platforms a post can belong to, and the hashtag topics a post can carry
platforms = ["Instagram", "TikTok", "Facebook", "Twitter"]
hashtags = [
    "Fitness", "Music", "Comedy", "Education",
    "Tech", "Gaming", "Challenge", "Beauty",
]

# Content-type pool per platform. Entries are drawn with random.choice, so a
# value repeated in a pool is proportionally more likely to be selected
# (e.g. 7 of the 9 Instagram entries are "Video").
# NOTE: the "Carrousel" spelling is emitted verbatim into the CSV; keep it
# unchanged unless downstream consumers are updated as well.
content_types = dict(
    Instagram=7 * ["Video"] + ["Photo", "Carrousel"],
    TikTok=9 * ["Video"] + ["Photo"],
    Facebook=["Video", "Photo", "Carrousel", "Text"],
    Twitter=["Video", "Photo", "Carrousel", "Text"],
)

# Function to calculate engagement rate
def calculate_engagement_rate(likes, shares, comments, views):
    """
    Return the engagement rate as a fraction of views.

    The rate is (likes + shares + comments) / views. It is NOT multiplied by
    100, so a return value of 0.25 means 25%.

    Args:
        likes: Number of likes on the post.
        shares: Number of shares of the post.
        comments: Number of comments on the post.
        views: Number of views of the post.

    Returns:
        The engagement rate as a float, or 0.0 when views is zero (a post
        nobody has viewed has no measurable engagement, and this avoids a
        ZeroDivisionError).
    """
    if views == 0:
        return 0.0
    return (likes + shares + comments) / views

# Function to generate social media post data
def _sample_post_metrics():
    """
    Draw one candidate set of raw post counts.

    Returns:
        A tuple (impressions, views, likes, shares, comments) where
        1000 <= views <= 100000, views < impressions <= 180000,
        14 <= comments <= min(11000, views), 6 <= shares <= min(20000, views)
        and max(comments, shares) < likes <= 30000.
    """
    views = random.randint(1000, 100000)

    # Impressions are always strictly larger than views; views never exceeds
    # 100000, so the range [views + 1, 180000] is never empty.
    impressions = random.randint(views + 1, 180000)

    # Comments and shares are capped both by a fixed ceiling and by views
    comments = random.randint(14, min(11000, views))
    shares = random.randint(6, min(20000, views))

    # Likes exceed both comments and shares and never go above 30,000.
    # The larger of comments/shares is at most 20000, so the increment range
    # always contains at least the value 1.
    baseline = max(comments, shares)
    likes = baseline + random.randint(1, min(30000 - baseline, views))

    return impressions, views, likes, shares, comments


def generate_post_data(platform, max_attempts=10000):
    """
    Generate a single social media post with constraints based on platform and requirements.

    Candidate metrics are re-drawn until the engagement rate lies in the
    inclusive range [0.06, 0.26]. Earlier behaviour gave up after 50 retries
    and silently returned the last (possibly out-of-range) draw; this version
    never returns a row that violates the range.

    Args:
        platform: Key into the module-level content_types mapping; an unknown
            platform raises KeyError.
        max_attempts: Upper bound on the number of draws. It exists only as a
            guard against an endless loop if the sampling constants are ever
            changed so that no draw can satisfy the range.

    Returns:
        A dict with the keys Platform, Hashtag, Content_Type, Impressions,
        Views, Likes, Shares, Comments and Engagement_Rate (rounded to 4
        decimal places).

    Raises:
        RuntimeError: If no draw within max_attempts produced an engagement
            rate inside the accepted range.
    """
    # Select content type based on platform constraints
    content_type = random.choice(content_types[platform])

    # Rejection sampling: keep drawing until the engagement rate is acceptable
    for _ in range(max_attempts):
        impressions, views, likes, shares, comments = _sample_post_metrics()
        engagement_rate = calculate_engagement_rate(likes, shares, comments, views)
        if 0.06 <= engagement_rate <= 0.26:
            break
    else:
        raise RuntimeError(
            f"No post with an engagement rate in [0.06, 0.26] found after {max_attempts} attempts"
        )

    return {
        "Platform": platform,
        "Hashtag": random.choice(hashtags),
        "Content_Type": content_type,
        "Impressions": impressions,
        "Views": views,
        "Likes": likes,
        "Shares": shares,
        "Comments": comments,
        "Engagement_Rate": round(engagement_rate, 4)
    }

# Generate and write CSV
def generate_social_media_dataset(num_rows=12583, output_path='social_media_dataset.csv'):
    """
    Generate a CSV file with social media post data.

    Each row is produced by picking a random platform and calling
    generate_post_data for it. The file is overwritten if it already exists.

    Args:
        num_rows: Number of data rows to write (the header row is extra).
        output_path: Destination of the CSV file. Defaults to
            'social_media_dataset.csv' in the current working directory.

    Returns:
        None. A one-line summary is printed once the file has been written.
    """
    fieldnames = ["Platform", "Hashtag", "Content_Type", "Impressions", "Views", "Likes", "Shares", "Comments", "Engagement_Rate"]

    # newline='' lets the csv module control line endings itself
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # Write header
        writer.writeheader()

        # Stream the rows straight into the writer without building a list
        writer.writerows(
            generate_post_data(random.choice(platforms)) for _ in range(num_rows)
        )

    print(f"Generated {num_rows} rows in {output_path}")

# Run the data generation only when this file is executed directly (not when
# it is imported); the dataset function is called with its default arguments.
if __name__ == "__main__":
    generate_social_media_dataset()

# Output: Generated 12583 rows in social_media_dataset.csv
