<a href="https://colab.research.google.com/github/dansarmiento/analytics_projects/blob/main/Netflix_Games_Analytics_Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook fabricates the dataset I would use for creating the game analytics dashboard.  This dataset includes:**

* A game sessions table with assumptions about playtime hours
* A player demographics table for a small list of Southern California cities


**I also prototype some visualizations of the data with Plotly Express before creating the dashboard in Tableau.**







In [1]:
# The code in this notebook fabricates game session and player data for a game currently promoted on Netflix.
# It creates example datasets that can be fed into Tableau Public for sharing.
# There are also some example visualizations using Plotly to evaluate the data before working it up in Tableau.

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import plotly.express as px

In [2]:
# Create game sessions data with playtime weighted toward afternoons, evenings, and nights for the 90 days starting 11/1/2024 (11/1/2024 through 1/29/2025)

# Probabilities for game session hour distribution.
# Keys are inclusive (first_hour, last_hour) ranges on a 1-24 clock, where 24
# stands for midnight; values are the probability of a session starting in
# that range. The four ranges cover every hour once and the weights sum to 1.
hour_distribution = {
    (19, 24): 0.60,  # 60% chance for evening / late night sessions
    (1, 6): 0.05,    # 5% chance for very late night sessions
    (7, 12): 0.10,   # 10% chance for morning/lunch sessions
    (13, 18): 0.25   # 25% chance for afternoon/early evening sessions
}


def get_random_hour() -> int:
    """Sample a session start hour according to ``hour_distribution``.

    A range is picked with its configured weight, then an hour is drawn
    uniformly from that range (both ends inclusive).

    Returns:
        int: An hour of day in 0..23. The value 24 from the evening range is
        wrapped to 0 so that callers adding it to a date with
        ``timedelta(hours=...)`` stay on that calendar date instead of
        rolling over to the next day.
    """
    hour_ranges = list(hour_distribution.keys())
    range_weights = list(hour_distribution.values())
    first_hour, last_hour = random.choices(hour_ranges, weights=range_weights, k=1)[0]
    # randint is inclusive on both ends; % 24 maps the "24" midnight slot to hour 0
    return random.randint(first_hour, last_hour) % 24

# Seed both random number generators so that "Restart & Run All" regenerates
# the same dataset (the simulation draws from both `random` and `np.random`).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

start_date = datetime(2024, 11, 1)
num_days = 90

# Row-level session records:
# [Player_ID, First_Game_Date, Session_DateTime, Session_Duration]
session_data = []

# Maps Player_ID -> date of that player's first game
player_first_games = {}

# Player IDs are handed out incrementally, starting at 1
player_id_counter = 1

# Simulate player behavior one day at a time
for day in range(num_days):
    current_date = start_date + timedelta(days=day)

    # Everyone who joined before today (IDs 1..returning_pool_size) may return today
    returning_pool_size = player_id_counter - 1

    # Players who play their first game on this day (1000-4999 of them)
    new_players = np.random.randint(1000, 5000)
    new_player_ids = list(range(player_id_counter, player_id_counter + new_players))
    for new_player_id in new_player_ids:
        player_first_games[new_player_id] = current_date
    player_id_counter += new_players

    # Target number of distinct active players today (3000-6999).
    # New players always get a session on their first-game date, otherwise
    # First_Game_Date would point at a day on which the player never played.
    # The rest of the target is filled with returning players, capped by how
    # many earlier players exist.
    active_target = np.random.randint(3000, 7000)
    returning_count = min(max(active_target - new_players, 0), returning_pool_size)

    if returning_count > 0:
        # choice(n, ...) samples from 0..n-1 without replacement; +1 maps to player IDs
        returning_ids = (
            np.random.choice(returning_pool_size, size=returning_count, replace=False) + 1
        ).tolist()
    else:
        returning_ids = []

    active_players = new_player_ids + returning_ids

    for player_id in active_players:
        first_game_date = player_first_games[player_id]

        # Session start = today's date plus a weighted random hour and a random minute
        session_hour = get_random_hour()
        session_datetime = current_date + timedelta(hours=session_hour, minutes=random.randint(0, 59))

        # Session duration of 5 to 30 minutes; np.random.randint excludes the
        # upper bound, so 31 is needed to make 30 reachable
        session_duration = np.random.randint(5, 31)

        session_data.append([player_id, first_game_date, session_datetime, session_duration])

# DataFrame for row-level session data
session_df = pd.DataFrame(session_data, columns=[
    "Player_ID", "First_Game_Date", "Session_DateTime", "Session_Duration"
])

# Convert date columns to datetime format
session_df["First_Game_Date"] = pd.to_datetime(session_df["First_Game_Date"])
session_df["Session_DateTime"] = pd.to_datetime(session_df["Session_DateTime"])

# Whole days elapsed between the player's first-game date and each session,
# used by the retention calculations later in the notebook
session_df["Days_Since_First_Game"] = (session_df["Session_DateTime"] - session_df["First_Game_Date"]).dt.days




In [3]:
# Compute retention metrics. These are not part of the Tableau export; they are only used to evaluate the data in this notebook.

sessions_by_player = session_df.groupby("Player_ID")

# One row per player: the largest day offset (relative to the first-game date)
# on which the player had a session
retention_df = sessions_by_player["Days_Since_First_Game"].max().reset_index()

# A player counts as retained at day N if they had a session N or more days
# after their first game
retention_df["Retention_Day_1"] = (retention_df["Days_Since_First_Game"] >= 1).astype(int)
retention_df["Retention_Day_7"] = (retention_df["Days_Since_First_Game"] >= 7).astype(int)
retention_df["Retention_Day_30"] = (retention_df["Days_Since_First_Game"] >= 30).astype(int)

# Churn status: a player is churned if they have not played in the last 30 days
# of the observation window. This is measured from each player's most recent
# session to the latest session in the dataset, so recently joined active
# players are not flagged and long-inactive early players are.
# Both groupby results are sorted by Player_ID, so after resetting the index
# the rows line up one-to-one with retention_df.
last_session = sessions_by_player["Session_DateTime"].max().reset_index(drop=True)
observation_end = session_df["Session_DateTime"].max()
days_inactive = (observation_end - last_session).dt.days
retention_df["Churn_Status"] = (days_inactive >= 30).astype(int)

# Churn probability is based on each player's activity relative to the
# population median session count
player_session_counts = sessions_by_player["Session_DateTime"].count().reset_index()
player_session_counts.columns = ["Player_ID", "Sessions_in_30_Days"]
# Fall back to 1 when there are no players so the division below stays defined
pop_median_sessions = player_session_counts["Sessions_in_30_Days"].median() if not player_session_counts.empty else 1

# Players at or above the median get probability 0; fewer sessions push it toward 1
player_session_counts["Churn_Probability"] = 1 - (player_session_counts["Sessions_in_30_Days"] / pop_median_sessions)
player_session_counts["Churn_Probability"] = player_session_counts["Churn_Probability"].clip(0, 1)  # Ensure values are between 0 and 1

# Merge churn probability back to retention dataframe
retention_df = retention_df.merge(player_session_counts[["Player_ID", "Churn_Probability"]], on="Player_ID", how="left")

# Ensure NaN values in churn probability are replaced
retention_df["Churn_Probability"] = retention_df["Churn_Probability"].fillna(0)

In [4]:
# Make player demographics for this player population, roughly based on mobile game download statistics. This example population lives in a small sample of SoCal cities.

# Candidate home locations; each player is assigned one of these uniformly at random
southern_california_data = [
    {"zip_code": 90001, "city": "Los Angeles", "county": "Los Angeles", "state": "CA"},
    {"zip_code": 90802, "city": "Long Beach", "county": "Los Angeles", "state": "CA"},
    {"zip_code": 92612, "city": "Irvine", "county": "Orange", "state": "CA"},
    {"zip_code": 92101, "city": "San Diego", "county": "San Diego", "state": "CA"},
    {"zip_code": 91710, "city": "Chino", "county": "San Bernardino", "state": "CA"},
    {"zip_code": 92336, "city": "Fontana", "county": "San Bernardino", "state": "CA"},
    {"zip_code": 93003, "city": "Ventura", "county": "Ventura", "state": "CA"},
    {"zip_code": 92805, "city": "Anaheim", "county": "Orange", "state": "CA"},
    {"zip_code": 92262, "city": "Palm Springs", "county": "Riverside", "state": "CA"},
    {"zip_code": 93550, "city": "Palmdale", "county": "Los Angeles", "state": "CA"},
    {"zip_code": 91910, "city": "Chula Vista", "county": "San Diego", "state": "CA"},
    {"zip_code": 93010, "city": "Camarillo", "county": "Ventura", "state": "CA"},
    {"zip_code": 91730, "city": "Rancho Cucamonga", "county": "San Bernardino", "state": "CA"},
    {"zip_code": 93534, "city": "Lancaster", "county": "Los Angeles", "state": "CA"},
]

# Age buckets: label -> (youngest age, oldest age, sampling weight)
age_buckets = {
    "18-24": (18, 24, 0.60),
    "25-34": (25, 34, 0.15),
    "35-49": (35, 49, 0.10),
    "50-64": (50, 64, 0.13),
    "65+": (65, 75, 0.02),  # 75 as upper bound
}

# Bucket labels and their weights are the same for every player, so build them once
bucket_choices = list(age_buckets)
bucket_weights = [bounds_and_weight[2] for bounds_and_weight in age_buckets.values()]

# Every player that appears in the session data gets one demographics row
unique_players = session_df["Player_ID"].unique()

player_demographics = []
for player_id in unique_players:
    # Draw order per player: location, then age bucket, then exact age
    location = random.choice(southern_california_data)
    selected_bucket = random.choices(bucket_choices, weights=bucket_weights, k=1)[0]
    youngest, oldest = age_buckets[selected_bucket][:2]
    age = random.randint(youngest, oldest)

    player_demographics.append([
        player_id,
        location["zip_code"],
        location["city"],
        location["county"],
        location["state"],
        selected_bucket,
        age,
    ])

# Assemble the per-player rows into the demographics table
demographics_columns = ["Player_ID", "ZIP_Code", "City", "County", "State", "Age_Bucket", "Age"]
player_demographics_df = pd.DataFrame(player_demographics, columns=demographics_columns)


In [5]:

# Daily Active Users = number of distinct players with at least one session on
# each calendar date. The raw daily line is quite noisy.
dau_df = (
    session_df
    .groupby(session_df["Session_DateTime"].dt.date)["Player_ID"]
    .nunique()
    .reset_index()
)
dau_df.columns = ["Session_Date", "DAU"]

dau_axis_labels = {"Session_Date": "Date", "DAU": "Daily Active Users"}
fig = px.line(
    dau_df,
    x="Session_Date",
    y="DAU",
    title="Daily Active Users (DAU)",
    labels=dau_axis_labels,
    markers=True,
)

fig.show()


In [6]:
# 7 day rolling average smooths out the variance so trends can be better identified

# Convert Session_Date to datetime format
dau_df["Session_Date"] = pd.to_datetime(dau_df["Session_Date"])

# 7-day rolling average (NaN for the first six rows, where the window is incomplete)
dau_df["DAU_Rolling_Avg"] = dau_df["DAU"].rolling(window=7).mean()

# Filter the incomplete first week into a separate frame instead of overwriting
# dau_df. Overwriting made this cell drop another six rows every time it was
# re-run, because the rolling average was then recomputed on the truncated data.
dau_rolling_df = dau_df.dropna(subset=["DAU_Rolling_Avg"])

fig = px.line(dau_rolling_df, x="Session_Date", y="DAU_Rolling_Avg",
              title="7-Day Rolling Average of Daily Active Users (DAU)",
              labels={"Session_Date": "Date", "DAU_Rolling_Avg": "7-Day Rolling Average"},
              markers=True)

fig.show()


In [7]:
# The retention rates for this fabricated data would make it an incredibly successful game

# Mean of the 0/1 retention flags = share of players retained at each horizon
retention_trend = retention_df[["Retention_Day_7", "Retention_Day_30"]].mean().reset_index()
retention_trend.columns = ["Retention_Period", "Retention_Rate"]

# The axis is labelled in percent, so convert the 0-1 share to a 0-100 percentage
retention_trend["Retention_Rate"] = (retention_trend["Retention_Rate"] * 100).round(1)

fig = px.bar(retention_trend, y="Retention_Period", x="Retention_Rate", text="Retention_Rate", orientation="h",
             title="7-Day and 30-Day Retention Rates",
             labels={"Retention_Period": "Retention Period", "Retention_Rate": "Retention Rate (%)"})

# Show the value next to each bar with a percent sign
fig.update_traces(textposition='outside', texttemplate='%{text}%')

fig.show()


In [8]:
# A heatmap of session counts by weekday and hour makes the hourly probability
# distribution used to generate the data visible.

# Add the hour of day and weekday name of each session to the session table
session_df["Session_Hour"] = session_df["Session_DateTime"].dt.hour
session_df["Session_DayOfWeek"] = session_df["Session_DateTime"].dt.day_name()

# Number of sessions for every (weekday, hour) combination
heatmap_data = (
    session_df
    .groupby(["Session_DayOfWeek", "Session_Hour"])
    .size()
    .rename("Session_Count")
    .reset_index()
)

# Order weekdays Monday through Sunday instead of alphabetically
days_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
heatmap_data["Session_DayOfWeek"] = pd.Categorical(
    heatmap_data["Session_DayOfWeek"], categories=days_order, ordered=True
)

heatmap_labels = {
    "Session_Hour": "Hour of Day",
    "Session_DayOfWeek": "Day of Week",
    "Session_Count": "Number of Sessions",
}

fig_heatmap = px.density_heatmap(
    heatmap_data,
    x="Session_DayOfWeek",
    y="Session_Hour",
    z="Session_Count",
    title="Game Session Heatmap (Days of Week vs Hours)",
    labels=heatmap_labels,
    category_orders={"Session_DayOfWeek": days_order},
    color_continuous_scale="Viridis",
)

fig_heatmap.show()
