In [25]:
import pandas as pd
from datetime import timedelta
# Render every float with exactly two decimal places in displayed frames.
pd.options.display.float_format = "{:.2f}".format

In [None]:
# Load dataset
# Read the sample accounts table from its Parquet file (path is relative to the working directory).
file_path = "data/sample_accounts.parquet"
df = pd.read_parquet(file_path)

In [None]:
# Transform dataset
# Build the cleaned frame in one chain so that every step returns a new object.
# The previous version rebound the name to a boolean-filtered slice and then
# assigned a column onto it, which can raise SettingWithCopyWarning on pandas
# versions without copy-on-write. `df` itself is never modified.
#
# Note: the subscriber filter is applied per row, not per account, so an
# account's rows at or below 1000 subscribers are dropped individually.
df_transformed = (
    df.assign(date=lambda frame: pd.to_datetime(frame["date"]))  # Ensure consistent timestamp format
    .loc[lambda frame: frame["subscriber_count"] > 1000]  # Keep only rows with >1000 subscribers
    .assign(categories=lambda frame: frame["categories"].str.split(";"))  # Split categories by ";"
)

In [29]:
# Define analysis window (the 30 days leading up to the most recent date in the dataset)
latest_date = df_transformed["date"].max()

# `analysis_window` is the cutoff timestamp: 30 days before the latest observation.
analysis_window = latest_date - timedelta(days=30)

# Keep only rows dated strictly after the cutoff.
is_recent = df_transformed["date"].gt(analysis_window)
recent_entries = df_transformed.loc[is_recent]

In [30]:
# Sort and get first and last subscriber count
# Order rows by account and then by date, so that within each account the
# first row is the earliest observation in the window and the last row the latest.
sorted_records = recent_entries.sort_values(by=["account_id", "date"])

# Earliest row per account -> baseline subscriber count for the period.
initial_snapshot = (
    sorted_records.drop_duplicates(subset=["account_id"], keep="first")
    .loc[:, ["account_id", "subscriber_count"]]
    .rename(columns={"subscriber_count": "baseline_subscribers"})
)

# Latest row per account -> current subscriber count for the period.
latest_snapshot = (
    sorted_records.drop_duplicates(subset=["account_id"], keep="last")
    .loc[:, ["account_id", "subscriber_count"]]
    .rename(columns={"subscriber_count": "current_subscribers"})
)

In [None]:
# Merge snapshots
# Join each account's first and last subscriber count in the window. Both
# snapshots were de-duplicated on account_id, so the join must be one-to-one;
# validate= makes pandas raise MergeError if that invariant is ever broken.
trend_data = initial_snapshot.merge(
    latest_snapshot, on="account_id", how="inner", validate="one_to_one"
)

# Absolute growth in subscriber count over the window.
trend_data["growth_values"] = trend_data["current_subscribers"] - trend_data["baseline_subscribers"]

# Percentage growth relative to the baseline subscriber count.
# Deliberately left unrounded: rounding here would make a later threshold
# comparison operate on rounded values (e.g. 10.004 would become 10.00 and fail
# a "> 10" test). The notebook's display.float_format option already renders
# floats with two decimals, so the displayed table looks the same.
trend_data["growth_percentage"] = (
    trend_data["growth_values"] / trend_data["baseline_subscribers"] * 100
)

In [39]:
# Filter accounts that experienced more than 10% growth in the last month
# Boolean mask: True where percentage growth over the window is strictly above 10.
exceeds_growth_threshold = trend_data["growth_percentage"].gt(10.0)
high_growth_accounts = trend_data.loc[exceeds_growth_threshold]
high_growth_accounts

Unnamed: 0,account_id,baseline_subscribers,current_subscribers,growth_values,growth_percentage
6,00374295-8694-4f4f-b73a-5d018cffd913,34405012.98,130913079.85,96508066.87,280.51
7,003bfc0f-02d3-44d3-b666-6311b6d9424b,8040.79,11666.82,3626.03,45.10
9,00450035-c58c-4dd9-86eb-9770729c4b8f,5975.76,21444.47,15468.71,258.86
11,005cf49d-1237-4124-95c3-4985dbb82d90,501202.13,605437.88,104235.75,20.80
14,006a7383-4546-4c7b-883c-a7a549995ce1,15112860.24,135498800.31,120385940.07,796.58
...,...,...,...,...,...
9861,ff8ec89e-5fba-47c5-a7ff-b70e5b610d77,271642.32,724515.40,452873.07,166.72
9866,ffacd0ac-708e-4ec3-8471-c1e385de92c1,2632.26,4931.42,2299.16,87.35
9868,ffb075ff-e587-418b-8639-920bf2948be4,4850068.67,8259765.93,3409697.26,70.30
9871,ffbb5cff-df6b-4f18-bfeb-34365a42f887,1684719.87,2823061.91,1138342.04,67.57
