In [2]:
import polars as pl
import pandas as pd
import os
import csv
from google.colab import drive
import math
from collections import Counter, defaultdict
import os

In [4]:
# Mount Google Drive so the CSV exports stored there are readable from this Colab runtime.
drive.mount('/content/drive')

# All three exports sit in the same Drive folder; keep that folder in one place
# so relocating the data only requires changing a single line.
DATA_DIR = '/content/drive/My Drive/Presidential Election Data'

fb_posts = f'{DATA_DIR}/2024_fb_posts_president_scored_anon.csv'
fb_ads = f'{DATA_DIR}/2024_fb_ads_president_scored_anon.csv'
tw_posts = f'{DATA_DIR}/2024_tw_posts_president_scored_anon.csv'

Mounted at /content/drive


In [13]:
def analyze_polars_dataset(df: pl.DataFrame):
    """Print a column-by-column summary of ``df`` to stdout.

    Null values are dropped from each column before it is summarised.
    Numeric columns report count, mean, min, max and standard deviation.
    Every other column reports count, number of distinct values and, when at
    least one non-null value exists, the most frequent value together with
    how many times it appears.

    Args:
        df: The Polars DataFrame to describe.

    Returns:
        None. The summary is only printed.
    """
    # (label, Series method name) pairs, printed in this order for numeric columns.
    numeric_stats = (
        ("Count", "len"),
        ("Mean", "mean"),
        ("Min", "min"),
        ("Max", "max"),
        ("Stddev", "std"),
    )

    print("===== General Dataset Summary =====\n")

    for series in df.get_columns():
        name = series.name
        print(f"Column: {name}")
        present = series.drop_nulls()

        if not series.dtype.is_numeric():
            distinct = present.unique()
            print(f"  Count: {present.len()}")
            print(f"  Unique values: {distinct.len()}")

            # An all-null column has no "most frequent" value to report.
            if present.len() > 0:
                ranked = present.value_counts().sort("count", descending=True)
                # After the descending sort, row 0 holds the largest count;
                # the row unpacks as (value, count).
                top_val, top_count = ranked.row(0)
                print(f"  Most frequent: {top_val} (appears {top_count} times)")
        else:
            # Each statistic is computed right before it is printed.
            for label, method in numeric_stats:
                print(f"  {label}: {getattr(present, method)()}")

        print()

In [14]:
# Load the Facebook ads CSV (path set in the Drive-mount cell) into a Polars DataFrame.
df_ads = pl.read_csv(fb_ads)

# Print the per-column summary for the ads data.
analyze_polars_dataset(df_ads)

===== General Dataset Summary =====

Column: page_id
  Count: 246745
  Unique values: 4475
  Most frequent: 4d66f5853f0365dba032a87704a634f023d15babde973bb7a284ed8cd2707b2d (appears 55503 times)

Column: ad_id
  Count: 246745
  Unique values: 246745
  Most frequent: 1cec37d522e22202167e2ead1da7d5f9f8aa0572eb0d6fd4b0abd00442c15494 (appears 1 times)

Column: ad_creation_time
  Count: 246745
  Unique values: 547
  Most frequent: 2024-10-27 (appears 8619 times)

Column: bylines
  Count: 245736
  Unique values: 3790
  Most frequent: HARRIS FOR PRESIDENT (appears 49788 times)

Column: currency
  Count: 246745
  Unique values: 18
  Most frequent: USD (appears 246599 times)

Column: delivery_by_region
  Count: 246745
  Unique values: 141122
  Most frequent: {} (appears 30989 times)

Column: demographic_distribution
  Count: 246745
  Unique values: 215622
  Most frequent: {} (appears 30989 times)

Column: estimated_audience_size
  Count: 246745
  Mean: 556462.8559687126
  Min: 0
  Max: 1000001


In [27]:
import polars as pl

def analyze_polars_grouped(df: pl.DataFrame, group_cols: list[str], max_groups: int = 3):
    """Print per-column summaries for the first few groups of ``df``.

    The frame is grouped by ``group_cols`` and, for each of the first
    ``max_groups`` groups, every non-grouping column is summarised after its
    nulls are dropped. Numeric columns report count, mean, min, max and
    standard deviation. Other columns report count, number of distinct
    values and the most frequent value with its number of occurrences.
    Columns with no non-null values in a group print "No data".

    Groups are visited in order of first appearance in ``df``
    (``maintain_order=True``), so repeated runs on the same data print the
    same groups. When several values tie for most frequent, which one is
    reported is not controlled here.

    Args:
        df: The Polars DataFrame to analyse.
        group_cols: Names of the columns to group by.
        max_groups: Maximum number of groups to print. Defaults to 3, the
            limit that was previously hard-coded. Values <= 0 print only the
            header.

    Returns:
        None. The summary is only printed.
    """
    print(f"\n===== Grouped Analysis by {group_cols} =====\n")

    # Nothing to show; skip building the groups altogether.
    if max_groups <= 0:
        return

    # Columns to summarise inside each group (everything that is not a key).
    other_cols = [col for col in df.columns if col not in group_cols]

    # Without maintain_order the group iteration order is arbitrary, which
    # makes "the first N groups" differ from run to run.
    grouped = df.group_by(group_cols, maintain_order=True)

    for i, (group_key, group_df) in enumerate(grouped):
        if i >= max_groups:
            break

        print(f"Group: {group_key}")

        for col in other_cols:
            col_data = group_df[col].drop_nulls()

            print(f"  Column: {col}")
            if col_data.is_empty():
                print("     No data")
                continue

            if col_data.dtype.is_numeric():
                print(f"    Count: {col_data.len()}")
                print(f"    Mean: {col_data.mean()}")
                print(f"    Min: {col_data.min()}")
                print(f"    Max: {col_data.max()}")
                print(f"    Stddev: {col_data.std()}")
            else:
                print(f"    Count: {col_data.len()}")
                print(f"    Unique values: {col_data.n_unique()}")
                # sort=True orders by count descending, so row 0 is the top
                # value; the row unpacks as (value, count).
                top_val, top_count = col_data.value_counts(sort=True).row(0)
                print(f"    Most frequent: {top_val} (appears {top_count} times)")
        print("-" * 60)

In [26]:
# Summarise the first few groups when the ads are grouped by page.
analyze_polars_grouped(df_ads, ['page_id'])
# Same summary keyed by (page_id, ad_id). The dataset summary output reports
# ad_id as unique on every row (246745 unique of 246745), so each group here
# should contain a single ad.
analyze_polars_grouped(df_ads, ['page_id', 'ad_id'])


===== Grouped Analysis by ['page_id'] =====

Group: ('c26f0376627243618aee5b03d8ff810687884c865b6ed8d0932e8d04d76cad3b',)
  Column: ad_id
    Count: 4
    Unique values: 4
    Most frequent: d42837ff3c614804eda983270e1e75448ced427a833340831ea70ed9dfc100c8 (appears 1 times)
  Column: ad_creation_time
    Count: 4
    Unique values: 2
    Most frequent: 2024-08-20 (appears 3 times)
  Column: bylines
    Count: 4
    Unique values: 1
    Most frequent: JASMINE FOR US (appears 4 times)
  Column: currency
    Count: 4
    Unique values: 1
    Most frequent: USD (appears 4 times)
  Column: delivery_by_region
    Count: 4
    Unique values: 4
    Most frequent: {'Alabama': {'spend': 1, 'impressions': 77}, 'South Carolina': {'spend': 2, 'impressions': 100}, 'New Jersey': {'spend': 3, 'impressions': 182}, 'New Mexico': {'spend': 1, 'impressions': 73}, 'New York': {'spend': 7, 'impressions': 365}, 'North Carolina': {'spend': 4, 'impressions': 246}, 'North Dakota': {'spend': 0, 'impressions': 16