In [5]:
import csv
from google.colab import drive
import math
from collections import Counter, defaultdict
import os

# Mount Google Drive at /content/drive (uses the google.colab helper imported above);
# the dataset paths defined in the next cell live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Directory on the mounted Drive that contains the election CSV files.
# Kept in one place so relocating the data only requires a single edit.
DATA_DIR = '/content/drive/My Drive/Presidential Election Data'

# Full paths to the three datasets analysed by this notebook.
fb_posts = f'{DATA_DIR}/2024_fb_posts_president_scored_anon.csv'
fb_ads = f'{DATA_DIR}/2024_fb_ads_president_scored_anon.csv'
tw_posts = f'{DATA_DIR}/2024_tw_posts_president_scored_anon.csv'

In [7]:
import csv
import math
from collections import Counter

def is_number(s):
    """Return True when ``s`` can be converted with ``float()``.

    Defined at module level so it is created once instead of on every
    column iteration. Note that ``float`` also accepts spellings such as
    'nan' and 'inf', so those strings count as numeric here.
    """
    try:
        float(s)
        return True
    except ValueError:
        return False


def analyze_csv(filepath):
    """Print a per-column summary of a CSV file, followed by pooled numeric stats.

    The first row is used as the header. For every column the function prints
    the number of collected values and then either:

    * mean / min / max / standard deviation, when at least one value parses
      as a float (values that do not parse are ignored for these figures), or
    * the number of distinct values and the most frequent one otherwise.

    Finally, the same numeric statistics are printed for all numeric values of
    all columns pooled together. Standard deviations here are population
    values (the sum of squares is divided by N, not N - 1).

    Args:
        filepath: Path of the CSV file; it is opened as UTF-8.

    Returns:
        None. All results are written to stdout.
    """
    with open(filepath, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # An empty file has no header row; default to "no columns" rather
        # than letting next() raise StopIteration.
        header = next(reader, [])
        columns = {col: [] for col in header}
        for row in reader:
            # zip() stops at the shorter sequence, so cells beyond the header
            # are dropped and short rows simply contribute fewer values.
            for col, value in zip(header, row):
                columns[col].append(value)

    filename = os.path.basename(filepath)
    print(f'----- Analysis for {filename} -----\n')
    all_numeric = []
    for col, values in columns.items():
        numeric_values = [float(v) for v in values if is_number(v)]
        print(f'Column: {col}')
        print(f'  Count: {len(values)}')
        if numeric_values:
            mean = sum(numeric_values) / len(numeric_values)
            min_val = min(numeric_values)
            max_val = max(numeric_values)
            stddev = math.sqrt(sum((x - mean) ** 2 for x in numeric_values) / len(numeric_values))
            print(f'  Mean: {mean}')
            print(f'  Min: {min_val}')
            print(f'  Max: {max_val}')
            print(f'  Stddev: {stddev}')
            # Keep the values for the pooled statistics printed at the end.
            all_numeric.extend(numeric_values)
        else:
            counter = Counter(values)
            print(f'  Unique values: {len(counter)}')
            most_common = counter.most_common(1)
            if most_common:
                print(f'  Most frequent: {most_common[0][0]} (appears {most_common[0][1]} times)')
        print()

    if all_numeric:
        overall_count = len(all_numeric)
        overall_mean = sum(all_numeric) / overall_count
        overall_min = min(all_numeric)
        overall_max = max(all_numeric)
        overall_stddev = math.sqrt(sum((x - overall_mean) ** 2 for x in all_numeric) / overall_count)
        print('Overall Dataset Analysis:')
        print(f'  Count: {overall_count}')
        print(f'  Mean: {overall_mean}')
        print(f'  Min: {overall_min}')
        print(f'  Max: {overall_max}')
        print(f'  Stddev: {overall_stddev}')
    print('-' * 80, '\n')

In [27]:
# Print the per-column summary for each dataset, in the same order as before.
for dataset_path in (fb_posts, fb_ads, tw_posts):
    analyze_csv(dataset_path)

----- Analysis for 2024_fb_posts_president_scored_anon.csv -----

Column: Facebook_Id
  Count: 19009
  Unique values: 21
  Most frequent: 32fc18da91029ff09bf74fe9887eace6b5d2145809d583f696e344530508b064 (appears 9013 times)

Column: post_id
  Count: 19009
  Unique values: 19009
  Most frequent: 8570b69695e00d8f06b12398ed525497e1712b5369c6fc2138fe98f69811c138 (appears 1 times)

Column: Page Category
  Count: 19009
  Unique values: 7
  Most frequent: PERSON (appears 9453 times)

Column: Page Admin Top Country
  Count: 19009
  Unique values: 2
  Most frequent: US (appears 16280 times)

Column: Post Created
  Count: 19009
  Unique values: 18951
  Most frequent: 2023-11-14 11:11:44 EST (appears 2 times)

Column: Post Created Date
  Count: 19009
  Unique values: 425
  Most frequent: 2024-10-31 (appears 103 times)

Column: Post Created Time
  Count: 19009
  Unique values: 16102
  Most frequent: 19:42:00 (appears 7 times)

Column: Type
  Count: 19009
  Unique values: 10
  Most frequent: Link (

## After Grouping

In [26]:
def _to_float(text):
    """Return ``float(text)``, or None when ``text`` is not a valid float literal."""
    try:
        return float(text)
    except ValueError:
        return None


def analyze_grouped_ads(filepath, group_cols, max_groups=3):
    """Group the rows of a CSV file and print per-column statistics for each group.

    Rows are grouped by the stripped values of ``group_cols``. For each group
    (in order of first appearance) and each column, the function prints the
    number of non-blank values and then either:

    * mean / min / max / sample standard deviation (N - 1 denominator, 0.0
      for a single value) when at least one value parses as a float, or
    * the number of distinct values and the most frequent one otherwise.

    Args:
        filepath: Path of the CSV file; it is opened as UTF-8.
        group_cols: List of column names whose values form the group key.
        max_groups: Maximum number of groups to print (default 3, matching the
            previous fixed limit). Pass None to print every group.

    Returns:
        None. All results are written to stdout.

    Raises:
        KeyError: If a name in ``group_cols`` is not a column of the file.
    """
    with open(filepath, newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        groups = defaultdict(list)
        for row in reader:
            # DictReader fills cells missing from a short row with None;
            # treat those as blank so .strip() cannot fail.
            key = tuple((row[col] or '').strip() for col in group_cols)
            groups[key].append(row)

    filename = os.path.basename(filepath)
    print(f'\n--- Aggregated Analysis by {group_cols} for {filename} ---')

    for i, (group_key, rows) in enumerate(groups.items()):
        if max_groups is not None and i >= max_groups:
            break  # Keep the output short: only the first max_groups groups

        print(f'\nGroup: {group_key}')
        columns = defaultdict(list)

        # Accumulate all values for each column in the group
        for row in rows:
            for col, val in row.items():
                if col is None:
                    # Surplus cells of an over-long row are collected by
                    # DictReader under the key None as a list; they belong
                    # to no named column, so skip them.
                    continue
                columns[col].append(val or '')

        for col, values in columns.items():
            stripped = (v.strip() for v in values)
            cleaned_values = [v for v in stripped if v != '']
            # Parse each value once; non-numeric values yield None and are dropped.
            parsed = (_to_float(v) for v in cleaned_values)
            numeric_vals = [x for x in parsed if x is not None]

            print(f'  Column: {col}')
            print(f'    Count: {len(cleaned_values)}')

            if numeric_vals:
                mean = sum(numeric_vals) / len(numeric_vals)
                min_val = min(numeric_vals)
                max_val = max(numeric_vals)
                # Sample standard deviation; undefined for one value, so report 0.0.
                stddev = (
                    math.sqrt(sum((x - mean) ** 2 for x in numeric_vals) / (len(numeric_vals) - 1))
                    if len(numeric_vals) > 1 else 0.0
                )
                print(f'    Mean: {mean}')
                print(f'    Min: {min_val}')
                print(f'    Max: {max_val}')
                print(f'    Stddev: {stddev}')
            else:
                counter = Counter(cleaned_values)
                print(f'    Unique values: {len(counter)}')
                if counter:
                    most_common = counter.most_common(1)[0]
                    print(f'    Most frequent: {most_common[0]} (appears {most_common[1]} times)')
        print('-' * 50)


In [25]:
# Summarise the ads file per page first, then per (page, ad) pair.
for grouping in (['page_id'], ['page_id', 'ad_id']):
    analyze_grouped_ads(fb_ads, grouping)


--- Aggregated Analysis by ['page_id'] for 2024_fb_ads_president_scored_anon.csv ---

Group: ('4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230',)
  Column: page_id
    Count: 33
    Unique values: 1
    Most frequent: 4ff23a48b53d988df50ddfebb0e442a984ab8f94e874ef9b9cb34394e0c5d230 (appears 33 times)
  Column: ad_id
    Count: 33
    Unique values: 33
    Most frequent: 0ddb025b8544e2d58e6977ad417e742a52522b3e1fc1c9d9b61c57148f8d72fc (appears 1 times)
  Column: ad_creation_time
    Count: 33
    Unique values: 5
    Most frequent: 2024-10-13 (appears 13 times)
  Column: bylines
    Count: 33
    Unique values: 1
    Most frequent: Texas Organizing Project PAC (appears 33 times)
  Column: currency
    Count: 33
    Unique values: 1
    Most frequent: USD (appears 33 times)
  Column: delivery_by_region
    Count: 33
    Unique values: 21
    Most frequent: {'Texas': {'spend': 149, 'impressions': 17499}} (appears 4 times)
  Column: demographic_distribution
    Count: 33
