In [1]:

import pandas as pd

# Load data: semicolon-separated export, every column read as a string (dtype=str),
# which is why the origin filters further down compare against "0" / "1" as text.
# NOTE(review): read_csv still converts its default NA markers (e.g. "NA", "null")
# to missing values even with dtype=str — confirm that is intended for free-text columns.
df = pd.read_csv("data/users_combined.csv", sep=";", dtype=str)

# Columns to analyze
cols = [
    "Title_Text",
    "Title_PictogramId",
    "Location_Text",
    "Location_PictogramId",
    "Description_Text"
]

# Check whether the columns exist. "CalendarOrigin" is not analyzed itself, but the
# per-origin filters below index it, so validate it here to fail early with a clear
# message instead of a bare KeyError halfway through the report.
required_cols = cols + ["CalendarOrigin"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns in CSV: {missing}")

# Analysis function for I) all entries, II) entries from Google Calendar, or III) entries from Independo app
def analyze_df(subdf, columns=None):
    """Print emptiness statistics and top value counts for each analyzed column.

    For every column the report shows how many entries are empty or missing
    (NaN, empty string, or whitespace only), how many are non-empty, and the
    counts of the ten most frequent non-empty values.

    Parameters
    ----------
    subdf : pandas.DataFrame
        Frame (or filtered subset) to summarize. It must contain every
        analyzed column.
    columns : iterable of str, optional
        Columns to summarize. Defaults to the notebook-level ``cols`` list.

    Returns
    -------
    None
        The report is written to stdout.
    """
    if columns is None:
        columns = cols

    # Total amount of rows
    total_rows = len(subdf)
    print(f"Total rows: {total_rows}\n")

    def share(count):
        # A zero-row subset has no meaningful ratio; dividing would print "nan%"
        # (with a numpy RuntimeWarning), so report 0 instead.
        return count / total_rows if total_rows else 0.0

    # Iterate each column to count empty values and value frequencies
    for col in columns:
        series = subdf[col].fillna("").astype(str)
        empty_mask = series.str.strip() == ""
        empty_count = int(empty_mask.sum())
        nonempty_count = total_rows - empty_count

        print(f"Column: {col}")
        print(f"    Empty or missing: {empty_count} ({share(empty_count):.1%})")
        print(f"    Non-empty: {nonempty_count} ({share(nonempty_count):.1%})")

        # Calculate frequencies of the non-empty values (most frequent first)
        freqs = series[~empty_mask].value_counts()
        if freqs.empty:
            print("    No non-empty entries")
        else:
            print("    Top 10 most frequent values:")
            # Only the counts are printed; the value itself is replaced by the
            # literal placeholder "xxx" (presumably to keep entry contents out
            # of the saved output — confirm before changing).
            for cnt in freqs.head(10):
                print(f"    xxx: {cnt}")
    print("\n")

# Run the same report three times: once on everything, then once per calendar
# origin. An origin code of None means "no filter"; the codes are strings because
# the CSV was loaded with dtype=str.
report_plan = (
    ("Analyzing whole dataset", None),
    ("Analyzing data created in Independo app (CalendarOrigin = 0)", "0"),
    ("Analyzing data created in Google Calendar (CalendarOrigin = 1)", "1"),
)

for heading, origin_code in report_plan:
    print(heading)
    if origin_code is None:
        selection = df
    else:
        selection = df[df["CalendarOrigin"] == origin_code]
    analyze_df(selection)


Analyzing whole dataset
Total rows: 12320

Column: Title_Text
    Empty or missing: 0 (0.0%)
    Non-empty: 12320 (100.0%)
    Top 10 most frequent values:
    xxx: 2268
    xxx: 2214
    xxx: 1475
    xxx: 1061
    xxx: 993
    xxx: 989
    xxx: 753
    xxx: 729
    xxx: 265
    xxx: 260
Column: Title_PictogramId
    Empty or missing: 3 (0.0%)
    Non-empty: 12317 (100.0%)
    Top 10 most frequent values:
    xxx: 2269
    xxx: 2214
    xxx: 1475
    xxx: 1460
    xxx: 804
    xxx: 753
    xxx: 731
    xxx: 519
    xxx: 265
    xxx: 263
Column: Location_Text
    Empty or missing: 9697 (78.7%)
    Non-empty: 2623 (21.3%)
    Top 10 most frequent values:
    xxx: 1719
    xxx: 525
    xxx: 260
    xxx: 108
    xxx: 2
    xxx: 1
    xxx: 1
    xxx: 1
    xxx: 1
    xxx: 1
Column: Location_PictogramId
    Empty or missing: 9957 (80.8%)
    Non-empty: 2363 (19.2%)
    Top 10 most frequent values:
    xxx: 1719
    xxx: 528
    xxx: 108
    xxx: 2
    xxx: 1
    xxx: 1
    xxx: 1
    xxx: 1