# Viewing RDS (OPTIONAL)

In [None]:
!pip -q install pyreadr
import glob
import pandas as pd
import pyreadr

In [None]:
# Optional preview of an .rds file sitting in /content.
# Indexing an empty glob result would raise a bare IndexError, so check
# explicitly and fail with an actionable message instead.
rds_candidates = sorted(glob.glob("/content/*.rds"))
if not rds_candidates:
    raise FileNotFoundError(
        "No .rds file found in /content. Upload one before running this cell."
    )
if len(rds_candidates) > 1:
    print("Multiple .rds files found; previewing the first of:", rds_candidates)

rds_path = rds_candidates[0]
print("Reading:", rds_path)

# read_r returns a mapping of objects; preview the first one it contains.
res = pyreadr.read_r(rds_path)
df = next(iter(res.values()))

# Show every column in the preview instead of an elided middle section.
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 0)

print("shape:", df.shape)
display(df.head())

# DATA STACKING

In [None]:
import pandas as pd
import numpy as np
import re
import os
import pyreadr
from google.colab import files

In [None]:
REPO_DIR = "/content/"
# Main CSV that stores the collected data.
# os.path.join avoids the doubled slash ("/content//...") that plain string
# interpolation produced because REPO_DIR already ends with "/".
MASTER_OUTPUT = os.path.join(REPO_DIR, "standardized_master.csv")

In [None]:
# The metadata sheet is read from an absolute path under /content.
# Upload metadata.csv there once before running this cell.
# Its header row defines the column layout of the master file.

METADATA_NAME = "/content/metadata.csv"

master_df = pd.read_csv(METADATA_NAME)

# Trim stray whitespace from header labels.
master_df.columns = [label.strip() for label in master_df.columns]

# Drop the "Unnamed: N" placeholder columns pandas creates for blank headers.
is_placeholder = master_df.columns.str.contains(r"^Unnamed")
master_df = master_df.loc[:, ~is_placeholder]

master_columns = master_df.columns.tolist()
print("Loaded metadata sheet")
print("Number of master columns:", len(master_columns))
# print("First few columns:", master_columns[:10])

In [None]:
# FIND DATA FILE - CSV VERSION
import os

# Every CSV currently in /content, sorted so the file picked below is
# deterministic (os.listdir order is arbitrary).
csv_files = sorted(
    os.path.join("/content", f)
    for f in os.listdir("/content")
    if f.lower().endswith(".csv")
)

# Exclude the metadata sheet and any previously produced standardized_* output.
data_files = [
    f for f in csv_files
    if os.path.basename(f) != "metadata.csv"
    and not os.path.basename(f).startswith("standardized_")
]

print("CSV files found in /content:", [os.path.basename(f) for f in csv_files])

if not data_files:
    raise RuntimeError("No data CSV found (only metadata.csv and/or standardized_*.csv). Drag a raw data file in.")

if len(data_files) > 1:
    print(
        "Multiple data CSVs found; using the first of:",
        [os.path.basename(f) for f in data_files],
    )

# data_fn must be a single path string: pd.read_csv below and the later
# os.path.basename / os.path.exists / os.remove calls all reject a list.
data_fn = data_files[0]
print("Using data file:", data_fn)

# The python engine with on_bad_lines="warn" reports malformed rows instead of
# aborting the whole read.
df_raw = pd.read_csv(
    data_fn,
    engine="python",
    on_bad_lines="warn"
)
print(f"\n Rows: {len(df_raw)}")
print(f" Columns ({len(df_raw.columns)}):")
print(list(df_raw.columns))

In [None]:
# # FIND DATA FILE - RDS VERSION
# (Disabled alternative to the CSV cell: uncomment to load a raw .rds file
#  from /content into df_raw instead.)

# import os

# rds_files = [
#     os.path.join("/content", f)
#     for f in os.listdir("/content")
#     if f.lower().endswith(".rds")
# ]
# print("RDS files found in /content:", [os.path.basename(f) for f in rds_files])

# data_files = [
#     f for f in rds_files
#     if os.path.basename(f) != "metadata.csv"
#     and not os.path.basename(f).startswith("standardized_")
# ]

# if not data_files:
#     raise RuntimeError("No data RDS found (only metadata.csv and/or standardized_*.rds). Upload a raw .rds file in /content.")

# # data_fn must be a single path, not the whole list
# data_fn = data_files[0]
# print("Using data file:", data_fn)

# result = pyreadr.read_r(data_fn)
# print("Objects inside RDS:", list(result.keys()))
# df_raw = next(iter(result.values()))

# print(f"\n Rows: {len(df_raw)}")
# print(f" Columns ({len(df_raw.columns)}):")
# print(list(df_raw.columns))

In [None]:
def normalize_name(name: str) -> str:
    """Return a canonical form of a column name for case-insensitive matching.

    The value is converted to ``str``, stripped of surrounding whitespace,
    has every internal whitespace run collapsed to a single space, and is
    lower-cased. ``None`` maps to the empty string.
    """
    if name is None:
        return ""
    return re.sub(r"\s+", " ", str(name).strip()).lower()

In [None]:
def standardize_df(df_raw: pd.DataFrame, master_cols: list, meta: dict) -> pd.DataFrame:
    """Project a raw frame onto the master column layout.

    Args:
        df_raw: The raw data; its index is reused for the result.
        master_cols: Column names, in order, of the master layout.
        meta: Mapping of master column name -> constant value to fill in for
            every row. Keys that are not master columns are reported and
            skipped.

    Returns:
        A new DataFrame with exactly ``master_cols`` as columns. Raw columns
        whose normalized name equals a master column's normalized name are
        copied across; all other master columns stay empty.

    Matched and unmatched raw columns are printed as a report.
    """
    # Lookup tables keyed by normalized name. If two names normalize to the
    # same key, the later one replaces the earlier one in the table.
    source_by_key = {normalize_name(label): label for label in df_raw.columns}
    target_by_key = {normalize_name(label): label for label in master_cols}

    # Empty frame in the master layout, aligned to the raw frame's index.
    out_df = pd.DataFrame(index=df_raw.index, columns=master_cols)

    # Constant metadata values are broadcast down their columns.
    for col, value in meta.items():
        if col not in out_df.columns:
            print(f"Meta column '{col}' not in master, skipping.")
            continue
        out_df[col] = value

    matched_pairs = []
    unmatched_raw = []

    # Copy each raw column whose normalized name exists in the master layout.
    # This runs after the metadata fill, so a matching raw column overwrites it.
    for key, source_col in source_by_key.items():
        if key not in target_by_key:
            unmatched_raw.append(source_col)
            continue
        target_col = target_by_key[key]
        out_df[target_col] = df_raw[source_col]
        matched_pairs.append((source_col, target_col))

    print("Matched columns (raw â†’ master):")
    for r, m in matched_pairs:
        print(f"  {r} â†’ {m}")

    if unmatched_raw:
        print("\nRaw columns with NO match in master header:")
        for leftover in unmatched_raw:
            print(" ", leftover)

    return out_df

In [None]:
def random_consistency_check(df_raw, df_std, n_samples=25):
    """Spot-check that randomly sampled rows agree between two frames.

    Columns are paired by normalized name. Up to ``n_samples`` row positions
    are drawn without replacement using NumPy's global random state, and the
    paired cells are compared by position (``iloc``), so both frames must be
    in the same row order. A cell that is missing on both sides counts as a
    match.

    Args:
        df_raw: The raw frame.
        df_std: The standardized frame to compare against.
        n_samples: Maximum number of rows to sample.

    Returns:
        None. Findings are printed.
    """
    raw_cols = list(df_raw.columns)
    std_cols = list(df_std.columns)

    norm_raw = {normalize_name(c): c for c in raw_cols}
    norm_std = {normalize_name(c): c for c in std_cols}

    # Normalized names present on both sides, in raw-column order.
    common_norm = [key for key in norm_raw if key in norm_std]

    print(f"Raw columns: {len(raw_cols)}")
    print(f"Std columns: {len(std_cols)}")
    print(f"Overlapping columns to compare: {len(common_norm)}")

    if not common_norm:
        print("No overlapping columns to compare. Check your schema.")
        return

    # (raw column, std column) pairs that will be compared cell by cell.
    column_pairs = [(norm_raw[key], norm_std[key]) for key in common_norm]
    raw_overlap_cols = [raw_name for raw_name, _ in column_pairs]
    std_overlap_cols = [std_name for _, std_name in column_pairs]

    n = min(n_samples, len(df_raw))
    sample_idx = np.random.choice(len(df_raw), size=n, replace=False)

    mismatches_total = 0

    for idx in sample_idx:
        print("\n" + "=" * 80)
        print(f"Row index {idx}")

        # Fetch each row once and reuse it for both display and comparison.
        raw_row = df_raw.iloc[idx]
        std_row = df_std.iloc[idx]
        raw_row_overlap = raw_row[raw_overlap_cols]
        std_row_overlap = std_row[std_overlap_cols]

        print("\nRAW (overlapping columns):")
        print(raw_row_overlap.to_dict())

        print("\nSTD (overlapping columns):")
        print(std_row_overlap.to_dict())

        row_mismatches = 0
        for r_col, s_col in column_pairs:
            v_raw = raw_row[r_col]
            v_std = std_row[s_col]

            # NaN never equals NaN, so treat "missing on both sides" as equal.
            if pd.isna(v_raw) and pd.isna(v_std):
                continue

            if v_raw != v_std:
                if row_mismatches == 0:
                    print("\n  Mismatches in this row:")
                print(f"    {r_col} (raw) = {repr(v_raw)}  |  {s_col} (std) = {repr(v_std)}")
                row_mismatches += 1
                mismatches_total += 1

        if row_mismatches == 0:
            print("\n  All overlapping columns match for this row.")

    if mismatches_total:
        print(f"\nFound {mismatches_total} mismatches across {n} sampled rows.")
    else:
        print("\nAll sampled rows match perfectly on all overlapping columns.")

In [None]:
# EDIT THIS LINE FOR EACH NEW CSV/RDS FILE THAT IS ADDED TO THE MASTER
# (one tab-separated row: main folder, levels 1-5, file name)
metadata_line = 'covid_19	simulation	predictions				ols_prediction_results.rds'
parts = metadata_line.strip().split("\t")

if len(parts) != 7:
    raise ValueError(f"Expected 7 tab-separated values, got {len(parts)}: {parts}")

# Pair each label with the value at the same position in the pasted row.
meta = dict(zip(
    ("Main folder", "Level 1", "Level 2", "Level 3", "Level 4", "Level 5", "df_name"),
    parts,
))

# Guard against pasting the metadata row for a different file than the one loaded.
provided_name = meta["df_name"].strip()
actual_name = os.path.basename(data_fn).strip()

if provided_name != actual_name:
    raise ValueError(
        f"df_name from metadata ({provided_name}) does NOT match actual file name ({actual_name})."
    )
print(f"df_name matches file name: {provided_name}")

print("Parsed meta:")
for k, v in meta.items():
    print(f"  {k}: {v}")

In [None]:
# Standardize the raw file and merge it into the master CSV on disk.
standardized_df = standardize_df(df_raw, master_columns, meta)

if os.path.exists(MASTER_OUTPUT):
    existing = pd.read_csv(MASTER_OUTPUT, low_memory=False)
    existing.columns = [c.strip() for c in existing.columns]

    # Record the original size up front. The summary print below needs it on
    # both branches; previously it was only set inside the df_name branch,
    # which raised NameError when the master had no df_name column.
    before = len(existing)

    if "df_name" in existing.columns:
        # Drop old rows for this df_name so that re-running does not duplicate them.
        existing = existing[existing["df_name"] != provided_name]
        removed = before - len(existing)
        print(f"ðŸ§¹ Removed {removed} old rows for df_name = {provided_name}")
    else:
        print("'df_name' column not found in existing master, not dropping old rows.")

    # Align the new rows to the column set and order of the master on disk.
    standardized_df = standardized_df.reindex(columns=existing.columns)

    combined = pd.concat([existing, standardized_df], ignore_index=True)
    print(f"Existing master had {before} rows, now {len(combined)} rows.")
else:
    combined = standardized_df.copy()
    print("No existing master found, creating a new one.")

combined.to_csv(MASTER_OUTPUT, index=False)
print(f"Master file updated: {MASTER_OUTPUT} (now {len(combined)} rows)")

In [None]:
# Pull back only the rows that belong to the file just merged, and renumber
# both frames from 0 so they can be compared by position.
belongs_to_file = combined["df_name"] == provided_name
std_this = combined[belongs_to_file].copy().reset_index(drop=True)
df_raw_reset = df_raw.reset_index(drop=True)

print("\nRaw rows:", len(df_raw_reset))
print("Std rows for this df_name:", len(std_this))

# A positional comparison is meaningless unless both sides have the same length.
if len(std_this) != len(df_raw_reset):
    raise ValueError("Row count mismatch between raw and standardised subset!")
print("Row counts match for this file. Running random consistency check...")

random_consistency_check(df_raw_reset, std_this, n_samples=25)

In [None]:
# ---------- REMOVE THE RAW DATA FILE (metadata.csv and the master are kept) ----------
# Clearing the processed file leaves /content ready for the next raw upload.
if not os.path.exists(data_fn):
    print(f"\nTried to delete but file not found: {data_fn}")
else:
    os.remove(data_fn)
    print(f"\nDeleted local data file: {data_fn}")

In [None]:
# Mount Google Drive at /content/drive; the backup cell below copies the
# master file into a folder under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import os
import shutil
import datetime

# Make sure the backup folder exists on the mounted Drive.
os.makedirs("/content/drive/MyDrive/covid_sim_indl", exist_ok=True)

# Timestamped file name so earlier backups are never overwritten.
backup_path = f"/content/drive/MyDrive/covid_sim_indl/8_standardized_master_{datetime.datetime.now():%Y%m%d_%H%M%S}.csv"
shutil.copy("/content/standardized_master.csv", backup_path)

### COMBINE AND THEN BREAK INTO GITHUB-FRIENDLY CHUNKS

In [None]:
import os
import pandas as pd

# Split the master CSV into numbered part files, aiming for roughly
# target_mb each and never exceeding max_mb (unless a chunk is already
# down to 5000 rows or fewer).
src_path = "/content/standardized_master.csv"
total_bytes = os.path.getsize(src_path)
print(f"Source size: {total_bytes / (1024*1024):.2f} MB")

df_all = pd.read_csv(src_path, low_memory=False)
n_rows = len(df_all)
print(f"Total rows: {n_rows}")

# An empty master would otherwise fail below with a bare ZeroDivisionError.
if n_rows == 0:
    raise ValueError(f"{src_path} contains no data rows; nothing to split.")

target_mb = 50
max_mb    = 90

# Estimate how many rows fit in one target-sized chunk from the average
# on-disk size of a row. Clamp to at least 1: a zero-row chunk would never
# advance row_start and the outer loop would run forever.
bytes_per_row = total_bytes / n_rows
rows_per_chunk_est = max(1, int((target_mb * 1024 * 1024) / bytes_per_row))
print(f"Estimated rows per ~{target_mb} MB chunk: {rows_per_chunk_est}")

out_folder = "/content/standardized_parts"
os.makedirs(out_folder, exist_ok=True)

row_start = 0
part = 1

while row_start < n_rows:
    rows_this_chunk = rows_per_chunk_est

    # Write the chunk, measure it on disk, and shrink and retry if it is too big.
    while True:
        row_end = min(row_start + rows_this_chunk, n_rows)
        chunk = df_all.iloc[row_start:row_end]

        out_path = os.path.join(out_folder, f"standardized_master_part_{part:02d}.csv")
        chunk.to_csv(out_path, index=False)

        size_mb = os.path.getsize(out_path) / (1024 * 1024)
        pct = (row_end / n_rows) * 100

        # Accept the chunk if it is within the size limit, or if it is already
        # small in row terms (this stops the shrinking from going on forever).
        if size_mb <= max_mb or (row_end - row_start) <= 5000:
            print(f"ðŸ’¾ {os.path.basename(out_path)} | rows {row_start}-{row_end-1} "
                  f"({len(chunk)}) | {size_mb:.2f} MB | {pct:.2f}% of data")
            break
        else:
            # Too big: delete the file, shrink the row count, and retry.
            os.remove(out_path)
            rows_this_chunk = int(rows_this_chunk * 0.7)  # shrink by 30%
            print(f"Chunk too big ({size_mb:.2f} MB). "
                  f"Retrying with ~{rows_this_chunk} rows...")

    row_start = row_end
    part += 1

print("\nDone! All parts are in /content/standardized_parts")