In [12]:
import pandas as pd
import glob
import os

# Path to base file and folder of other CSVs
base_csv = '../exit_mastid.csv'
folder_path = '../data'

# Load and convert mastid to float in base
base = pd.read_csv(base_csv)
base['mastid'] = pd.to_numeric(base['mastid'], errors='coerce')  # or .astype(float)
print(f"Loaded base table from {os.path.basename(base_csv)} with shape {base.shape}")

# Get list of other CSV files in the folder (excluding the base)
csv_files = [
    f for f in glob.glob(os.path.join(folder_path, '*.csv'))
    if not os.path.samefile(f, base_csv)
]

# Iteratively merge all other CSVs on mastid
merged = base.copy()
for file in csv_files:
    try:
        df = pd.read_csv(file)
        
        # Convert mastid to float to match base
        df['mastid'] = pd.to_numeric(df['mastid'], errors='coerce')

        print(f"Merging {os.path.basename(file)} with shape {df.shape}")

        # Drop duplicate mastids if needed
        df = df.drop_duplicates(subset='mastid')

        # Perform left join
        merged = merged.merge(df, on='mastid', how='left')
    except Exception as e:
        print(f"❌ Failed on {file}: {e}")

# Save final result
final_output_path = '../final_joined_students.csv'
merged.to_csv(final_output_path, index=False)
print(f"✅ Done! Final joined dataset saved to: {final_output_path}")


Loaded base table from exit_mastid.csv with shape (549768, 13)
Merging transcripts_mastid.csv with shape (183056, 9)


  df = pd.read_csv(file)


Merging masterbuild_master.csv with shape (1972904, 15)
Merging attendance_mastid.csv with shape (188021, 5)
Merging curtest_mastid.csv with shape (410826, 16)
Merging ec_unique.csv with shape (325992, 2)
✅ Done! Final joined dataset saved to: ../final_joined_students.csv


In [14]:
import pandas as pd
import numpy as np
final = pd.read_csv("../final_joined_students.csv")


  final = pd.read_csv("../final_joined_students.csv")


In [15]:
final['iep'] = final['plantype'].apply(lambda x: 1 if x == 'IEP' else 0)
final = final.drop(columns='plantype')


In [16]:
na_counts = final.isna().sum(axis=1)
missing_summary = na_counts.value_counts().sort_index()

print("Number of rows by count of missing values:")
print(missing_summary)


Number of rows by count of missing values:
1       1422
2      18044
3     100398
4      25981
5         16
6       1073
7         55
8         88
9         55
10       658
11      2784
12       973
13     10285
14    210001
15     21797
16     21846
17     29291
20        48
21       583
23        17
24       949
25      1392
26       289
27       835
28     19297
29     77940
41      3651
Name: count, dtype: int64
