In [31]:
import pandas as pd
import glob
import os

# Path to base file and folder of other CSVs
base_csv = '../exit_mastid.csv'
folder_path = '../data'

# Load and convert mastid to float in base
base = pd.read_csv(base_csv)
base['mastid'] = pd.to_numeric(base['mastid'], errors='coerce')  # or .astype(float)
print(f"Loaded base table from {os.path.basename(base_csv)} with shape {base.shape}")

# Get list of other CSV files in the folder (excluding the base)
csv_files = [
    f for f in glob.glob(os.path.join(folder_path, '*.csv'))
    if not os.path.samefile(f, base_csv)
]

# Iteratively merge all other CSVs on mastid
merged = base.copy()
for file in csv_files:
    try:
        df = pd.read_csv(file)
        
        # Convert mastid to float to match base
        df['mastid'] = pd.to_numeric(df['mastid'], errors='coerce')

        print(f"Merging {os.path.basename(file)} with shape {df.shape}")

        # Drop duplicate mastids if needed
        df = df.drop_duplicates(subset='mastid')

        # Perform left join
        merged = merged.merge(df, on='mastid', how='left')
    except Exception as e:
        print(f"❌ Failed on {file}: {e}")

# Save final result
final_output_path = '../final_joined_students.csv'
merged.to_csv(final_output_path, index=False)
print(f"✅ Done! Final joined dataset saved to: {final_output_path}")


Loaded base table from exit_mastid.csv with shape (549768, 13)
Merging transcripts_mastid.csv with shape (183056, 9)


  df = pd.read_csv(file)


Merging masterbuild_master.csv with shape (1972904, 15)
Merging attendance_mastid.csv with shape (214929, 10)
Merging curtest_mastid.csv with shape (410826, 16)
Merging ec_unique.csv with shape (325992, 2)
✅ Done! Final joined dataset saved to: ../final_joined_students.csv


In [32]:
import pandas as pd
import numpy as np
final = pd.read_csv("../final_joined_students.csv")


  final = pd.read_csv("../final_joined_students.csv")


In [33]:
final['iep'] = final['plantype'].apply(lambda x: 1 if x == 'IEP' else 0)
final = final.drop(columns='plantype')

final.drop(columns='irm_prof',inplace=True)
final['act'] = np.where(final['pc_ACCO'].isna(), 0, 1)
final.drop(columns='swd_code',inplace=True)
final = final[[col for col in final.columns if not col.endswith('y')]]
final.columns = [col[:-2] if col.endswith('_x') else col for col in final.columns]


final = final[[col for col in final.columns if 'unweighted' not in col.lower()]]
final = final.drop(columns='eds_code')
final = final.drop(columns='grade')

In [34]:
final.columns

Index(['aig', 'eds', 'els', 'ethnic', 'fcs', 'hms', 'lea', 'mastid', 'mig',
       'schlcode', 'sex', 'swd', 'exit_code_desc', 'gpa_weighted_cum_grade_9',
       'gpa_weighted_cum_grade_10', 'gpa_weighted_cum_grade_11',
       'gpa_weighted_cum_grade_12', 'mil', 'absent_pc_grade_3',
       'absent_pc_grade_4', 'absent_pc_grade_5', 'absent_pc_grade_6',
       'absent_pc_grade_7', 'absent_pc_grade_8', 'absent_pc_grade_9',
       'absent_pc_grade_10', 'absent_pc_grade_11', 'pc_ACCO', 'pc_BIOL',
       'pc_MA04', 'pc_MA05', 'pc_MA06', 'pc_MA07', 'pc_MA08', 'pc_MTH1',
       'pc_RD04', 'pc_RD05', 'pc_RD06', 'pc_RD07', 'pc_RD08', 'pc_SC05',
       'pc_SC08', 'iep', 'act'],
      dtype='object')

In [35]:
na_counts = final.isna().sum(axis=1)
missing_summary = na_counts.value_counts().sort_index()

print("Number of rows by count of missing values:")
print(missing_summary)


Number of rows by count of missing values:
0      11778
1      86808
2      24348
4      71562
5       6321
6       3024
7         34
9       2976
10     16261
11      4906
13    145788
14     18119
15     23780
16       639
19      5504
20        32
24     30461
28     93808
29      3619
Name: count, dtype: int64


In [36]:
final_attendance = final.dropna(subset=['absent_pc_grade_9'])

In [37]:
na_counts = final_attendance.isna().sum(axis=1)
missing_summary = na_counts.value_counts().sort_index()

print("Number of rows by count of missing values:")
print(missing_summary)

Number of rows by count of missing values:
0     11778
1     86808
2     24348
4     71562
5      6321
6      3024
7        34
15     5518
19     5504
20       32
Name: count, dtype: int64


In [38]:
final_attendance_4 = final_attendance[final_attendance.isna().sum(axis=1) <= 4]

In [39]:
final_attendance_4.drop(columns=['pc_ACCO','pc_BIOL'],inplace=True)
final_attendance_4.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_attendance_4.drop(columns=['pc_ACCO','pc_BIOL'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_attendance_4.dropna(inplace=True)


In [40]:
final_attendance_4.to_csv(final_output_path, index=False) #this is a pretty good one
