# In [None]:
import pandas as pd
import glob
import os

# Path to base file and folder of other CSVs
base_csv = '../exit_mastid.csv'
folder_path = '../data'

# Load the base table and coerce the join key to a numeric dtype; values that
# cannot be parsed become NaN instead of raising.
base = pd.read_csv(base_csv)
base['mastid'] = pd.to_numeric(base['mastid'], errors='coerce')
print(f"Loaded base table from {os.path.basename(base_csv)} with shape {base.shape}")

# Collect every other CSV in the folder (excluding the base file itself).
# glob returns paths in arbitrary order, so sort them: the merge order decides
# the output column order and which overlapping columns receive the _x / _y
# suffixes, and that must be reproducible between runs and machines.
csv_files = sorted(
    f for f in glob.glob(os.path.join(folder_path, '*.csv'))
    if not os.path.samefile(f, base_csv)
)

# Iteratively left-join each CSV onto the base table on mastid.
merged = base.copy()
for file in csv_files:
    # Best-effort: a file that cannot be read or merged (e.g. it has no
    # 'mastid' column) is reported and skipped so the remaining files still
    # get joined.
    try:
        df = pd.read_csv(file)

        # Coerce the key the same way as in the base table so the join keys
        # are comparable.
        df['mastid'] = pd.to_numeric(df['mastid'], errors='coerce')

        print(f"Merging {os.path.basename(file)} with shape {df.shape}")

        # pandas matches NaN keys against each other when merging, so a row
        # with a missing/unparseable mastid would be attached to every base
        # row whose mastid is also NaN. Drop those rows, then keep the first
        # row per mastid so the left join cannot multiply base rows.
        df = df.dropna(subset=['mastid']).drop_duplicates(subset='mastid')

        # Left join: every base row is kept, unmatched rows get NaN.
        merged = merged.merge(df, on='mastid', how='left')
    except Exception as e:
        print(f"❌ Failed on {file}: {e}")

# Save final result
final_output_path = '../final_joined_students.csv'
merged.to_csv(final_output_path, index=False)
print(f"✅ Done! Final joined dataset saved to: {final_output_path}")









import pandas as pd
import numpy as np
# Reload the joined table written by the merge step.
final = pd.read_csv("../final_joined_students.csv")

# Binary flags derived from raw columns:
#   iep -> 1 where plantype is exactly 'IEP', 0 otherwise (missing included)
#   act -> 1 where pc_ACCO has a value, 0 where it is missing
final['iep'] = (final['plantype'] == 'IEP').astype(int)
final['act'] = final['pc_ACCO'].notna().astype(int)

# Drop the raw columns that are no longer needed.
final = final.drop(columns=['plantype', 'irm_prof', 'swd_code'])

# Resolve the duplicate columns created by the merges: discard the right-hand
# copies (suffix '_y') and strip the '_x' suffix from the left-hand copies.
# The test is for the '_y' suffix specifically; matching on a bare trailing
# 'y' would also discard any genuine column whose name happens to end in "y".
final = final[[col for col in final.columns if not col.endswith('_y')]]
final.columns = [col[:-2] if col.endswith('_x') else col for col in final.columns]

# Remove every column whose name mentions "unweighted" (case-insensitive),
# then the remaining columns that are not wanted in the output.
final = final[[col for col in final.columns if 'unweighted' not in col.lower()]]
final = final.drop(columns=['eds_code', 'grade'])

# Keep only rows with a value in absent_pc_grade_9, and report how many
# rows have 0, 1, 2, ... missing values across the remaining columns.
final_attendance = final.dropna(subset=['absent_pc_grade_9'])
na_counts = final_attendance.isna().sum(axis=1)
missing_summary = na_counts.value_counts().sort_index()

print("Number of rows by count of missing values:")
print(missing_summary)

# Keep rows with at most 4 missing values, drop pc_ACCO and pc_BIOL, and then
# require the remaining columns to be complete. Non-inplace calls return new
# frames, which avoids mutating a filtered view of final_attendance
# (SettingWithCopyWarning).
final_attendance_4 = (
    final_attendance[na_counts <= 4]
    .drop(columns=['pc_ACCO', 'pc_BIOL'])
    .dropna()
)
final_attendance_4.to_csv("../good_final.csv", index=False) #this is a pretty good one
