In [1]:
import duckdb
import glob
import os

# Path to base CSV and data folder
base_csv = '../exit2.csv'
folder_path = '../data'

# Start DuckDB session (persistent file)
con = duckdb.connect('student_merge.duckdb')

# Register base table
con.execute(f"""
    CREATE OR REPLACE TABLE base AS 
    SELECT * FROM read_csv_auto('{base_csv}', header=True, all_varchar=True)
""")
print(f"Registered base table from {os.path.basename(base_csv)}")

# Get list of all CSVs in folder, excluding the base
csv_files = [
    f for f in glob.glob(os.path.join(folder_path, '*.csv'))
    if not os.path.samefile(f, base_csv)
]

# Read and register remaining CSVs
table_names = []
for i, file in enumerate(csv_files):
    table_name = f"t{i}"
    table_names.append(table_name)

    con.execute(f"""
        CREATE OR REPLACE TABLE {table_name} AS 
        SELECT * FROM read_csv_auto('{file}', header=True, all_varchar=True)
    """)
    print(f"Registered {table_name} from {os.path.basename(file)}")

# Build LEFT JOIN chain starting from base
print("Performing left joins...")

join_sql = "SELECT * FROM base"
for table in table_names:
    join_sql = f"SELECT * FROM ({join_sql}) LEFT JOIN {table} USING (mastid)"

# Export to CSV
final_output_path = '../final_joined_students.csv'
con.execute(f"COPY ({join_sql}) TO '{final_output_path}' (HEADER, DELIMITER ',')")
print(f"✅ Done! Final joined dataset saved to: {final_output_path}")


Registered base table from exit2.csv
Registered t0 from access_master.csv
Registered t1 from masterbuild_master.csv
Registered t2 from collegeboard_master.csv
Registered t3 from ec_master.csv
Registered t4 from attendance_master.csv
Registered t5 from curtest_master.csv
Registered t6 from gpa_master.csv
Registered t7 from grad_master.csv
Performing left joins...
✅ Done! Final joined dataset saved to: ../final_joined_students.csv


In [2]:
import pandas as pd

final = pd.read_csv("../final_joined_students.csv")


  final = pd.read_csv("../final_joined_students.csv")


TypeError: 'tuple' object is not callable

In [3]:
final.shape

(7408636, 216)

In [1]:
import dask.dataframe as dd

df = dd.read_csv("../final_joined_students.csv",dtype=str)




In [2]:


# To get number of rows, compute it
rows_in_first = len(df.partitions[0].compute())
estimated_rows = rows_in_first * df.npartitions

print(f"Approx shape: ({estimated_rows}, {len(df.columns)})")


Approx shape: (42357249, 207)


In [3]:
print(df.dtypes)

grade                string[pyarrow]
lea                  string[pyarrow]
proflvl_rd           string[pyarrow]
proflvl_wr           string[pyarrow]
proflvl_ls           string[pyarrow]
                          ...       
diploma_type_code    string[pyarrow]
reporting_year       string[pyarrow]
collection_code_1    string[pyarrow]
course_study         string[pyarrow]
classification       string[pyarrow]
Length: 207, dtype: object


In [4]:
print(df.columns.tolist())


['grade', 'lea', 'proflvl_rd', 'proflvl_wr', 'proflvl_ls', 'proflvl_sp', 'proflvl_cmp', 'schlyear', 'cluster', 'scale_rd', 'scale_wr', 'scale_ls', 'scale_sp', 'scale_cmp', 'mastid', 'eds', 'eds_code', 'els', 'ethnic', 'fcs', 'grade_1', 'hms', 'irm_prof', 'lea_1', 'mil', 'schlcode', 'sex', 'swd', 'swd_code', 'birthdt', 'sex_1', 'grad_date', 'ethnic_1', 'blang', 'instname', 'psat_8_9_test_dt', 'psat_8_9_total', 'psat_8_9_ebrw', 'psat_8_9_math', 'psat_8_9_tsr', 'psat_8_9_tsw', 'psat_8_9_tsm', 'psat_8_9_sub_evid', 'psat_8_9_sub_cont', 'psat_8_9_sub_expr', 'psat_8_9_sub_conv', 'psat_8_9_sub_alge', 'psat_8_9_sub_prob', 'psat_8_9_ctsh', 'psat_8_9_ctss', 'psat_nmsqt_total', 'psat_nmsqt_ebrw', 'psat_nmsqt_math', 'psat_nmsqt_tsr', 'psat_nmsqt_tsw', 'psat_nmsqt_tsm', 'psat_nmsqt_sub_evid', 'psat_nmsqt_sub_cont', 'psat_nmsqt_sub_expr', 'psat_nmsqt_sub_conv', 'psat_nmsqt_sub_alge', 'psat_nmsqt_sub_prob', 'psat_nmsqt_sub_advm', 'psat_nmsqt_ctsh', 'psat_nmsqt_ctss', 'psat_10_test_dt', 'psat_10_total'