# Notebook Title

## Setup Python and R environment
you can ignore this section

In [5]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (16, 100)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import glob

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [3]:
%%R

# My commonly used R imports

require('tidyverse')

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.5.1     ✔ purrr   1.0.1
✔ tibble  3.2.1     ✔ dplyr   1.1.4
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Loading required package: tidyverse
1: package ‘ggplot2’ was built under R version 4.2.3 
2: package ‘dplyr’ was built under R version 4.2.3 


## 👉 download your data

You can write code here to download your dataset. Or if you already have it, just leave the URL in the comments and just load it into a pandas or R (or both) dataframe.

In [12]:
# https://civilrightsdata.ed.gov/data

In [11]:
# Get all CSV files in the current directory
csv_files = glob.glob("../data/2021-22-crdc-data/SCH/*.csv")

# Initialize empty list to store all dataframes
all_dfs = []

# First, let's read all CSV files and identify shared columns
all_columns = set()
shared_columns = set()
first_file = True

# Read each CSV and track columns
for file in csv_files:
    df = pd.read_csv(file)
    print(f"Loaded {file}, shape: {df.shape}")
    
    # Store the dataframe
    all_dfs.append(df)
    
    # Initialize shared columns with the first file
    if first_file:
        shared_columns = set(df.columns)
        first_file = False
    else:
        # Update shared columns (intersection)
        shared_columns = shared_columns.intersection(set(df.columns))
    
    # Update all columns (union)
    all_columns = all_columns.union(set(df.columns))

# Make sure COMBOKEY is in all dataframes
if "COMBOKEY" not in shared_columns:
    print("Error: COMBOKEY must be present in all CSV files")
    exit(1)

print(f"Shared columns across all files: {shared_columns}")
print(f"Total unique columns across all files: {len(all_columns)}")

# Start with the first dataframe
result_df = all_dfs[0]

# Merge with all other dataframes
for i, df in enumerate(all_dfs[1:], 2):
    # For shared columns (except COMBOKEY), we'll only keep from the first dataframe
    # by dropping them from the current dataframe before merging
    columns_to_drop = [col for col in shared_columns if col != "COMBOKEY"]
    df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')
    
    # Merge on COMBOKEY
    result_df = pd.merge(result_df, df_cleaned, on="COMBOKEY", how="left")
    print(f"Merged file {i}/{len(csv_files)}, current shape: {result_df.shape}")

Loaded ../2021-22-crdc-data/SCH/Referrals and Arrests.csv, shape: (98010, 84)
Loaded ../2021-22-crdc-data/SCH/School Support.csv, shape: (98010, 19)
Loaded ../2021-22-crdc-data/SCH/Algebra II.csv, shape: (98010, 29)
Loaded ../2021-22-crdc-data/SCH/Internet Access and Devices.csv, shape: (98010, 13)
Loaded ../2021-22-crdc-data/SCH/Geometry.csv, shape: (98010, 31)
Loaded ../2021-22-crdc-data/SCH/Corporal Punishment.csv, shape: (98010, 72)
Loaded ../2021-22-crdc-data/SCH/Interscholastic Athletics.csv, shape: (98010, 18)
Loaded ../2021-22-crdc-data/SCH/Biology.csv, shape: (98010, 29)
Loaded ../2021-22-crdc-data/SCH/Advanced Mathematics.csv, shape: (98010, 29)
Loaded ../2021-22-crdc-data/SCH/Justice Facilities.csv, shape: (98010, 16)
Loaded ../2021-22-crdc-data/SCH/COVID Directional Indicators.csv, shape: (98010, 12)
Loaded ../2021-22-crdc-data/SCH/Dual Enrollment.csv, shape: (98010, 29)
Loaded ../2021-22-crdc-data/SCH/Algebra I.csv, shape: (98010, 132)
Loaded ../2021-22-crdc-data/SCH/SAT a

In [20]:
zip_codes = pd.read_csv("../data/2021-22-crdc-data/LEA/LEA Characteristics.csv")
merged_df = pd.merge(result_df, zip_codes, on="LEAID", how="left")

In [22]:
merged_df.shape

(98010, 1900)

In [25]:
zip_codes.head()

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,LEA_ADDRESS,LEA_CITY,LEA_ZIP,CJJ,LEA_CRCOORD_SEX_IND,LEA_CRCOORD_RAC_IND,...,LEA_HBPOLICY_IND,LEA_HBPOLICYURL_IND,LEA_HBPOLICY_URL,LEA_ENR,LEA_PSENR_A3,LEA_PSENR_A4,LEA_PSENR_A5,LEA_ENR_NONLEAFAC,LEA_SCHOOLS,LEA_PS_IND
0,AL,ALABAMA,100002,Alabama Youth Services,1000 Industrial School Road,Mt Meigs,36057,Yes,Yes,Yes,...,Yes,No,-9,175,-9,-9,-9,0,1,No
1,AL,ALABAMA,100005,Albertville City,8379 US 431,Albertville,35950,No,Yes,Yes,...,Yes,Yes,https://simbli.eboardsolutions.com/Policy/View...,5902,0,122,0,0,6,Yes
2,AL,ALABAMA,100006,Marshall County,12380 US Highway 431 S,Guntersville,35976,No,Yes,Yes,...,Yes,Yes,https://www.marshallk12.org/site/handlers/file...,5813,6,20,205,0,12,Yes
3,AL,ALABAMA,100007,Hoover City,2810 Metropolitan Way,Hoover,35243,No,Yes,Yes,...,Yes,Yes,https://resources.finalsite.net/images/v169574...,13543,1,135,185,0,16,Yes
4,AL,ALABAMA,100008,Madison City,211 Celtic Drive,Madison,35758,No,Yes,Yes,...,Yes,Yes,https://www.madisoncity.k12.al.us/Page/2122,12241,32,272,40,0,11,Yes


In [23]:
merged_df.to_csv("../2021-22-crdc-data/merged_data.csv", index=False)

In [None]:
# gonna skip since i'm aggregating by zip code!