# Import Libraries

In [35]:
import pandas as pd
import numpy as np
import csv
# import pyodbc
import os
import warnings

# Constants

In [36]:
# Define the years for each dataframe
years = [2016, 2017, 2018, 2019, 2021, 2022]

In [37]:
def safe_convert(val):
    try:
        return int(val)
    except ValueError:
        print(f"Value {val} can't be converted to int")
        return None
    
def import_mdb(MDBs, DRV, PWD, NAMES):
    
    databases = {}
    
    for MDB, NAME in zip(MDBs, NAMES):
        # connect to db
        con = pyodbc.connect('DRIVER={};DBQ={};PWD={}'.format(DRV,MDB,PWD))
        cur = con.cursor()

        # List all tables in the database
        tables = list(map(lambda t: t.table_name, con.cursor().tables(tableType='TABLE')))

        # Initialize an empty dictionary to hold your dataframes and databases
        database = {}

        # Try to read each table one by one
        for table in tables:
            try:
                df = pd.read_sql(f'SELECT * FROM [{table}]', con)  # enclose table name in brackets
                database[table] = df
                print(f"Successfully read table: {table}")
            except Exception as e:
                print(f"Failed to read table: {table}")
                print(f"Error: {e}")
        databases[NAME] = database
        
    return databases

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore', 'pandas only support SQLAlchemy connectable.*')
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
# warnings.filterwarnings('ignore', category=pd.core.common.SettingWithCopyWarning)

# Import Enrollment Data

In [38]:
# Enroll Data Filepaths
ENROLL_PATHS = [
               '../../data/enrollment_2018/ENROLL2018.csv',
               '../../data/enrollment_2019/ENROLL2019.csv',
               '../../data/enrollment_2021/ENROLL2021.csv',
               '../../data/enrollment_2022/ENROLL2022.csv',
              ]
ENROLL_NAMES = [
               'ENROLL2018',
               'ENROLL2019',
               'ENROLL2021',
               'ENROLL2022',
              ]

data16 = None
data17 = None
data18 = pd.read_csv(ENROLL_PATHS[0])
data19 = pd.read_csv(ENROLL_PATHS[1])
data21 = pd.read_csv(ENROLL_PATHS[2])
data22 = pd.read_csv(ENROLL_PATHS[3])

data16 = data18[data18["YEAR"] == 2016]
data17 = data18[data18["YEAR"] == 2017]
data18 = data18[data18["YEAR"] == 2018]
data19 = data19[data19["YEAR"] == 2019]
data21 = data21[data21["YEAR"] == 2021]
data22 = data22[data22["YEAR"] == 2022]

enrollment_dataframes = [data16, data17, data18, data19, data21, data22]

In [39]:
first_set_cols = [
'schoolcode',
"districtcode",
"countycode",
 'year',
 'lowincome',
 'white',
 'black',
 'hispanic',
 'state'
      ]

for i, dataframe in enumerate(enrollment_dataframes):
    enrollment_dataframes[i] = enrollment_dataframes[i][~enrollment_dataframes[i]['ENTITY_CD'].astype(str).str.endswith('0000')]
    enrollment_dataframes[i]['ENTITY_CD'] = enrollment_dataframes[i]['ENTITY_CD'].apply(safe_convert)
    enrollment_dataframes[i]['schoolcode'] = enrollment_dataframes[i]['ENTITY_CD']
    enrollment_dataframes[i]['black'] = enrollment_dataframes[i]['PER_BLACK']
    enrollment_dataframes[i]['white'] = enrollment_dataframes[i]['PER_WHITE']
    enrollment_dataframes[i]['hispanic'] = enrollment_dataframes[i]['PER_HISP']
    enrollment_dataframes[i]['lowincome'] = enrollment_dataframes[i]['PER_ECDIS']
    enrollment_dataframes[i]["state"] = "nyc"
    enrollment_dataframes[i]["schoolcode"] = enrollment_dataframes[i]["schoolcode"].astype(str)
    enrollment_dataframes[i]["schoolcode"] = enrollment_dataframes[i]["schoolcode"].str.zfill(9)
    enrollment_dataframes[i]["countycode"] = enrollment_dataframes[i]["schoolcode"].str[:3].astype(str)
    enrollment_dataframes[i]["districtcode"] = enrollment_dataframes[i]["schoolcode"].str[:6].astype(str)
    enrollment_dataframes[i]['districtcode'] = enrollment_dataframes[i]['districtcode'].astype(str)
    enrollment_dataframes[i]['countycode'] = enrollment_dataframes[i]['countycode'].astype(str)

    enrollment_dataframes[i]["districtcode"] = enrollment_dataframes[i]["districtcode"] + enrollment_dataframes[i]["state"]
    enrollment_dataframes[i]["countycode"] = enrollment_dataframes[i]["countycode"] + enrollment_dataframes[i]["state"]
    enrollment_dataframes[i]["year"] = years[i]
    enrollment_dataframes[i] = enrollment_dataframes[i][first_set_cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_dataframes[i]['ENTITY_CD'] = enrollment_dataframes[i]['ENTITY_CD'].apply(safe_convert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_dataframes[i]['schoolcode'] = enrollment_dataframes[i]['ENTITY_CD']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_dataframes[i]['bl

In [40]:
enrollment_df = pd.concat(enrollment_dataframes)
enrollment_df = enrollment_df.reset_index(drop=True)

In [41]:
enrollment_df

Unnamed: 0,schoolcode,districtcode,countycode,year,lowincome,white,black,hispanic,state
0,10100010014,101000nyc,101nyc,2016,31,57,21,10,nyc
1,10100010016,101000nyc,101nyc,2016,66,17,43,21,nyc
2,10100010018,101000nyc,101nyc,2016,69,12,32,37,nyc
3,10100010019,101000nyc,101nyc,2016,30,51,23,12,nyc
4,10100010020,101000nyc,101nyc,2016,78,6,68,17,nyc
...,...,...,...,...,...,...,...,...,...
28430,680601060001,680601nyc,680nyc,2022,47,90,1,5,nyc
28431,680601060002,680601nyc,680nyc,2022,56,93,0,3,nyc
28432,680601060005,680601nyc,680nyc,2022,59,91,0,5,nyc
28433,680801040001,680801nyc,680nyc,2022,60,92,1,3,nyc


# Import Dropout Rates

In [42]:
data22 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2022.csv", thousands=',')
data21 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2021.csv", thousands=',')
data19 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2019.csv", thousands=',')
data18 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2018.csv", thousands=',')
data17 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2017.csv", thousands=',')
data16 = pd.read_csv("../../data/raw_data/GRAD_RATE_AND_OUTCOMES_2016.csv", thousands=',')

dropout_dfs = [data16, data17, data18, data19, data21, data22]

# Combine Dropout Data

In [43]:
# Remove districts, only keep schools
for i, df in enumerate(dropout_dfs):
    dropout_dfs[i].columns = dropout_dfs[i].columns.str.lower()
    dropout_dfs[i] = df[df['aggregation_type'] == 'School']

#   Only keep schools which are present in all years    #
#########################################################

# Convert the 'ID' column of each DataFrame to a set
set1 = set(dropout_dfs[0]['aggregation_code'])
set2 = set(dropout_dfs[1]['aggregation_code'])
set3 = set(dropout_dfs[2]['aggregation_code'])
set4 = set(dropout_dfs[3]['aggregation_code'])
set5 = set(dropout_dfs[4]['aggregation_code'])
set6 = set(dropout_dfs[5]['aggregation_code'])

# Find the intersection of all 4 sets - i.e., the common IDs
common_ids = set1 & set2 & set3 & set4 & set5 & set6

# Filter each DataFrame to only include rows with a common ID
for i, df in enumerate(dropout_dfs):
    dropout_dfs[i] = df[df['aggregation_code'].isin(common_ids)]
    
###########################################################

common_ids = set(df['aggregation_code'])

# Initialize a list to store the updated dataframes
updated_dfs = []

# Iterate over the dropout dataframes and the years together
for year, df in zip(years, dropout_dfs):
    # Add a new column 'year' to the dataframe
    df['year'] = year
    # Append the updated dataframe to the list
    updated_dfs.append(df)

# Concatenate the updated dataframes together
dropout_df = pd.concat(updated_dfs)

# drop disttricts from the dataframe
dropout_df = dropout_df[~dropout_df['aggregation_code'].astype(str).str.endswith('0000.0')]

# Reset the index of the combined dataframe
dropout_df = dropout_df.reset_index(drop=True)

common_ids = set(dropout_df['aggregation_code'])

In [44]:
dropout_df = dropout_df[dropout_df["membership_desc"].str.contains("4 Year Outcome", na=False)]
dropout_df = dropout_df[~dropout_df["membership_desc"].str.contains("August", na=False)]
dropout_df = dropout_df[dropout_df["subgroup_name"].str.lower() == "all students"]
dropout_df["schoolcode"] = dropout_df["aggregation_code"].astype(str)
dropout_df["report_school_year"] = dropout_df["report_school_year"].astype(str)
dropout_df["year"] = "20" + dropout_df["report_school_year"].str[-2:]
dropout_df["year"] = dropout_df["year"].astype(int)
dropout_df["dropout"] =  dropout_df["dropout_pct"].str.replace('%', '', regex=False).replace('-', '0', regex=False).astype(float)
dropout_df = dropout_df[["year", "schoolcode", "dropout"]]

In [45]:
dropout_df

Unnamed: 0,year,schoolcode,dropout
34,2016,10100010034,21.0
91,2016,10100860907,3.0
138,2016,10100860960,10.0
189,2016,10201040001,6.0
243,2016,10306060008,3.0
...,...,...,...
597235,2022,670401040001,3.0
597337,2022,671201060002,4.0
597439,2022,671501040002,8.0
597548,2022,680601060001,10.0


In [46]:
dropout_df.drop_duplicates(inplace=True)

In [47]:
enrollment_dropout = dropout_df.merge(enrollment_df, on=['schoolcode', 'year'], how='inner')

In [48]:
enrollment_dropout

Unnamed: 0,year,schoolcode,dropout,districtcode,countycode,lowincome,white,black,hispanic,state
0,2016,10100010034,21.0,101000nyc,101nyc,52,20,53,14,nyc
1,2016,10100860907,3.0,101008nyc,101nyc,77,2,81,13,nyc
2,2016,10100860960,10.0,101008nyc,101nyc,92,2,85,8,nyc
3,2016,10201040001,6.0,102010nyc,102nyc,39,97,1,1,nyc
4,2016,10306060008,3.0,103060nyc,103nyc,12,87,3,3,nyc
...,...,...,...,...,...,...,...,...,...,...
7253,2022,670401040001,3.0,670401nyc,670nyc,40,96,2,1,nyc
7254,2022,671201060002,4.0,671201nyc,671nyc,51,89,0,7,nyc
7255,2022,671501040002,8.0,671501nyc,671nyc,45,92,1,3,nyc
7256,2022,680601060001,10.0,680601nyc,680nyc,47,90,1,5,nyc


# Import Total Enrollment Data

In [49]:
# Enroll Data Filepaths
TOTAL_PATHS = [
               '../../data/enrollment_2018/TOTAL2018.csv',
               '../../data/enrollment_2019/TOTAL2019.csv',
               '../../data/enrollment_2021/TOTAL2021.csv',
               '../../data/enrollment_2022/TOTAL2022.csv',
              ]

data16 = None
data17 = None
data18 = pd.read_csv(TOTAL_PATHS[0])
data19 = pd.read_csv(TOTAL_PATHS[1])
data21 = pd.read_csv(TOTAL_PATHS[2])
data22 = pd.read_csv(TOTAL_PATHS[3])

data16 = data18[data18["YEAR"] == 2016]
data17 = data18[data18["YEAR"] == 2017]
data18 = data18[data18["YEAR"] == 2018]
data19 = data19[data19["YEAR"] == 2019]
data21 = data21[data21["YEAR"] == 2021]
data22 = data22[data22["YEAR"] == 2022]

total_enrollment_dataframes = [data16, data17, data18, data19, data21, data22]

In [50]:
total_enrollment_dataframes[0]

Unnamed: 0,ENTITY_CD,ENTITY_NAME,YEAR,PK,PKHALF,PKFULL,KHALF,KFULL,1,2,3,4,5,6,7,8,9,10,11,12,UGE,UGS,K12
18,1,NYC Public Schools,2016,71430,2819,68611,0,73497,75783,74904,74991,72622,70451,66287,66798,67720,82681,80231,65541,62599,9691,11207,955003
19,2,Large Cities,2016,7316,1044,6272,0,8411,8782,8681,8590,8262,7566,7546,7416,7392,9271,7785,6343,6601,607,879,104132
20,3,High Need/Resource Urban-Suburban Districts,2016,12088,6997,5091,565,16008,16942,16885,16410,16039,15230,14629,14963,14756,16936,15696,14085,14063,978,1301,205486
21,4,High Need/Resource Rural Districts,2016,6705,3476,3229,0,11111,11177,11368,11089,11066,10751,10877,11094,11047,11970,11281,10814,10743,423,774,145585
22,5,Average Need Districts,2016,19079,14303,4776,1831,50115,53228,54473,55228,55878,55555,56883,57914,57644,60712,60467,58232,59619,2294,4076,744149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15505,680601060002,PENN YAN MIDDLE SCHOOL,2016,0,0,0,0,0,0,0,0,0,0,108,117,99,0,0,0,0,0,0,324
15506,680601060005,PENN YAN ELEMENTARY SCHOOL,2016,34,0,34,0,97,126,98,87,99,78,0,0,0,0,0,0,0,2,0,587
15507,680801040000,DUNDEE CENTRAL SCHOOL DISTRICT,2016,58,18,40,0,40,60,45,47,33,57,44,60,52,66,38,53,64,4,7,670
15508,680801040001,DUNDEE JUNIOR-SENIOR HIGH SCHOOL,2016,0,0,0,0,0,0,0,0,0,0,0,60,52,66,38,53,64,0,2,335


In [51]:
first_set_cols = [
'schoolcode',
"totalenrolled",
 'year',
      ]

for i, dataframe in enumerate(total_enrollment_dataframes):
    total_enrollment_dataframes[i] = total_enrollment_dataframes[i][~total_enrollment_dataframes[i]['ENTITY_CD'].astype(str).str.endswith('0000')]
    total_enrollment_dataframes[i]['ENTITY_CD'] = total_enrollment_dataframes[i]['ENTITY_CD'].apply(safe_convert)
    total_enrollment_dataframes[i]['schoolcode'] = total_enrollment_dataframes[i]['ENTITY_CD']
    total_enrollment_dataframes[i]['schoolcode'] = total_enrollment_dataframes[i]['schoolcode'].astype(str)
    total_enrollment_dataframes[i]['totalenrolled'] = (total_enrollment_dataframes[i]['9'] + 
                                               total_enrollment_dataframes[i]['10'] +
                                               total_enrollment_dataframes[i]['11'] +
                                               total_enrollment_dataframes[i]['12'] 
                                               )

    total_enrollment_dataframes[i]["year"] =  total_enrollment_dataframes[i]["YEAR"]
    total_enrollment_dataframes[i] = total_enrollment_dataframes[i][first_set_cols]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_enrollment_dataframes[i]['ENTITY_CD'] = total_enrollment_dataframes[i]['ENTITY_CD'].apply(safe_convert)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_enrollment_dataframes[i]['schoolcode'] = total_enrollment_dataframes[i]['ENTITY_CD']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tota

In [52]:
total_enrollment_dataframe = pd.concat(total_enrollment_dataframes)

In [53]:
total_enrollment_dataframe

Unnamed: 0,schoolcode,totalenrolled,year
18,1,291052,2016
19,2,30000,2016
20,3,60780,2016
21,4,44808,2016
22,5,239030,2016
...,...,...,...
16584,680601060001,383,2022
16585,680601060002,0,2022
16586,680601060005,0,2022
16588,680801040001,183,2022


In [54]:
enrollment_dropout_total = enrollment_dropout.merge(total_enrollment_dataframe, on=['schoolcode', 'year'], how='inner')

In [55]:
enrollment_dropout_total

Unnamed: 0,year,schoolcode,dropout,districtcode,countycode,lowincome,white,black,hispanic,state,totalenrolled
0,2016,10100010034,21.0,101000nyc,101nyc,52,20,53,14,nyc,2445
1,2016,10100860907,3.0,101008nyc,101nyc,77,2,81,13,nyc,349
2,2016,10100860960,10.0,101008nyc,101nyc,92,2,85,8,nyc,365
3,2016,10201040001,6.0,102010nyc,102nyc,39,97,1,1,nyc,261
4,2016,10306060008,3.0,103060nyc,103nyc,12,87,3,3,nyc,1619
...,...,...,...,...,...,...,...,...,...,...,...
7253,2022,670401040001,3.0,670401nyc,670nyc,40,96,2,1,nyc,255
7254,2022,671201060002,4.0,671201nyc,671nyc,51,89,0,7,nyc,234
7255,2022,671501040002,8.0,671501nyc,671nyc,45,92,1,3,nyc,258
7256,2022,680601060001,10.0,680601nyc,680nyc,47,90,1,5,nyc,383


# Import Virtual Mode Data

In [56]:
virtual = pd.read_csv("../../data/New_York_Schools_LearningModelData_Final.csv", thousands=',')
virtual['Charter'] = virtual['Charter'].replace({'Yes': 1, 'No': 0})

In [57]:
virtual = virtual[virtual['TimePeriodStart'].str.endswith(('21', '22'))]
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,NCESDistrictID,StateAssignedDistrictID,TimePeriodInterval,TimePeriodStart,TimePeriodEnd,EnrollmentTotal,LearningModel,LearningModelGrK5,LearningModelGr68,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
12,New York,NY,School,0,James A Green High School,Regular school,360000104498,211003040002,Dolgeville Central School District,Regular local school district,3600001,211003040000,Weekly,1/3/21,1/9/21,378,Virtual,,,,Remote Only,,,,79.0,0.0,299.0,81.0,45.0
13,New York,NY,School,0,James A Green High School,Regular school,360000104498,211003040002,Dolgeville Central School District,Regular local school district,3600001,211003040000,Weekly,1/10/21,1/16/21,378,Hybrid,,,,Hybrid,,,,0.0,237.0,141.0,81.0,81.0
14,New York,NY,School,0,James A Green High School,Regular school,360000104498,211003040002,Dolgeville Central School District,Regular local school district,3600001,211003040000,Weekly,1/17/21,1/23/21,378,Hybrid,,,,Hybrid,,,,0.0,226.0,152.0,81.0,76.0
15,New York,NY,School,0,James A Green High School,Regular school,360000104498,211003040002,Dolgeville Central School District,Regular local school district,3600001,211003040000,Weekly,1/24/21,1/30/21,378,Hybrid,,,,Hybrid,,,,0.0,229.0,149.0,81.0,81.0
16,New York,NY,School,0,James A Green High School,Regular school,360000104498,211003040002,Dolgeville Central School District,Regular local school district,3600001,211003040000,Weekly,1/31/21,2/6/21,378,Hybrid,,,,Hybrid,,,,0.0,230.0,148.0,81.0,80.0


In [58]:
# Convert the date columns to datetime format.
virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])

# Create a new column for year
virtual['YEAR'] = virtual['TimePeriodStart'].dt.year

# Fill in any missing values in LearningModel with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

# Replace 'In-person' with 'InPerson'
virtual['LearningModel'] = virtual['LearningModel'].replace('In-person', 'InPerson')

# Calculate the number of days for each row
virtual['Days'] = (virtual['TimePeriodEnd'] - virtual['TimePeriodStart']).dt.days

# Group by School, Year, LearningModel, and Charter and sum the number of days
grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel', 'Charter'])['Days'].sum().reset_index()

# Pivot the data so we have separate columns for each learning model
pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR', 'Charter'], columns='LearningModel', values='Days', fill_value=0)

# # Group by School, Year, and LearningModel and sum the number of days
# grouped = virtual.groupby(['StateAssignedSchoolID', 'YEAR', 'LearningModel'])['Days'].sum().reset_index()

# # Pivot the data so we have separate columns for each learning model
# pivot = grouped.pivot_table(index=['StateAssignedSchoolID', 'YEAR'], columns='LearningModel', values='Days', fill_value=0)

# Reset the index
pivot.reset_index(inplace=True)

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + pivot['Hybrid'] + pivot['InPerson']

# Calculate the percentage of days that are virtual and hybrid for each year
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']
pivot['HybridPercent'] = pivot['Hybrid'] / pivot['TotalDays']

# Calculate the score for each year
pivot['Score'] = (pivot['Virtual'] + 0.5 * pivot['Hybrid']) / pivot['TotalDays']

# Reset the column names after pivot
pivot.columns.name = None

  virtual['TimePeriodStart'] = pd.to_datetime(virtual['TimePeriodStart'])
  virtual['TimePeriodEnd'] = pd.to_datetime(virtual['TimePeriodEnd'])


In [59]:
pivot = pivot.drop(columns=['InPerson', 'Hybrid', 'Virtual', 'TotalDays'])
pivot
pivot["schoolcode"] = pivot["StateAssignedSchoolID"].astype(str)
pivot["year"] = pivot["YEAR"]
pivot["virtualper"] = pivot["VirtualPercent"]
pivot["hybridper"] = pivot["HybridPercent"]
pivot["schoolmode"] = pivot["Score"]
pivot["charter"] = pivot["Charter"]
pivot = pivot[["schoolcode", "schoolmode", "virtualper", "schoolmode", "hybridper", "charter"]]

pivot

Unnamed: 0,schoolcode,schoolmode,virtualper,schoolmode.1,hybridper,charter
0,10100010014,0.48,0.04,0.48,0.88,0
1,10100010016,0.48,0.04,0.48,0.88,0
2,10100010018,0.48,0.04,0.48,0.88,0
3,10100010019,0.48,0.04,0.48,0.88,0
4,10100010023,0.48,0.04,0.48,0.88,0
...,...,...,...,...,...,...
4398,680601060001,0.08,0.08,0.08,0.00,0
4399,680601060002,0.08,0.08,0.08,0.00,0
4400,680601060005,0.08,0.08,0.08,0.00,0
4401,680801040001,0.08,0.08,0.08,0.00,0


In [60]:
export_dataframe = enrollment_dropout_total.merge(pivot, on = "schoolcode")
export_dataframe.loc[export_dataframe["year"] != 2021, ["hybridper", "virtualper", "schoolmode"]] = 0
export_dataframe["schoolcode"] = export_dataframe["schoolcode"] + "nyc"

# Identify the unique years in the dataset.
unique_years = set(export_dataframe['year'])

# Group the data by ENTITY_CD and get the unique years for each group.
entity_groups = export_dataframe.groupby('schoolcode')['year'].unique().reset_index()

# Find the ENTITY_CD values which have all the unique years.
valid_entity_cd = entity_groups[entity_groups['year'].apply(lambda x: set(x) == unique_years)]['schoolcode']

# Filter the main data for these ENTITY_CD values.
export_dataframe = export_dataframe[export_dataframe['schoolcode'].isin(valid_entity_cd)]
export_dataframe.shape

(6942, 16)

In [61]:
export_dataframe

Unnamed: 0,year,schoolcode,dropout,districtcode,countycode,lowincome,white,black,hispanic,state,totalenrolled,schoolmode,virtualper,schoolmode.1,hybridper,charter
0,2016,10100010034nyc,21.0,101000nyc,101nyc,52,20,53,14,nyc,2445,0.00,0.00,0.00,0.00,0
1,2017,10100010034nyc,17.0,101000nyc,101nyc,55,20,53,15,nyc,2451,0.00,0.00,0.00,0.00,0
2,2018,10100010034nyc,18.0,101000nyc,101nyc,64,22,51,15,nyc,2548,0.00,0.00,0.00,0.00,0
3,2019,10100010034nyc,17.0,101000nyc,101nyc,66,22,50,15,nyc,2470,0.00,0.00,0.00,0.00,0
4,2021,10100010034nyc,6.0,101000nyc,101nyc,66,20,51,17,nyc,2524,0.48,0.04,0.48,0.88,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6960,2017,680801040001nyc,16.0,680801nyc,680nyc,61,95,1,2,nyc,200,0.00,0.00,0.00,0.00,0
6961,2018,680801040001nyc,4.0,680801nyc,680nyc,63,97,1,2,nyc,203,0.00,0.00,0.00,0.00,0
6962,2019,680801040001nyc,7.0,680801nyc,680nyc,60,94,0,3,nyc,198,0.00,0.00,0.00,0.00,0
6963,2021,680801040001nyc,9.0,680801nyc,680nyc,60,94,0,2,nyc,179,0.08,0.08,0.08,0.00,0


In [62]:
export_dataframe = export_dataframe[export_dataframe["year"] != 2022]
export_dataframe.describe()

Unnamed: 0,year,dropout,lowincome,white,black,hispanic,totalenrolled,schoolmode,virtualper,schoolmode.1,hybridper,charter
count,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0,5785.0
mean,2018.2,5.91962,56.544857,49.706655,17.717373,24.077615,654.611927,0.096266,0.035029,0.096266,0.122475,0.0
std,1.720614,6.632254,24.986644,39.097391,22.418443,25.053139,660.621368,0.210723,0.108369,0.210723,0.271245,0.0
min,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2017.0,1.0,38.0,5.0,1.0,3.0,270.0,0.0,0.0,0.0,0.0,0.0
50%,2018.0,4.0,56.0,59.0,7.0,13.0,423.0,0.0,0.0,0.0,0.0,0.0
75%,2019.0,8.0,80.0,90.0,27.0,43.0,830.0,0.0,0.0,0.0,0.0,0.0
max,2021.0,66.0,100.0,100.0,93.0,100.0,5943.0,0.88,0.84,0.88,1.0,0.0


In [65]:
export_dataframe

Unnamed: 0,year,schoolcode,dropout,districtcode,countycode,lowincome,white,black,hispanic,state,totalenrolled,schoolmode,virtualper,schoolmode.1,hybridper,charter
0,2016,10100010034nyc,21.0,101000nyc,101nyc,52,20,53,14,nyc,2445,0.00,0.00,0.00,0.00,0
1,2017,10100010034nyc,17.0,101000nyc,101nyc,55,20,53,15,nyc,2451,0.00,0.00,0.00,0.00,0
2,2018,10100010034nyc,18.0,101000nyc,101nyc,64,22,51,15,nyc,2548,0.00,0.00,0.00,0.00,0
3,2019,10100010034nyc,17.0,101000nyc,101nyc,66,22,50,15,nyc,2470,0.00,0.00,0.00,0.00,0
4,2021,10100010034nyc,6.0,101000nyc,101nyc,66,20,51,17,nyc,2524,0.48,0.04,0.48,0.88,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6959,2016,680801040001nyc,13.0,680801nyc,680nyc,56,95,1,2,nyc,221,0.00,0.00,0.00,0.00,0
6960,2017,680801040001nyc,16.0,680801nyc,680nyc,61,95,1,2,nyc,200,0.00,0.00,0.00,0.00,0
6961,2018,680801040001nyc,4.0,680801nyc,680nyc,63,97,1,2,nyc,203,0.00,0.00,0.00,0.00,0
6962,2019,680801040001nyc,7.0,680801nyc,680nyc,60,94,0,3,nyc,198,0.00,0.00,0.00,0.00,0


In [64]:
export_dataframe.to_csv("../final_data_component/final_data_nyc_dropout.csv")