In [43]:
import os

import pandas as pd

In [44]:
def dat_to_csv_converter(dat_file_path):
    """Convert a comma-separated ``.dat`` file into a ``.csv`` file next to it.

    The input is parsed with ``pd.read_csv`` using a comma delimiter and is
    written back out without the row index. The output path is the input path
    with its file extension replaced by ``.csv``.

    Parameters
    ----------
    dat_file_path : str
        Path of the comma-separated input file.

    Returns
    -------
    str or None
        Path of the CSV file that was written, or ``None`` when reading or
        writing failed (the exception is printed rather than raised).
    """
    try:
        # The .dat file is assumed to be comma separated.
        data = pd.read_csv(dat_file_path, delimiter=',')
        # Replace the extension with splitext instead of slicing off a fixed
        # four characters, which mangles any path whose extension is not
        # exactly three letters long (or that has no extension at all).
        csv_file_path = os.path.splitext(dat_file_path)[0] + ".csv"
        # index=False omits the row indices from the CSV.
        data.to_csv(csv_file_path, index=False)
        return csv_file_path
    except Exception as e:
        # Best-effort conversion: report the problem and signal failure with
        # an explicit None so the caller can detect it.
        print(e)
        return None

# Directory the yearly "texas_math_<year>.dat" files are read from
# (relative to this notebook).
base_file_path = '../../data/'

# Collects one processed DataFrame per year before concatenation.
final_dataframes = list()

In [45]:
# Columns kept from the concatenated per-year assessment frames.
main_dataframe_cols = [
    'schoolcode', 'year', 'mathpass', 'totaltested',
    'lowincome', 'white', 'black', 'hispanic', 'asian',
]

# Column order applied to the merged frame before the code columns are added.
final_cols = [
    'schoolcode', 'year', 'charter', 'mathpass',
    'schoolmode', 'virtualper', 'hybridper',
    'totaltested', 'lowincome',
    'white', 'black', 'hispanic', 'asian',
]

def final_data_generator(csv_file_path, year):
    """Load one year's CSV and derive the standardised percentage columns.

    Column names are lower-cased, then these columns are added (or
    overwritten), in this order:

    * ``year``        - the ``year`` argument
    * ``schoolcode``  - copy of ``campus``
    * ``totaltested`` - copy of ``a1_all_d``
    * ``asian``, ``black``, ``white``, ``hispanic`` - the matching
      ``a1_eth?_d`` count as a percentage of ``totaltested``, 4 decimals
    * ``lowincome``   - ``a1_eco2_d + a1_eco1_d`` as a percentage of
      ``totaltested``, 2 decimals
    * ``mathpass``    - ``a1_all_meetsgl_nm`` when that column exists,
      otherwise ``a1_all_satis_rec_nm``, as a percentage of ``totaltested``,
      2 decimals

    Parameters
    ----------
    csv_file_path : str
        Path of the CSV to read.
    year : int
        Value stored in the ``year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with all original (lower-cased) columns plus the
        derived ones.

    Raises
    ------
    KeyError
        If one of the source columns listed above is missing.
    """
    frame = pd.read_csv(csv_file_path)
    frame.columns = frame.columns.str.lower()

    frame['year'] = year
    frame['schoolcode'] = frame['campus']
    frame['totaltested'] = frame['a1_all_d']
    # No 'district' handling here: the previous self-assignment was a no-op
    # that only made inputs without that unused column fail.

    # The pass count is stored under one of two column names.
    if 'a1_all_meetsgl_nm' in frame.columns:
        pass_column = 'a1_all_meetsgl_nm'
    else:
        pass_column = 'a1_all_satis_rec_nm'

    # Raw counts for each derived column, in output column order.
    counts = {
        'asian': frame['a1_etha_d'],
        'black': frame['a1_ethb_d'],
        'white': frame['a1_ethw_d'],
        'hispanic': frame['a1_ethh_d'],
        'lowincome': frame['a1_eco2_d'] + frame['a1_eco1_d'],
        'mathpass': frame[pass_column],
    }
    # Columns rounded to 2 decimals; every other column uses 4.
    two_decimal_columns = {'mathpass', 'lowincome'}

    for column, count in counts.items():
        decimals = 2 if column in two_decimal_columns else 4
        # Divide first, then scale, so results match the earlier
        # step-by-step computation exactly.
        frame[column] = (count / frame['totaltested'] * 100).round(decimals)

    return frame

In [46]:
# generate csv data from dat files
# Start from an empty list every time this cell runs: appending to the list
# created in an earlier cell would add every year a second time on a re-run.
final_dataframes = []
for year in range(2015, 2022):
    if year == 2020:
        # No file is loaded for 2020.
        continue
    current_file_path = base_file_path + "texas_math_" + str(year) + ".dat"
    final_csv_path = dat_to_csv_converter(current_file_path)
    final_dataframes.append(final_data_generator(final_csv_path, year))

# Stack the yearly frames and keep only the standardised columns.
final_dataframe = pd.concat(final_dataframes)
final_dataframe = final_dataframe[main_dataframe_cols]

final_dataframe.shape

(20624, 9)

In [47]:
# Every distinct year present in the combined frame; a school must cover
# all of them to be kept.
unique_years = pd.unique(final_dataframe['year'])

def filter_complete_data(group):
    """Tell ``groupby(...).filter`` whether to keep one school's rows.

    A group is kept only when its set of years equals the module-level
    ``unique_years`` and none of its cells is NaN.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single group; must contain a ``year`` column.

    Returns
    -------
    bool
        ``True`` to keep the group, ``False`` to drop it.
    """
    covers_every_year = set(group['year'].unique()) == set(unique_years)
    if covers_every_year:
        # Year coverage is fine; the group survives only without any NaN.
        has_missing_value = group.isna().any().any()
        return not has_missing_value
    return False

# Keep only the schools whose rows pass the completeness check.
rows_by_school = final_dataframe.groupby('schoolcode')
final_dataframe = rows_by_school.filter(filter_complete_data)

final_dataframe.shape

(16644, 9)

In [48]:
# Sanity check: count the NaN values left in each column after filtering.
final_dataframe.isna().sum()

schoolcode     0
year           0
mathpass       0
totaltested    0
lowincome      0
white          0
black          0
hispanic       0
asian          0
dtype: int64

In [49]:
# Rows and columns remaining after the completeness filter.
final_dataframe.shape

(16644, 9)

# Import Virtual Data

In [50]:
# Learning-model records; thousands=',' parses numbers written like "1,234".
virtual_csv_path = "../../data/raw_data/Texas_Schools_LearningModelData_Final.csv"
virtual = pd.read_csv(virtual_csv_path, thousands=',')

# Encode the Yes/No charter flag as 1/0; any other value is left unchanged.
charter_codes = {'Yes': 1, 'No': 0}
virtual['Charter'] = virtual['Charter'].replace(charter_codes)

In [51]:
# Keep the periods whose start-date string ends in "21" or "22".
# NOTE(review): this presumes the raw date string ends with a two-digit year
# - confirm against the file's date format.
starts_in_21_or_22 = virtual['TimePeriodStart'].str.endswith(('21', '22'))
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a slice.
virtual = virtual[starts_in_21_or_22].copy()
virtual.head()

Unnamed: 0,StateName,StateAbbrev,DataLevel,Charter,SchoolName,SchoolType,NCESSchoolID,StateAssignedSchoolID,DistrictName,DistrictType,...,LearningModelGr912,LearningModelStateCat,LearningModelStateCatGrK5,LearningModelStateCatGr68,LearningModelStateCatGr912,EnrollmentInPerson,EnrollmentHybrid,EnrollmentVirtual,StaffCount,StaffCountInPerson
1,Texas,TX,School,0,Crosbyton Elementary,Regular school,480000101145,54901101,Crosbyton CISD,Regular local school district,...,,,,,,141.0,0.0,2.0,,
3,Texas,TX,School,0,Crosbyton Secondary,Regular school,480000101146,54901001,Crosbyton CISD,Regular local school district,...,,,,,,165.0,0.0,4.0,,
5,Texas,TX,School,0,Sp Ed Co-Op,Other/alternative school,480000103621,54901200,Crosbyton CISD,Regular local school district,...,,,,,,5.0,0.0,0.0,,
7,Texas,TX,School,0,Spur School,Regular school,480000204732,63903001,Spur ISD,Regular local school district,...,,,,,,219.0,0.0,13.0,,
9,Texas,TX,School,0,Rocksprings K-12,Regular school,480000304219,69901001,Rocksprings ISD,Regular local school district,...,,,,,,255.0,0.0,7.0,,


In [52]:
# Parse both period boundaries into datetime values.
for date_column in ('TimePeriodStart', 'TimePeriodEnd'):
    virtual[date_column] = pd.to_datetime(virtual[date_column])

# The year of a period is the calendar year in which it starts.
virtual['year'] = virtual['TimePeriodStart'].dt.year

# Rows without a learning model are treated as in-person.
# NOTE(review): a later cell drops both an 'InPerson' and an 'In-person'
# column, so the data appears to use the spelling 'In-person' for rows that
# already have a label - confirm that the two spellings are meant to coexist.
virtual['LearningModel'] = virtual['LearningModel'].fillna('InPerson')

In [53]:
# Length of every reporting period in whole days.
period_length = virtual['TimePeriodEnd'] - virtual['TimePeriodStart']
virtual['Days'] = period_length.dt.days

# Total days per school, year, learning model and charter flag.
day_keys = ['StateAssignedSchoolID', 'year', 'LearningModel', 'Charter']
grouped = virtual.groupby(day_keys)['Days'].sum().reset_index()

# One column per learning model; combinations without any rows become 0.
pivot = grouped.pivot_table(
    index=['StateAssignedSchoolID', 'year', 'Charter'],
    columns='LearningModel',
    values='Days',
    fill_value=0,
)

In [54]:
# Turn the school / year / charter index levels back into ordinary columns.
pivot = pivot.reset_index()

# In-person days are spread over two columns: 'InPerson' (the label filled in
# for rows with no learning model) and 'In-person' (a label that also becomes
# a pivot column). Both must count towards the total, otherwise a school with
# explicit 'In-person' days gets a virtual share of 1.0 (or 0/0 = NaN).
# reindex supplies a zero column if one of the two labels is absent.
in_person_days = pivot.reindex(columns=['InPerson', 'In-person'], fill_value=0).sum(axis=1)

# Calculate the total days in each year
pivot['TotalDays'] = pivot['Virtual'] + in_person_days

# Share of days spent in virtual learning (NaN when TotalDays is 0).
pivot['VirtualPercent'] = pivot['Virtual'] / pivot['TotalDays']

# The score for each year is the same virtual share.
pivot['Score'] = pivot['Virtual'] / pivot['TotalDays']

# Remove the 'LearningModel' label left on the column axis by the pivot.
pivot.columns.name = None

In [55]:
# Drop the per-model day counts now that the virtual share has been computed.
pivot = pivot.drop(columns=['InPerson', "In-person", 'Virtual', 'TotalDays'])

# Store the filled frame: a bare pivot.fillna(0) only displays a filled copy
# and leaves the NaN shares (schools with zero total days) in `pivot`, so the
# table shown here would not match the data used later.
pivot = pivot.fillna(0)

pivot

Unnamed: 0,StateAssignedSchoolID,year,Charter,VirtualPercent,Score
0,1902001,2021,0,0.0,0.0
1,1902041,2021,0,0.0,0.0
2,1902103,2021,0,0.0,0.0
3,1903001,2021,0,0.0,0.0
4,1903041,2021,0,0.0,0.0
...,...,...,...,...,...
8459,254901101,2021,0,0.0,0.0
8460,254901104,2021,0,0.0,0.0
8461,254901106,2021,0,0.0,0.0
8462,254902001,2021,0,1.0,1.0


# Begin Merge Process

In [56]:
# School codes that appear in both the assessment data and the
# learning-model data.
assessment_codes = set(final_dataframe['schoolcode'].unique())
learning_model_codes = set(virtual['StateAssignedSchoolID'].unique())
common_schoolcodes = assessment_codes & learning_model_codes

# Restrict both frames to the shared school codes.
in_both_assessment = final_dataframe['schoolcode'].isin(common_schoolcodes)
final_dataframe = final_dataframe[in_both_assessment]
in_both_virtual = virtual['StateAssignedSchoolID'].isin(common_schoolcodes)
virtual = virtual[in_both_virtual]



In [57]:
# Rows and columns left after restricting to the shared school codes.
final_dataframe.shape

(16446, 9)

In [58]:
# Attach the learning-model shares to the assessment rows by school and year.
# A left join keeps every assessment row; rows without a match get NaN.
export_dataframe = final_dataframe.merge(
    pivot,
    how='left',
    left_on=['schoolcode', 'year'],
    right_on=['StateAssignedSchoolID', 'year'],
)


In [59]:
# Order rows by school, then year, so each school's years are contiguous.
export_dataframe = export_dataframe.sort_values(by=['schoolcode', 'year'])

export_dataframe.head(10)


Unnamed: 0,schoolcode,year,mathpass,totaltested,lowincome,white,black,hispanic,asian,StateAssignedSchoolID,Charter,VirtualPercent,Score
0,1902001,2015,57.78,45,28.89,84.4444,2.2222,2.2222,4.4444,,,,
2741,1902001,2016,75.0,40,25.0,87.5,0.0,0.0,0.0,,,,
5482,1902001,2017,58.97,39,35.9,82.0513,5.1282,7.6923,0.0,,,,
8223,1902001,2018,69.77,43,32.56,86.0465,2.3256,6.9767,0.0,,,,
10964,1902001,2019,68.09,47,59.57,78.7234,2.1277,8.5106,0.0,,,,
13705,1902001,2021,65.0,40,45.0,82.5,2.5,12.5,2.5,1902001.0,0.0,,
1,1903001,2015,33.8,71,47.89,90.1408,4.2254,2.8169,0.0,,,,
2742,1903001,2016,26.03,73,54.79,87.6712,2.7397,8.2192,1.3699,,,,
5483,1903001,2017,38.37,86,58.14,74.4186,9.3023,9.3023,0.0,,,,
8224,1903001,2018,61.22,98,53.06,82.6531,3.0612,9.1837,0.0,,,,


In [60]:
# Charter and the state school ID are only present on rows that matched a
# learning-model record, so spread them to the other years of the same school.
# Both fills are grouped by school: chaining a plain .bfill() after the
# grouped .ffill() runs on the whole column and copies values from the next
# school into schools that have no matched row of their own.
for carried_column in ['Charter', 'StateAssignedSchoolID']:
    export_dataframe[carried_column] = export_dataframe.groupby('schoolcode')[carried_column].ffill()
    export_dataframe[carried_column] = export_dataframe.groupby('schoolcode')[carried_column].bfill()

# Rename to the standard output column names.
export_dataframe["virtualper"] = export_dataframe["VirtualPercent"]
export_dataframe["schoolmode"] = export_dataframe["Score"]
export_dataframe["charter"] = export_dataframe["Charter"]
# No hybrid share is computed in this notebook; the column is a constant 0.
export_dataframe["hybridper"] = 0

# Anything still missing (unmatched years and schools) becomes 0.
export_dataframe = export_dataframe.fillna(0)
export_dataframe = export_dataframe.reset_index(drop=True)
export_dataframe = export_dataframe[final_cols]

export_dataframe["state"] = "texas"

# Zero-pad the school code to 9 characters; county and district codes are
# its first 3 and first 6 characters, each suffixed with "tx".
export_dataframe["schoolcode"] = export_dataframe["schoolcode"].astype(str).str.zfill(9)
export_dataframe["countycode"] = export_dataframe["schoolcode"].str[:3] + "tx"
export_dataframe["districtcode"] = export_dataframe["schoolcode"].str[:6] + "tx"

# Column list used by the next cell.
# NOTE(review): unlike the earlier final_cols, this list omits 'asian' (and
# the new 'state' column) - confirm that is intentional.
final_cols = [
    'schoolcode',
    "districtcode",
    "countycode",
    'year',
    'charter',
    'mathpass',
    'schoolmode',
    'virtualper',
    'hybridper',
    'totaltested',
    'lowincome',
    'white',
    'black',
    'hispanic',
]

export_dataframe.head(10)

Unnamed: 0,schoolcode,year,charter,mathpass,schoolmode,virtualper,hybridper,totaltested,lowincome,white,black,hispanic,asian,state,countycode,districtcode
0,1902001,2015,0.0,57.78,0.0,0.0,0,45,28.89,84.4444,2.2222,2.2222,4.4444,texas,001tx,001902tx
1,1902001,2016,0.0,75.0,0.0,0.0,0,40,25.0,87.5,0.0,0.0,0.0,texas,001tx,001902tx
2,1902001,2017,0.0,58.97,0.0,0.0,0,39,35.9,82.0513,5.1282,7.6923,0.0,texas,001tx,001902tx
3,1902001,2018,0.0,69.77,0.0,0.0,0,43,32.56,86.0465,2.3256,6.9767,0.0,texas,001tx,001902tx
4,1902001,2019,0.0,68.09,0.0,0.0,0,47,59.57,78.7234,2.1277,8.5106,0.0,texas,001tx,001902tx
5,1902001,2021,0.0,65.0,0.0,0.0,0,40,45.0,82.5,2.5,12.5,2.5,texas,001tx,001902tx
6,1903001,2015,0.0,33.8,0.0,0.0,0,71,47.89,90.1408,4.2254,2.8169,0.0,texas,001tx,001903tx
7,1903001,2016,0.0,26.03,0.0,0.0,0,73,54.79,87.6712,2.7397,8.2192,1.3699,texas,001tx,001903tx
8,1903001,2017,0.0,38.37,0.0,0.0,0,86,58.14,74.4186,9.3023,9.3023,0.0,texas,001tx,001903tx
9,1903001,2018,0.0,61.22,0.0,0.0,0,98,53.06,82.6531,3.0612,9.1837,0.0,texas,001tx,001903tx


In [61]:
# Preview the export restricted to the final_cols list redefined in the previous cell.
export_dataframe[final_cols]

Unnamed: 0,schoolcode,districtcode,countycode,year,charter,mathpass,schoolmode,virtualper,hybridper,totaltested,lowincome,white,black,hispanic
0,001902001,001902tx,001tx,2015,0.0,57.78,0.0,0.0,0,45,28.89,84.4444,2.2222,2.2222
1,001902001,001902tx,001tx,2016,0.0,75.00,0.0,0.0,0,40,25.00,87.5000,0.0000,0.0000
2,001902001,001902tx,001tx,2017,0.0,58.97,0.0,0.0,0,39,35.90,82.0513,5.1282,7.6923
3,001902001,001902tx,001tx,2018,0.0,69.77,0.0,0.0,0,43,32.56,86.0465,2.3256,6.9767
4,001902001,001902tx,001tx,2019,0.0,68.09,0.0,0.0,0,47,59.57,78.7234,2.1277,8.5106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16441,254902001,254902tx,254tx,2016,0.0,35.14,0.0,0.0,0,37,40.54,2.7027,0.0000,97.2973
16442,254902001,254902tx,254tx,2017,0.0,29.09,0.0,0.0,0,55,36.36,5.4545,0.0000,94.5455
16443,254902001,254902tx,254tx,2018,0.0,42.11,0.0,0.0,0,38,65.79,2.6316,0.0000,97.3684
16444,254902001,254902tx,254tx,2019,0.0,44.12,0.0,0.0,0,34,52.94,0.0000,0.0000,100.0000


In [62]:
# Show each column's dtype before the frame is written out.
print(export_dataframe.dtypes)

schoolcode       object
year              int64
charter         float64
mathpass        float64
schoolmode      float64
virtualper      float64
hybridper         int64
totaltested       int64
lowincome       float64
white           float64
black           float64
hispanic        float64
asian           float64
state            object
countycode       object
districtcode     object
dtype: object


In [63]:
# Write the full frame (every column, not just final_cols) to disk.
# NOTE(review): the row index is written as an unnamed first column because
# index=False is not passed - confirm downstream readers expect that.
export_csv_path = "../final_data_component/final_data_tx_math.csv"
export_dataframe.to_csv(export_csv_path)