# Data Cleaning: Institutional Characteristics

In [1]:
# Relevant imports
import pandas as pd
import missingno as msno
import numpy as np

# Lift pandas' display limits so cell output shows every row and every column
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

## Section 1: Directory Information

Dataset cleaned: "Institutional Characteristics -- Directory information" from the Integrated Postsecondary Education Data System (IPEDS), published by the National Center for Education Statistics. A data dictionary, obtained from the same source, was used to understand the names and definitions of the features.

### Load datasets & quick exploration

In [2]:
# Load the yearly "Directory information" extracts, one dataframe per publication year.
# The 2018 and 2019 extracts are currently not loaded:
#dir_2018 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2018.csv", encoding='latin1')
#dir_2019 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2019.csv", encoding='latin1')
dir_2020 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2020.csv", encoding='latin1')
dir_2021 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2021.csv", encoding='latin1')
dir_2022 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2022.csv", encoding='latin1')

# The 2023 and 2024 files begin with a UTF-8 byte-order mark. Decoded as latin1 the mark
# turns into "ï»¿" glued onto the first header ("ï»¿UNITID"), and any non-ASCII text in
# those files would be mis-decoded as well. 'utf-8-sig' decodes UTF-8 and drops the mark.
dir_2023 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2023.csv", encoding='utf-8-sig')
dir_2024 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2024.csv", encoding='utf-8-sig')


In [3]:
# Report (rows, columns) for each year's directory dataframe
for yr_label, frame in (
    (2020, dir_2020),
    (2021, dir_2021),
    (2022, dir_2022),
    (2023, dir_2023),
    (2024, dir_2024),
):
    print(f"Shape of {yr_label} Dataframe:{frame.shape}")

Shape of 2020 Dataframe:(6440, 73)
Shape of 2021 Dataframe:(6289, 74)
Shape of 2022 Dataframe:(6256, 73)
Shape of 2023 Dataframe:(6163, 73)
Shape of 2024 Dataframe:(6072, 72)


In [4]:
# Cells that are empty or contain only whitespace become NaN in every directory dataframe
BLANK_CELL_PATTERN = r'^\s*$'

dir_2020, dir_2021, dir_2022, dir_2023, dir_2024 = (
    frame.replace(BLANK_CELL_PATTERN, np.nan, regex=True)
    for frame in (dir_2020, dir_2021, dir_2022, dir_2023, dir_2024)
)

In [5]:
# The 2023 and 2024 dataframes carry a garbled "UNITID" header; restore the plain name
unitid_header_fix = {"ï»¿UNITID": "UNITID"}
dir_2023 = dir_2023.rename(columns=unitid_header_fix)
dir_2024 = dir_2024.rename(columns=unitid_header_fix)

# Lookup from publication year to that year's dataframe
dirs = dict(zip(
    range(2020, 2025),
    (dir_2020, dir_2021, dir_2022, dir_2023, dir_2024),
))

In [6]:
# Sorted union of every column name that appears in at least one of the yearly dataframes
all_cols = sorted({col for frame in dirs.values() for col in frame.columns})

# For each year, one True/False flag per entry of all_cols: does that year's dataframe have the column?
# Every list has len(all_cols) entries, so all_cols can serve as the index below.
yr_cols_dict = {
    yr: [col in frame.columns for col in all_cols]
    for yr, frame in dirs.items()
}

# Presence matrix: one row per column name, one column per publication year
yr_cols_df = pd.DataFrame(yr_cols_dict, index=all_cols)

# See all rows
print(yr_cols_df)

               2020   2021   2022   2023   2024
ACT            True   True   True   True   True
ADDR           True   True   True   True   True
ADMINURL       True   True   True   True   True
APPLURL        True   True   True   True   True
ATHURL         True   True   True   True   True
C00CARNEGIE   False  False  False  False   True
C15BASIC       True   True   True   True  False
C18BASIC       True   True   True   True  False
C18ENPRF       True  False  False  False  False
C18IPGRD       True  False  False  False  False
C18IPUG        True  False  False  False  False
C18SZSET       True  False  False  False  False
C18UGPRF       True  False  False  False  False
C21BASIC      False   True   True   True   True
C21ENPRF      False   True   True   True  False
C21IPGRD      False   True   True   True  False
C21IPUG       False   True   True   True  False
C21SZSET      False   True   True   True  False
C21UGPRF      False   True   True   True  False
CARNEGIE       True   True   True   True

### Create dataset with only programs resembling vocational programs


From the data dictionary, we get this information:

Highest level of offering (generated, based on response to IC
survey)<br>
0 - Other<br>
1 - Postsecondary award, certificate or diploma of less than one academic year<br>
2 - Postsecondary award, certificate or diploma of at least one but less than two academic years<br>
3 - Associate's degree<br>
4 - Postsecondary award, certificate or diploma of at least two but less than four academic years<br>
5 - Bachelor's degree<br>
6 - Postbaccalaureate certificate<br>
7 - Master's degree<br>
8 - Post-master's certificate<br>
9 - Doctor's degree<br>
b - None of the above or no answer<br>
-2 - Not applicable, first-professional only<br>
-3 - Not Available

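For this use case, we are interested in codes 1, 2, and 4: institutions whose highest offering is a postsecondary award, certificate, or diploma rather than a degree.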

In [33]:
# List of qualifying HLOFFER values (highest level of offering is a certificate/diploma: codes 1, 2, 4)
list_of_offerings = [1, 2, 4]

# Output name -> unfiltered directory dataframe for that year
voc_programs = {"voc_dir_2020": dir_2020,
                "voc_dir_2021": dir_2021,
                "voc_dir_2022": dir_2022,
                "voc_dir_2023": dir_2023,
                "voc_dir_2024": dir_2024}

# Output name -> rows of that year's dataframe whose HLOFFER is in list_of_offerings
voc_dirs_df = {}

for name, val in voc_programs.items():
    # The year is the suffix of the output name, so each message reports its own dataset
    yr = name.rsplit("_", 1)[-1]

    # Evaluated outside the f-string: reusing double quotes inside an f-string is a
    # SyntaxError before Python 3.12
    hloffer_values = val["HLOFFER"].unique()
    print(f"Unique values for highest level of offering in the {yr} dataset: {hloffer_values}")

    voc_dirs_df[name] = val[val["HLOFFER"].isin(list_of_offerings)]
    print(voc_dirs_df[name].shape)

# Bind the filtered dataframes to explicit names (instead of writing into globals())
# so the variables later cells rely on are visible in the code
voc_dir_2020 = voc_dirs_df["voc_dir_2020"]
voc_dir_2021 = voc_dirs_df["voc_dir_2021"]
voc_dir_2022 = voc_dirs_df["voc_dir_2022"]
voc_dir_2023 = voc_dirs_df["voc_dir_2023"]
voc_dir_2024 = voc_dirs_df["voc_dir_2024"]

Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(2380, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(2320, 74)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(2277, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  4  2  8  5  1  6 -3]
(2219, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  2  8  5  1  4  6 -3]
(2178, 72)


#### Combine the datasets into a single vocational schools dataset

In [None]:
# Publication year -> filtered vocational dataframe for that year
voc_dirs = {
    2020: voc_dir_2020,
    2021: voc_dir_2021,
    2022: voc_dir_2022,
    2023: voc_dir_2023,
    2024: voc_dir_2024,
}

# One tagged copy per year: .assign returns a new dataframe with the extra
# publication_yr column, so the per-year dataframes themselves are not modified
voc_dirs_dfs = [
    yearly_frame.assign(publication_yr=yr)
    for yr, yearly_frame in voc_dirs.items()
]

# Stack the yearly copies; columns missing from a given year are filled with NaN
vocational_dir = pd.concat(voc_dirs_dfs, ignore_index=True, sort=False)

# Show the shape (rows, columns) of the combined dataset
vocational_dir.shape

(11374, 89)

In [35]:
# Dedup logic: a UNITID may appear at most once per publication year, and therefore at most
# as many times as there are publication years in the combined dataset (5 for 2020-2024)
max_rows_per_unitid = vocational_dir["publication_yr"].nunique()

# A single grouped count replaces one full-column scan per UNITID.
# sort=False keeps the groups in order of first appearance, matching .unique().
rows_per_unitid = vocational_dir.groupby("UNITID", sort=False).size()

for unitid in rows_per_unitid[rows_per_unitid > max_rows_per_unitid].index:
    print(f"There are duplicate institution IDs. Note the following UNIT IDs: {unitid}")

# Any (UNITID, publication_yr) pair that occurs more than once is a duplicate within a year
if vocational_dir.duplicated(subset=["UNITID", "publication_yr"], keep=False).any():
    print("There are duplicate institution IDs by year.")

### Create dataset with only undergraduate programs (highest level of offering is a bachelor's degree)

In [36]:
# Output name -> unfiltered directory dataframe for that year
ug_programs = {"ug_dir_2020": dir_2020,
               "ug_dir_2021": dir_2021,
               "ug_dir_2022": dir_2022,
               "ug_dir_2023": dir_2023,
               "ug_dir_2024": dir_2024}

# Output name -> rows of that year's dataframe whose HLOFFER equals 5 (Bachelor's degree)
ug_dirs_df = {}

for name, val in ug_programs.items():
    # The year is the suffix of the output name, so each message reports its own dataset
    yr = name.rsplit("_", 1)[-1]

    # Evaluated outside the f-string: reusing double quotes inside an f-string is a
    # SyntaxError before Python 3.12
    hloffer_values = val["HLOFFER"].unique()
    print(f"Unique values for highest level of offering in the {yr} dataset: {hloffer_values}")

    ug_dirs_df[name] = val[val["HLOFFER"] == 5]
    print(ug_dirs_df[name].shape)

# Bind the filtered dataframes to explicit names (instead of writing into globals())
# so the variables later cells rely on are visible in the code
ug_dir_2020 = ug_dirs_df["ug_dir_2020"]
ug_dir_2021 = ug_dirs_df["ug_dir_2021"]
ug_dir_2022 = ug_dirs_df["ug_dir_2022"]
ug_dir_2023 = ug_dirs_df["ug_dir_2023"]
ug_dir_2024 = ug_dirs_df["ug_dir_2024"]

Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(740, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(702, 74)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  5  2  8  4  1  6 -3]
(712, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  4  2  8  5  1  6 -3]
(713, 73)
Unqiue values for highest level of offering in the 2021 dataset: [ 9  3  7  2  8  5  1  4  6 -3]
(712, 72)


#### Combine the datasets into a single undergrad schools dataset

In [None]:
# Publication year -> filtered undergraduate dataframe for that year
ug_dirs = {
    2020: ug_dir_2020,
    2021: ug_dir_2021,
    2022: ug_dir_2022,
    2023: ug_dir_2023,
    2024: ug_dir_2024,
}

# One tagged copy per year: .assign returns a new dataframe with the extra
# publication_yr column, so the per-year dataframes themselves are not modified
ug_dirs_dfs = [
    yearly_frame.assign(publication_yr=yr)
    for yr, yearly_frame in ug_dirs.items()
]

# Stack the yearly copies; columns missing from a given year are filled with NaN
undergrad_dir = pd.concat(ug_dirs_dfs, ignore_index=True, sort=False)

# Show the shape (rows, columns) of the combined dataset
undergrad_dir.shape

(3579, 89)

In [38]:
# Dedup logic: a UNITID may appear at most once per publication year, and therefore at most
# as many times as there are publication years in the combined dataset (5 for 2020-2024)
max_rows_per_unitid = undergrad_dir["publication_yr"].nunique()

# A single grouped count replaces one full-column scan per UNITID.
# sort=False keeps the groups in order of first appearance, matching .unique().
rows_per_unitid = undergrad_dir.groupby("UNITID", sort=False).size()

for unitid in rows_per_unitid[rows_per_unitid > max_rows_per_unitid].index:
    print(f"There are duplicate institution IDs. Note the following UNIT IDs: {unitid}")

# Any (UNITID, publication_yr) pair that occurs more than once is a duplicate within a year
if undergrad_dir.duplicated(subset=["UNITID", "publication_yr"], keep=False).any():
    print("There are duplicate institution IDs by year.")

### Remapping existing column names to more user-friendly names

In [39]:
# Use data dictionary to change column names into something more readable
dict_2024 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2024_Dictionary/Varlist-Table 1.csv", encoding='latin1')

# Skip dictionary rows without a variable name or title; a missing title would
# otherwise rename a column to NaN
varlist_2024 = dict_2024.dropna(subset=["varName", "varTitle"])

# Create a dictionary with varName and varTitle.
# Whitespace in titles is collapsed to single spaces because at least one title contains a
# line break ("Fips" / "County code"), which would otherwise end up inside a column name.
col_names_mapping = {
    str(var_name).strip(): " ".join(str(var_title).split())
    for var_name, var_title in zip(varlist_2024["varName"], varlist_2024["varTitle"])
}

# Map the column names. Only variables listed in the 2024 dictionary are renamed;
# columns from earlier years that it does not list (e.g. C18BASIC) keep their codes.
vocational_dir = vocational_dir.rename(columns = col_names_mapping)
undergrad_dir = undergrad_dir.rename(columns = col_names_mapping)

In [40]:
# Preview vocational dataset: display its first two rows
vocational_dir.head(2)

Unnamed: 0,Unique identification number of the institution,Institution (entity) name,Institution name alias,Street address or post office box,City location of institution,State abbreviation,ZIP code,FIPS state code,Bureau of Economic Analysis (BEA) regions,Name of chief administrator,Title of chief administrator,General information telephone number,Employer Identification Number,DUNS,Office of Postsecondary Education (OPE) ID Number,OPE Title IV eligibility indicator code,Institution's internet website address,Admissions office web address,Financial aid office web address,Online application web address,Net price calculator web address,Veterans and Military Servicemembers tuition policies web address,Student-Right-to-Know student athlete graduation rate web address,Disability Services Web Address,Sector of institution,Level of institution,Control of institution,Highest level of offering,Undergraduate offering,Graduate offering,Highest degree offered,Degree-granting status,Historically Black College or University,Institution has hospital,Institution grants a medical degree,Tribal college,Degree of urbanization (Urban-centric locale),Institution open to the general public,Status of institution,UNITID for merged schools,Year institution was deleted from IPEDS,Date institution closed,Institution is active in current year,Primarily postsecondary indicator,Postsecondary institution indicator,Postsecondary and Title IV institution indicator,"Reporting method for student charges, graduation rates, retention rates and student financial aid",Institutional category,C18BASIC,C18IPUG,C18IPGRD,C18UGPRF,C18ENPRF,C18SZSET,C15BASIC,CCBASIC,CARNEGIE,Land Grant Institution,Institution size category,Multi-institution or multi-campus organization,Name of multi-institution or multi-campus organization,Identification number of multi-institution or multi-campus organization,Core Based Statistical Area (CBSA),CBSA Type Metropolitan or Micropolitan,Combined Statistical Area (CSA),NECTA,Fips 
County code,County name,State and 118TH Congressional District ID,Longitude location of institution,Latitude location of institution,Data Feedback Report comparison group created by NCES,Data Feedback Report - Institution submitted a custom comparison group,publication_yr,Carnegie Classification 2021: Basic,C21IPUG,C21IPGRD,C21UGPRF,C21ENPRF,C21SZSET,Unique Entity Identifier (UEI) Numbers,2000 Carnegie Classification (historical - not updated),Carnegie Classification 2025: Institutional Classification,Carnegie Classification 2025: Student Access and Earnings,Carnegie Classification 2025: Research Activity Designation,Carnegie Classification 2025: Institutional Size,Carnegie Classification 2025: Award Level Focus,Carnegie Classification 2025: Undergraduate Academic Program Mix,Carnegie Classification 2025: Graduate Academic Program Mix
0,101277,New Beginning College of Cosmetology,,421 Martling Road,Albertville,AL,35951,1,5,Amanda Baugh,DIRECTOR,2568786430,202711032,4452546,4187200,1,www.nbccosmetology.com/,,,,nbccosmetology.com/wp-content/themes/nbc/nbc_z...,,,www.nbccosmetology.com/,9,3,3,2,1,2,0,2,2,-2,2,2,32,1,A,-2,-2,-2,1,1,1,1,2,6,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,2,1,2,-2,-2,10700,2,-2,-2.0,1095,Marshall County,104,-86.196931,34.27871,32,1,2020,,,,,,,,,,,,,,,
1,102711,Alaska Vocational Technical Center,AVTEC,PO Box 889,Seward,AK,99664-0889,2,8,Cathy LeCompte,Director,9072243322,926001185,120327366,3160300,1,www.avtec.edu/,,,www.avtec.edu/book-page/application,www.avtec.edu/book-page/net-price-calculator,,,www.avtec.edu/,7,3,1,2,1,2,0,2,2,-2,2,2,43,1,A,-2,-2,-2,1,1,1,1,2,6,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,10.0,-2.0,-2.0,2,1,2,-2,-2,-2,-2,-2,-2.0,2122,Kenai Peninsula Borough,200,-149.444487,60.110626,211,2,2020,,,,,,,,,,,,,,,


In [41]:
# Preview undergrad dataset: display its first two rows
undergrad_dir.head(2)

Unnamed: 0,Unique identification number of the institution,Institution (entity) name,Institution name alias,Street address or post office box,City location of institution,State abbreviation,ZIP code,FIPS state code,Bureau of Economic Analysis (BEA) regions,Name of chief administrator,Title of chief administrator,General information telephone number,Employer Identification Number,DUNS,Office of Postsecondary Education (OPE) ID Number,OPE Title IV eligibility indicator code,Institution's internet website address,Admissions office web address,Financial aid office web address,Online application web address,Net price calculator web address,Veterans and Military Servicemembers tuition policies web address,Student-Right-to-Know student athlete graduation rate web address,Disability Services Web Address,Sector of institution,Level of institution,Control of institution,Highest level of offering,Undergraduate offering,Graduate offering,Highest degree offered,Degree-granting status,Historically Black College or University,Institution has hospital,Institution grants a medical degree,Tribal college,Degree of urbanization (Urban-centric locale),Institution open to the general public,Status of institution,UNITID for merged schools,Year institution was deleted from IPEDS,Date institution closed,Institution is active in current year,Primarily postsecondary indicator,Postsecondary institution indicator,Postsecondary and Title IV institution indicator,"Reporting method for student charges, graduation rates, retention rates and student financial aid",Institutional category,C18BASIC,C18IPUG,C18IPGRD,C18UGPRF,C18ENPRF,C18SZSET,C15BASIC,CCBASIC,CARNEGIE,Land Grant Institution,Institution size category,Multi-institution or multi-campus organization,Name of multi-institution or multi-campus organization,Identification number of multi-institution or multi-campus organization,Core Based Statistical Area (CBSA),CBSA Type Metropolitan or Micropolitan,Combined Statistical Area (CSA),NECTA,Fips 
County code,County name,State and 118TH Congressional District ID,Longitude location of institution,Latitude location of institution,Data Feedback Report comparison group created by NCES,Data Feedback Report - Institution submitted a custom comparison group,publication_yr,Carnegie Classification 2021: Basic,C21IPUG,C21IPGRD,C21UGPRF,C21ENPRF,C21SZSET,Unique Entity Identifier (UEI) Numbers,2000 Carnegie Classification (historical - not updated),Carnegie Classification 2025: Institutional Classification,Carnegie Classification 2025: Student Access and Earnings,Carnegie Classification 2025: Research Activity Designation,Carnegie Classification 2025: Institutional Size,Carnegie Classification 2025: Award Level Focus,Carnegie Classification 2025: Undergraduate Academic Program Mix,Carnegie Classification 2025: Graduate Academic Program Mix
0,100937,Birmingham-Southern College,BSC,900 Arkadelphia Road,Birmingham,AL,35254,1,5,Mr. Daniel Coleman,President,2052264600,630708730,67136580,101200,1,www.bsc.edu/,www.bsc.edu/admission/index.html,https://www.bsc.edu/fp/index.html,https://www.bsc.edu/admission/apply.html,www.bsc.edu/fp/np-calculator.cfm,https://www.bsc.edu/fp/veterans.html,https://www.bsc.edu/ire/student-consumer-infor...,https://www.bsc.edu/campus/accomodations/index...,2,1,2,5,1,2,30,1,2,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,21.0,9.0,0.0,14.0,2.0,11.0,21.0,21.0,31.0,2,2,2,-2,-2,13820,1,142,-2.0,1073,Jefferson County,107,-86.850552,33.513774,132,1,2020,,,,,,,,,,,,,,,
1,101435,Huntingdon College,,1500 East Fairview Avenue,Montgomery,AL,36106-2148,1,5,J. Cameron West,President,3348334497,630288841,82145608,101900,1,www.huntingdon.edu/,www.huntingdon.edu/admission-aid/,www.huntingdon.edu/admission-aid/student-finan...,www.huntingdon.edu/admission-aid/traditional-a...,hawk.huntingdon.edu/oiac/netpricecalculator/in...,,,www.huntingdon.edu/academics/academic-resource...,2,1,2,5,1,2,30,1,2,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,22.0,15.0,0.0,8.0,2.0,7.0,22.0,22.0,31.0,2,1,2,-2,-2,33860,1,388,-2.0,1101,Montgomery County,102,-86.284366,32.351034,139,1,2020,,,,,,,,,,,,,,,


In [47]:
# Write both cleaned directory datasets to CSV, leaving out the row index
exports = {
    "../Cleaned data/cleaned_vocational_directory_information.csv": vocational_dir,
    "../Cleaned data/cleaned_undergrad_directory_information.csv": undergrad_dir,
}

for out_path, frame in exports.items():
    frame.to_csv(out_path, index=False)

## Section 2: 12M Enrollment Demographics

In [42]:
# Load the yearly 12-month enrollment (demographic) extracts, one dataframe per academic year
enroll_19_20 = pd.read_csv("../../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_19-20.csv", encoding='latin1')
enroll_20_21 = pd.read_csv("../../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_20-21.csv", encoding='latin1')
enroll_21_22 = pd.read_csv("../../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_21-22.csv", encoding='latin1')

# The 22-23 and 23-24 files begin with a UTF-8 byte-order mark. Decoded as latin1 the mark
# turns into "ï»¿" glued onto the first header ("ï»¿UNITID"); 'utf-8-sig' decodes UTF-8
# and drops the mark.
enroll_22_23 = pd.read_csv("../../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_22-23.csv", encoding='utf-8-sig')
enroll_23_24 = pd.read_csv("../../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_23-24.csv", encoding='utf-8-sig')

In [43]:
# Report (rows, columns) for each enrollment dataframe, labelled by the year its academic year ends
for yr_label, frame in (
    (2020, enroll_19_20),
    (2021, enroll_20_21),
    (2022, enroll_21_22),
    (2023, enroll_22_23),
    (2024, enroll_23_24),
):
    print(f"Shape of {yr_label} Dataframe:{frame.shape}")

Shape of 2020 Dataframe:(103723, 64)
Shape of 2021 Dataframe:(103615, 64)
Shape of 2022 Dataframe:(117543, 72)
Shape of 2023 Dataframe:(116437, 72)
Shape of 2024 Dataframe:(115026, 72)


In [44]:
# Cells that are empty or contain only whitespace become NaN in every enrollment dataframe
BLANK_CELL_PATTERN = r'^\s*$'

enroll_19_20, enroll_20_21, enroll_21_22, enroll_22_23, enroll_23_24 = (
    frame.replace(BLANK_CELL_PATTERN, np.nan, regex=True)
    for frame in (enroll_19_20, enroll_20_21, enroll_21_22, enroll_22_23, enroll_23_24)
)

In [45]:
# The 22-23 and 23-24 enrollment dataframes carry a garbled "UNITID" header; restore the plain name
unitid_header_fix = {"ï»¿UNITID": "UNITID"}
enroll_22_23 = enroll_22_23.rename(columns=unitid_header_fix)
enroll_23_24 = enroll_23_24.rename(columns=unitid_header_fix)

#### Map column names to user-friendly names using IPEDS-provided data dictionary

In [None]:
# Import the data dictionary (variable list) for the 23-24 12-month enrollment file
# NOTE(review): every other read in this notebook uses "../../Data/2024/..."; this path
# starts with "../Data/2024/" — confirm which location is correct before running.
enroll_dict_2024 = pd.read_csv("../Data/2024/Institutional-Characteristics_12M-Enrollment_Demographic_23-24_Dictionary/Varlist-Table 1.csv")

# Create a dictionary with varName and varTitle
ecol_names_mapping = dict(zip(enroll_dict_2024["varName"], enroll_dict_2024["varTitle"]))

# Map the column names of the enrollment dataframes.
# This cell previously re-applied the directory mapping (col_names_mapping) to the
# directory dataframes, so ecol_names_mapping was built but never used.
enroll_19_20 = enroll_19_20.rename(columns = ecol_names_mapping)
enroll_20_21 = enroll_20_21.rename(columns = ecol_names_mapping)
enroll_21_22 = enroll_21_22.rename(columns = ecol_names_mapping)
enroll_22_23 = enroll_22_23.rename(columns = ecol_names_mapping)
enroll_23_24 = enroll_23_24.rename(columns = ecol_names_mapping)

## Section 3: Education Offerings

## Section 4: Awards Demographics

## Section 5: Admissions & Test Scores

## Section 6: Financial Aid & Net Price

## Section 7: Graduation Rates

## Section 8: Student Charges for Vocational Schools

## Section 9: Total Cost & Attendance