# Data Cleaning: Institutional Charactertistics

In [1]:
# Relevant imports
import pandas as pd
import missingno as msno
import numpy as np

# See all rows and columns in the notebook
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Section 1: Directory Information

Dataset cleaned: "Institutional Characteristics -- Directory information" from Integrated Postsecondary Education Data System via the National Center for Education Statistics. A data dictionary, obtained from the same source, was used to understand the names and definitions of the features.

### Load datasets & quick exploration

In [2]:
# Load dataset for directory information
dir_2018 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2018.csv", encoding='latin1')
dir_2019 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2019.csv", encoding='latin1')
dir_2020 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2020.csv", encoding='latin1')
dir_2021 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2021.csv", encoding='latin1')
dir_2022 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2022.csv", encoding='latin1')
dir_2023 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2023.csv", encoding='latin1')
dir_2024 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2024.csv", encoding='latin1')


In [3]:
# Shape of the dataframes
print(f"Shape of 2018 Dataframe:{dir_2018.shape}")
print(f"Shape of 2019 Dataframe:{dir_2019.shape}")
print(f"Shape of 2020 Dataframe:{dir_2020.shape}")
print(f"Shape of 2021 Dataframe:{dir_2021.shape}")
print(f"Shape of 2022 Dataframe:{dir_2022.shape}")
print(f"Shape of 2023 Dataframe:{dir_2023.shape}")
print(f"Shape of 2024 Dataframe:{dir_2024.shape}")

Shape of 2018 Dataframe:(6857, 73)
Shape of 2019 Dataframe:(6559, 73)
Shape of 2020 Dataframe:(6440, 73)
Shape of 2021 Dataframe:(6289, 74)
Shape of 2022 Dataframe:(6256, 73)
Shape of 2023 Dataframe:(6163, 73)
Shape of 2024 Dataframe:(6072, 72)


In [4]:
# Replace "empty" cells with NaN
dir_2018 = dir_2018.replace(r'^\s*$', np.nan, regex=True)
dir_2019 = dir_2019.replace(r'^\s*$', np.nan, regex=True)
dir_2020 = dir_2020.replace(r'^\s*$', np.nan, regex=True)
dir_2021 = dir_2021.replace(r'^\s*$', np.nan, regex=True)
dir_2022 = dir_2022.replace(r'^\s*$', np.nan, regex=True)
dir_2023 = dir_2023.replace(r'^\s*$', np.nan, regex=True)
dir_2024 = dir_2024.replace(r'^\s*$', np.nan, regex=True)

In [5]:
# Changing the name of the "UNITID" col in the dataframes for 2023 and 2024 
dir_2024 = dir_2024.rename(columns = {"ï»¿UNITID": "UNITID"})
dir_2023 = dir_2023.rename(columns = {"ï»¿UNITID": "UNITID"})

# Create a dictionary of the dataframes corresponding to their years of publication
dirs = {
    2018: dir_2018,
    2019: dir_2019,
    2020: dir_2020,
    2021: dir_2021,
    2022: dir_2022,
    2023: dir_2023,
    2024: dir_2024,
} 

In [6]:
# Create a list of all the columns present in all 5 datasets
all_cols = sorted(set().union(*[dir_df.columns for dir_df in dirs.values()])) # NOTE: Used ChatGPT to get this more concise line of code

# Create an empty dictionary to hold each year and its corresponding cols
yr_cols_dict = {}

for yr, df in dirs.items():

    # Create empty list to hold existing cols
    exist = []

    # Loop through each column name
    for col in all_cols:

        # Identify if column is present in each dataframe
        exist.append(col in df.columns) # Note: ChatGPT suggested this instead of the previous written "if" statement since lengths of df dont match across all and when calling "all_cols" as index later, it becomes an issue

    # Add the list of existing cols to the dictionary of 
    yr_cols_dict[yr] = exist

# Convert dictionary to dataframe
yr_cols_df = pd.DataFrame(yr_cols_dict, index = all_cols)

# See all rows
print(yr_cols_df)

               2018   2019   2020   2021   2022   2023   2024
ACT            True   True   True   True   True   True   True
ADDR           True   True   True   True   True   True   True
ADMINURL       True   True   True   True   True   True   True
APPLURL        True   True   True   True   True   True   True
ATHURL         True   True   True   True   True   True   True
C00CARNEGIE   False  False  False  False  False  False   True
C15BASIC       True   True   True   True   True   True  False
C18BASIC       True   True   True   True   True   True  False
C18ENPRF       True   True   True  False  False  False  False
C18IPGRD       True   True   True  False  False  False  False
C18IPUG        True   True   True  False  False  False  False
C18SZSET       True   True   True  False  False  False  False
C18UGPRF       True   True   True  False  False  False  False
C21BASIC      False  False  False   True   True   True   True
C21ENPRF      False  False  False   True   True   True  False
C21IPGRD

### Combine all years' datasets

In [7]:
# Create a dictionary of the dataframes corresponding to their years of publication
dirs = {
    2018: dir_2018, 
    2019: dir_2019,
    2020: dir_2020,
    2021: dir_2021,
    2022: dir_2022,
    2023: dir_2023,
    2024: dir_2024,
} 

# Initialize empty list
dirs_dfs = []

for yr, df in dirs.items():

    # Create a copy of each dataframe
    dir_copy = df.copy()

    # Add a publication year column
    dir_copy["publication_yr"] = yr

    dirs_dfs.append(dir_copy)

# Concatenate the datasts
directory = pd.concat(dirs_dfs, ignore_index = True, sort = False)

# Print preview of the new dataset
directory.shape

(44636, 89)

In [8]:
# Dedup logic: Each UNITID should only be present a maximum of 7 times and only one for each year at the maximum
for i in directory["UNITID"].unique():
    i_count = (directory["UNITID"] == i).sum()
    if i_count > 7:
        print(f"There are duplicate institution IDs. Note the following UNIT IDs: {i}")
    

if directory.duplicated(subset=["UNITID", "publication_yr"], keep=False).any():
    print(f"There are duplicate institution IDs by year: {directory["UNITID"]}")

### Mapping degree names from data dictionary

In [9]:
# Use data dictionary to change column names into something more readable
dict_dir_2024 = pd.read_csv("../../Data/2024/Institutional-Characteristics_Directory-Information_2024_Dictionary/Varlist-Table 1.csv", encoding='latin1')


From the data dictionary, we get this information:

Highest level of offering (generated, based on response to IC
survey)<br>
0 - Other<br>
1 - Postsecondary award, certificate or diploma of less than one academic year<br>
2 - Postsecondary award, certificate or diploma of at least one but less than two academic years<br>
3 - Associate's degree<br>
4 - Postsecondary award, certificate or diploma of at least two but less than four academic years<br>
5 - Bachelor's degree<br>
6 - Postbaccalaureate certificate<br>
7 - Master's degree<br>
8 - Post-master's certificate<br>
9 - Doctor's degree<br>
b - None of the above or no answer<br>
-2 - Not applicable, first-professional only<br>
-3 - Not Available

In [10]:
#map values back to the column
hloffer_dict = {
    0: "Other",
    1: "Postsecondary award, certificate or diploma of less than one academic year",
    2: "Postsecondary award, certificate or diploma of at least one but less than two academic years",
    3: "Associate's degree",
    4: "Postsecondary award, certificate or diploma of at least two but less than four academic years",
    5: "Bachelor's degree",
    6: "Postbaccalaureate certificate",
    7: "Master's degree",
    8: "Post-master's certificate",
    9: "Doctor's degree",
    "b": "None of the above or no answer",
    "-2": "Not applicable, first-professional only",
    "-3": "Not Available"
}

directory["HLOFFER"] = directory["HLOFFER"].replace(hloffer_dict)


In [11]:
directory.head()

Unnamed: 0,UNITID,INSTNM,IALIAS,ADDR,CITY,STABBR,ZIP,FIPS,OBEREG,CHFNM,CHFTITLE,GENTELE,EIN,DUNS,OPEID,OPEFLAG,WEBADDR,ADMINURL,FAIDURL,APPLURL,NPRICURL,VETURL,ATHURL,DISAURL,SECTOR,ICLEVEL,CONTROL,HLOFFER,UGOFFER,GROFFER,HDEGOFR1,DEGGRANT,HBCU,HOSPITAL,MEDICAL,TRIBAL,LOCALE,OPENPUBL,ACT,NEWID,DEATHYR,CLOSEDAT,CYACTIVE,POSTSEC,PSEFLAG,PSET4FLG,RPTMTH,INSTCAT,C18BASIC,C18IPUG,C18IPGRD,C18UGPRF,C18ENPRF,C18SZSET,C15BASIC,CCBASIC,CARNEGIE,LANDGRNT,INSTSIZE,F1SYSTYP,F1SYSNAM,F1SYSCOD,CBSA,CBSATYPE,CSA,NECTA,COUNTYCD,COUNTYNM,CNGDSTCD,LONGITUD,LATITUDE,DFRCGID,DFRCUSCG,publication_yr,C21BASIC,C21IPUG,C21IPGRD,C21UGPRF,C21ENPRF,C21SZSET,UEIS,C00CARNEGIE,CARNEGIEIC,CARNEGIESAEC,CARNEGIERSCH,CARNEGIESIZE,CARNEGIEALF,CARNEGIEAPM,CARNEGIEGPM
0,100654,Alabama A & M University,AAMU,4900 Meridian Street,Normal,AL,35762,1,5,"Dr. Andrew Hugine, Jr.",President,2563725000,636001109,197216455,100200,1,www.aamu.edu/,www.aamu.edu/Admissions/Pages/default.aspx,www.aamu.edu/admissions/fincialaid/pages/defau...,https://www.aamu.edu/Admissions/UndergraduateA...,https://galileo.aamu.edu/NetPriceCalculator/np...,,,www.aamu.edu/administrativeoffices/VADS/Pages/...,1,1,1,Doctor's degree,1,1,12,1,1,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,18.0,16.0,17.0,10.0,4.0,14.0,18.0,18.0,16.0,1,3,2,-2,-2,26620,1,290,-2.0,1089,Madison County,105,-86.568502,34.783368,119,1,2018,,,,,,,,,,,,,,,
1,100663,University of Alabama at Birmingham,,Administration Bldg Suite 1070,Birmingham,AL,35294-0110,1,5,Ray L. Watts,President,2059344011,636005396,63690705,105200,1,www.uab.edu,www.uab.edu/students/undergraduate-admissions,www.uab.edu/students/paying-for-college,https://idm.uab.edu/myuab/login?from=ugadmapp,uab.studentaidcalculator.com/survey.aspx,www.uab.edu/students/veterans,www.uab.edu/registrar/students,www.uab.edu/students/disability/,1,1,1,Doctor's degree,1,1,11,1,2,1,1,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,15.0,17.0,17.0,9.0,5.0,15.0,15.0,15.0,15.0,2,5,1,The University of Alabama System,101050,13820,1,142,-2.0,1073,Jefferson County,107,-86.799345,33.505697,105,1,2018,,,,,,,,,,,,,,,
2,100690,Amridge University,Southern Christian University |Regions University,1200 Taylor Rd,Montgomery,AL,36117-3553,1,5,Michael C.Turner,President,33438738777528,237034324,126307792,2503400,1,www.amridgeuniversity.edu,www.amridgeuniversity.edu/admissions/,www.amridgeuniversity.edu/financialaid/,https://www2.amridgeuniversity.edu/Amridge/Log...,www2.amridgeuniversity.edu:9091/,www.amridgeuniversity.edu/admissions/military/,,www.amridgeuniversity.edu/pdf/Amridge%20Univer...,2,1,2,Doctor's degree,1,1,12,1,2,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,20.0,19.0,18.0,5.0,5.0,6.0,20.0,21.0,51.0,2,1,2,-2,-2,33860,1,-2,-2.0,1101,Montgomery County,102,-86.17401,32.362609,137,2,2018,,,,,,,,,,,,,,,
3,100706,University of Alabama in Huntsville,UAH |University of Alabama Huntsville,301 Sparkman Dr,Huntsville,AL,35899,1,5,Robert A. Altenkirch,President,2568246120,630520830,949687123,105500,1,www.uah.edu,https://www.uah.edu/admissions,finaid.uah.edu/,register.uah.edu,finaid.uah.edu/,www.uah.edu/admissions/graduate/financial-aid/...,www.uah.edu/heoa,www.uah.edu/health-and-wellness/disability-sup...,1,1,1,Doctor's degree,1,1,11,1,2,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,16.0,17.0,17.0,15.0,4.0,12.0,16.0,15.0,16.0,2,3,1,The University of Alabama System,101050,26620,1,290,-2.0,1089,Madison County,105,-86.640449,34.724557,109,2,2018,,,,,,,,,,,,,,,
4,100724,Alabama State University,,915 S Jackson Street,Montgomery,AL,36104-0271,1,5,Quinton T. Ross,President,3342294100,636001101,40672685,100500,1,www.alasu.edu,www.alasu.edu/admissions/index.aspx,www.alasu.edu/cost-aid/index.aspx,www.alasu.edu/admissions/undergrad-admissions/...,www.alasu.edu/cost-aid/forms/calculator/index....,,www.alasu.edu/search-results/index.aspx,www.alasu.edu/about-asu/the-campus/disability-...,1,1,1,Doctor's degree,1,1,11,1,1,2,2,2,12,1,A,-2,-2,-2,1,1,1,1,1,2,19.0,13.0,13.0,10.0,3.0,14.0,19.0,18.0,21.0,2,2,2,-2,-2,33860,1,-2,-2.0,1101,Montgomery County,107,-86.295677,32.364317,127,1,2018,,,,,,,,,,,,,,,


### Mapping column names from data dictionary

In [12]:
# Create a dictionary with varName and varTitle from data dictionary
col_names_mapping = dict(zip(dict_dir_2024["varName"], dict_dir_2024["varTitle"]))

# Map the column names 
directory.rename(columns = col_names_mapping, inplace=True)

### Remove irrelevant columns

In [13]:
cols_to_remove = ["General information telephone number", "Employer Identification Number", "DUNS", "Office of Postsecondary Education (OPE) ID Number", 
                  "OPE Title IV eligibility indicator code", "Institution's internet website address", "Admissions office web address", "Financial aid office web address",
                  "Online application web address", "Net price calculator web address", "Veterans and Military Servicemembers tuition policies web address", 
                  "Student-Right-to-Know student athlete graduation rate web address", "Disability Services Web Address", "Year institution was deleted from IPEDS",
                  "Date institution closed", "Unique Entity Identifier (UEI) Numbers", "Carnegie Classification 2021: Basic", "C21IPUG", "C21IPGRD", "C21UGPRF", "C21ENPRF", "C21SZSET",
                  "2000 Carnegie Classification (historical - not updated)", "Carnegie Classification 2025: Institutional Classification", 
                  "Carnegie Classification 2025: Student Access and Earnings", #"Carnegie Classification 2025: Research Activity Designation",
                  "Carnegie Classification 2025: Institutional Size", "Carnegie Classification 2025: Award Level Focus", 
                  "Carnegie Classification 2025: Undergraduate Academic Program Mix", "Carnegie Classification 2025: Graduate Academic Program Mix"
                  ]

directory = directory.drop(columns = cols_to_remove)

In [14]:
directory.head()

Unnamed: 0,Unique identification number of the institution,Institution (entity) name,Institution name alias,Street address or post office box,City location of institution,State abbreviation,ZIP code,FIPS state code,Bureau of Economic Analysis (BEA) regions,Name of chief administrator,Title of chief administrator,Sector of institution,Level of institution,Control of institution,Highest level of offering,Undergraduate offering,Graduate offering,Highest degree offered,Degree-granting status,Historically Black College or University,Institution has hospital,Institution grants a medical degree,Tribal college,Degree of urbanization (Urban-centric locale),Institution open to the general public,Status of institution,UNITID for merged schools,Institution is active in current year,Primarily postsecondary indicator,Postsecondary institution indicator,Postsecondary and Title IV institution indicator,"Reporting method for student charges, graduation rates, retention rates and student financial aid",Institutional category,C18BASIC,C18IPUG,C18IPGRD,C18UGPRF,C18ENPRF,C18SZSET,C15BASIC,CCBASIC,CARNEGIE,Land Grant Institution,Institution size category,Multi-institution or multi-campus organization,Name of multi-institution or multi-campus organization,Identification number of multi-institution or multi-campus organization,Core Based Statistical Area (CBSA),CBSA Type Metropolitan or Micropolitan,Combined Statistical Area (CSA),NECTA,Fips County code,County name,State and 118TH Congressional District ID,Longitude location of institution,Latitude location of institution,Data Feedback Report comparison group created by NCES,Data Feedback Report - Institution submitted a custom comparison group,publication_yr,Carnegie Classification 2025: Research Activity Designation
0,100654,Alabama A & M University,AAMU,4900 Meridian Street,Normal,AL,35762,1,5,"Dr. Andrew Hugine, Jr.",President,1,1,1,Doctor's degree,1,1,12,1,1,2,2,2,12,1,A,-2,1,1,1,1,1,2,18.0,16.0,17.0,10.0,4.0,14.0,18.0,18.0,16.0,1,3,2,-2,-2,26620,1,290,-2.0,1089,Madison County,105,-86.568502,34.783368,119,1,2018,
1,100663,University of Alabama at Birmingham,,Administration Bldg Suite 1070,Birmingham,AL,35294-0110,1,5,Ray L. Watts,President,1,1,1,Doctor's degree,1,1,11,1,2,1,1,2,12,1,A,-2,1,1,1,1,1,2,15.0,17.0,17.0,9.0,5.0,15.0,15.0,15.0,15.0,2,5,1,The University of Alabama System,101050,13820,1,142,-2.0,1073,Jefferson County,107,-86.799345,33.505697,105,1,2018,
2,100690,Amridge University,Southern Christian University |Regions University,1200 Taylor Rd,Montgomery,AL,36117-3553,1,5,Michael C.Turner,President,2,1,2,Doctor's degree,1,1,12,1,2,2,2,2,12,1,A,-2,1,1,1,1,1,2,20.0,19.0,18.0,5.0,5.0,6.0,20.0,21.0,51.0,2,1,2,-2,-2,33860,1,-2,-2.0,1101,Montgomery County,102,-86.17401,32.362609,137,2,2018,
3,100706,University of Alabama in Huntsville,UAH |University of Alabama Huntsville,301 Sparkman Dr,Huntsville,AL,35899,1,5,Robert A. Altenkirch,President,1,1,1,Doctor's degree,1,1,11,1,2,2,2,2,12,1,A,-2,1,1,1,1,1,2,16.0,17.0,17.0,15.0,4.0,12.0,16.0,15.0,16.0,2,3,1,The University of Alabama System,101050,26620,1,290,-2.0,1089,Madison County,105,-86.640449,34.724557,109,2,2018,
4,100724,Alabama State University,,915 S Jackson Street,Montgomery,AL,36104-0271,1,5,Quinton T. Ross,President,1,1,1,Doctor's degree,1,1,11,1,1,2,2,2,12,1,A,-2,1,1,1,1,1,2,19.0,13.0,13.0,10.0,3.0,14.0,19.0,18.0,21.0,2,2,2,-2,-2,33860,1,-2,-2.0,1101,Montgomery County,107,-86.295677,32.364317,127,1,2018,


### Export the dataset

In [15]:
# Export the dataset
directory.to_csv("../Cleaned data/institutional/cleaned_directory_information.csv", index=False)  

## Section 2: Fields of Study

This section is concerned with the fields of study pursued by students at different insitutions. This data is sourced from National Student Clearinghouse Research Center. The data source identifies institutions with vocational programs as "public two-year" and "public PABs."

### Load datasets

In [16]:
majors = pd.read_csv("../../Data/2024/CTEESpring2025-DataAppendix(1)/CIP Group Enrollment-Table 1.csv", encoding = 'latin1')
majors_by_state = pd.read_csv("../../Data/2024/CTEESpring2025-DataAppendix(1)/CIP Family Enrollment by State-Table 1.csv", encoding = 'latin1')

### Correcting column names

In [17]:
major_col_names = ["Award Level and Institution Type", "Major Field Family (2-digit CIP)", "Major Field Family (2-digit CIP) Title", 
"Major Field Group (4-digit CIP)", "Major Field Group (4-digit CIP) Title", 
"2020 Enrollment", "2021 Enrollment", "2021 % Change from Previous Year", 
"2022 Enrollment", "2022 % Change from Previous Year", 
"2023 Enrollment", "2023 % Change from Previous Year", 
"2024 Enrollment", "2024 % Change from Previous Year", 
"2025 Enrollment", "2025 % Change from Previous Year"]

major_by_state_col_names = ["Award Level and Institution Type", "State", "Region",
"Major Field Family (2-digit CIP)", "Major Field Family (2-digit CIP) Title",  
"2020 Enrollment", "2021 Enrollment", "2021 % Change from Previous Year", 
"2022 Enrollment", "2022 % Change from Previous Year", 
"2023 Enrollment", "2023 % Change from Previous Year", 
"2024 Enrollment", "2024 % Change from Previous Year", 
"2025 Enrollment", "2025 % Change from Previous Year"]  

# Map the column names 
majors.columns = major_col_names
majors_by_state.columns = major_by_state_col_names

# Remove first two rows for both
majors.drop(index = majors.index[:2], axis = 0, inplace = True)
majors_by_state.drop(index = majors_by_state.index[:2], axis = 0, inplace = True)

### Quick explorations

In [18]:
print(f"Majors: {majors.shape}")
print(f"Majors by State: {majors_by_state.shape}")

Majors: (1410, 16)
Majors by State: (7488, 16)


In [19]:
# Info of the datasets
print(majors.info())
print(majors_by_state.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1410 entries, 2 to 1411
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Award Level and Institution Type        1410 non-null   object
 1   Major Field Family (2-digit CIP)        1410 non-null   object
 2   Major Field Family (2-digit CIP) Title  1410 non-null   object
 3   Major Field Group (4-digit CIP)         1410 non-null   object
 4   Major Field Group (4-digit CIP) Title   1410 non-null   object
 5   2020 Enrollment                         1410 non-null   object
 6   2021 Enrollment                         1410 non-null   object
 7   2021 % Change from Previous Year        1160 non-null   object
 8   2022 Enrollment                         1410 non-null   object
 9   2022 % Change from Previous Year        1187 non-null   object
 10  2023 Enrollment                         1410 non-null   object
 11  2023

### Cleaning

In [20]:
# Replace *s with nulls
majors.replace('*', np.nan, inplace = True)
majors_by_state.replace('*', np.nan, inplace = True)

In [None]:
# Remove percentage signs and cols from the %Chg cols
cols_to_fix = [
    "2021 % Change from Previous Year",
    "2022 % Change from Previous Year",
    "2023 % Change from Previous Year",
    "2024 % Change from Previous Year",
    "2025 % Change from Previous Year"
]

for col in cols_to_fix:
    majors[col] = majors[col].str.replace("%", "", regex=False).str.replace(",", "", regex=False).astype(float)
    majors_by_state[col] = majors_by_state[col].str.replace("%", "", regex=False).str.replace(",", "", regex=False).astype(float)


# Replace commas
majors = majors.replace({",": ""}, regex = True)
majors_by_state = majors_by_state.replace({",": ""}, regex = True)

In [22]:
# change dtype of the cols
major_type_dict = {"2020 Enrollment": "Int64", "2021 Enrollment": "Int64", "2022 Enrollment": "Int64", "2023 Enrollment": "Int64", "2024 Enrollment": "Int64", "2025 Enrollment": "Int64",
                "2021 % Change from Previous Year": float, "2022 % Change from Previous Year": float, "2023 % Change from Previous Year": float, 
                "2024 % Change from Previous Year": float, "2025 % Change from Previous Year": float}

majors = majors.astype(major_type_dict)
majors_by_state = majors_by_state.astype(major_type_dict)

In [23]:
majors.head(1)

Unnamed: 0,Award Level and Institution Type,Major Field Family (2-digit CIP),Major Field Family (2-digit CIP) Title,Major Field Group (4-digit CIP),Major Field Group (4-digit CIP) Title,2020 Enrollment,2021 Enrollment,2021 % Change from Previous Year,2022 Enrollment,2022 % Change from Previous Year,2023 Enrollment,2023 % Change from Previous Year,2024 Enrollment,2024 % Change from Previous Year,2025 Enrollment,2025 % Change from Previous Year
2,Undergraduate 4-year,CIP Missing,CIP Missing,Total,Total,306775,307708,0.3,310403,0.9,393153,26.7,326145,-17.0,337969,3.6


In [82]:
majors_by_state.head(1)

Unnamed: 0,Award Level and Institution Type,State,Region,Major Field Family (2-digit CIP),Major Field Family (2-digit CIP) Title,2020 Enrollment,2021 Enrollment,2021 % Change from Previous Year,2022 Enrollment,2022 % Change from Previous Year,2023 Enrollment,2023 % Change from Previous Year,2024 Enrollment,2024 % Change from Previous Year,2025 Enrollment,2025 % Change from Previous Year
2,Undergraduate 4-year,Alabama,South,CIP Missing,CIP Missing,2865,2315,-19.2,2494,7.7,3608,44.7,3926,8.8,4036,2.8


In [80]:
majors["Award Level and Institution Type"].value_counts()

Award Level and Institution Type
Undergraduate 4-year    470
Undergraduate PAB       470
Undergraduate 2-year    470
Name: count, dtype: int64

In [81]:
majors[majors["Award Level and Institution Type"] == "Undergraduate 2-year"]["Major Field Family (2-digit CIP) Title"].value_counts()

Major Field Family (2-digit CIP) Title
Multi/Interdisciplinary Studies                                 50
Engineering                                                     42
Health Professions and Related Clinical Sciences                35
Business Management Marketing and Related Support               23
Agriculture Agriculture Operations and Related Sciences         20
Engineering Technologies/Technicians                            20
Foreign Languages Literatures and Linguistics                   19
Education                                                       16
Biological and Biomedical Sciences                              16
Social Sciences                                                 15
Computer and Information Sciences and Support Services          12
Visual and Performing Arts                                      12
Family and Consumer Sciences/Human Sciences                     11
Architecture and Related Services                               10
Physical Sciences      

### Export cleaned datasets

In [25]:
# Export the two datasets
majors.to_csv("../Cleaned data/institutional/majors.csv", index=False)
majors_by_state.to_csv("../Cleaned data/institutional/majors_by_state.csv", index=False)  

## Section 3: Graduation Rates

Dataset cleaned: "Institutional Characteristics -- Graduation Rates" from Integrated Postsecondary Education Data System (IPEDS) via the National Center for Education Statistics. A data dictionary, obtained from the same source, was used to understand the names and definitions of the features.

### Load datasets and quick exploration

In [26]:
# Load the datasets
grad_2018 = pd.read_csv("../../Data/2024/Graduation-Rates-2018.csv", encoding = 'latin1')
grad_2019 = pd.read_csv("../../Data/2024/Graduation-Rates-2019.csv", encoding = 'latin1')
grad_2020 = pd.read_csv("../../Data/2024/Graduation-Rates-2020.csv", encoding = 'latin1')
grad_2021 = pd.read_csv("../../Data/2024/Graduation-Rates-2021.csv", encoding = 'latin1')
grad_2022 = pd.read_csv("../../Data/2024/Graduation-Rates-2022.csv", encoding = 'latin1')
grad_2023 = pd.read_csv("../../Data/2024/Graduation-Rates-2023.csv", encoding = 'latin1')

In [27]:
# Shape of the dataframes
print(f"Shape of 2018 Dataframe:{grad_2018.shape}")
print(f"Shape of 2019 Dataframe:{grad_2019.shape}")
print(f"Shape of 2020 Dataframe:{grad_2020.shape}")
print(f"Shape of 2021 Dataframe:{grad_2021.shape}")
print(f"Shape of 2022 Dataframe:{grad_2022.shape}")
print(f"Shape of 2023 Dataframe:{grad_2023.shape}")

Shape of 2018 Dataframe:(51906, 66)
Shape of 2019 Dataframe:(50493, 66)
Shape of 2020 Dataframe:(50103, 66)
Shape of 2021 Dataframe:(49547, 66)
Shape of 2022 Dataframe:(48217, 66)
Shape of 2023 Dataframe:(51368, 66)


In [28]:
# Replace "empty" cells with NaN
grad_2018 = grad_2018.replace(r'^\s*$', np.nan, regex=True)
grad_2019 = grad_2019.replace(r'^\s*$', np.nan, regex=True)
grad_2020 = grad_2020.replace(r'^\s*$', np.nan, regex=True)
grad_2021 = grad_2021.replace(r'^\s*$', np.nan, regex=True)
grad_2022 = grad_2022.replace(r'^\s*$', np.nan, regex=True)
grad_2023 = grad_2023.replace(r'^\s*$', np.nan, regex=True)

### Join the datasets together

In [29]:
# Create a dictionary of the dataframes corresponding to their years of publication
grads = {
    2018: grad_2018, 
    2019: grad_2019,
    2020: grad_2020,
    2021: grad_2021,
    2022: grad_2022,
    2023: grad_2023
} 

# Initialize empty list
grads_dfs = []

for yr, df in grads.items():

    # Create a copy of each dataframe
    grads_copy = df.copy()

    # Add a publication year column
    grads_copy["publication_yr"] = yr

    grads_dfs.append(grads_copy)

# Concatenate the datasts
grad = pd.concat(grads_dfs, ignore_index = True, sort = False)

# Print preview of the new dataset
grad.shape

(301634, 67)

In [30]:
grad.head()

Unnamed: 0,UNITID,GRTYPE,CHRTSTAT,SECTION,COHORT,LINE,XGRTOTLT,GRTOTLT,XGRTOTLM,GRTOTLM,XGRTOTLW,GRTOTLW,XGRAIANT,GRAIANT,XGRAIANM,GRAIANM,XGRAIANW,GRAIANW,XGRASIAT,GRASIAT,XGRASIAM,GRASIAM,XGRASIAW,GRASIAW,XGRBKAAT,GRBKAAT,XGRBKAAM,GRBKAAM,XGRBKAAW,GRBKAAW,XGRHISPT,GRHISPT,XGRHISPM,GRHISPM,XGRHISPW,GRHISPW,XGRNHPIT,GRNHPIT,XGRNHPIM,GRNHPIM,XGRNHPIW,GRNHPIW,XGRWHITT,GRWHITT,XGRWHITM,GRWHITM,XGRWHITW,GRWHITW,XGR2MORT,GR2MORT,XGR2MORM,GR2MORM,XGR2MORW,GR2MORW,XGRUNKNT,GRUNKNT,XGRUNKNM,GRUNKNM,XGRUNKNW,GRUNKNW,XGRNRALT,GRNRALT,XGRNRALM,GRNRALM,XGRNRALW,GRNRALW,publication_yr
0,100654,2,12,1,1,999,R,756,R,371.0,R,385.0,R,0.0,R,0.0,R,0.0,R,0.0,R,0.0,R,0.0,R,731.0,R,363.0,R,368.0,R,4.0,R,1.0,R,3.0,R,1.0,R,0.0,R,1.0,R,8.0,R,3.0,R,5.0,R,4.0,R,1.0,R,3.0,R,8.0,R,3.0,R,5.0,R,0.0,R,0.0,R,0.0,2018
1,100654,3,13,1,1,999,R,203,R,85.0,R,118.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,R,196.0,R,82.0,R,114.0,R,1.0,Z,0.0,R,1.0,Z,0.0,Z,0.0,Z,0.0,R,2.0,R,1.0,R,1.0,R,1.0,R,1.0,Z,0.0,R,3.0,R,1.0,R,2.0,Z,0.0,Z,0.0,Z,0.0,2018
2,100654,4,20,1,1,999,R,318,R,154.0,R,164.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,R,302.0,R,149.0,R,153.0,R,3.0,R,1.0,R,2.0,R,1.0,Z,0.0,R,1.0,R,5.0,R,2.0,R,3.0,R,3.0,Z,0.0,R,3.0,R,4.0,R,2.0,R,2.0,Z,0.0,Z,0.0,Z,0.0,2018
3,100654,6,10,2,2,10,R,757,R,372.0,R,385.0,R,0.0,R,0.0,R,0.0,R,0.0,R,0.0,R,0.0,R,732.0,R,364.0,R,368.0,R,4.0,R,1.0,R,3.0,R,1.0,R,0.0,R,1.0,R,8.0,R,3.0,R,5.0,R,4.0,R,1.0,R,3.0,R,8.0,R,3.0,R,5.0,R,0.0,R,0.0,R,0.0,2018
4,100654,7,11,2,2,45,R,1,R,1.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,R,1.0,R,1.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,Z,0.0,2018


### Mapping some of the categorical values

In [31]:
# Load the mapping file from the data dictionary provided by the data source
cat_map = pd.read_csv("../../Data/2024/Graduation-Rates-Dictionary/FrequenciesRV-Table 1.csv", encoding = 'latin1')

In [32]:
cat_map.head()

Unnamed: 0,varnumber,varname,codevalue,valuelabel,frequency,percent
0,80176,GRTYPE,40,Total exclusions 4-year schools,598,1.24
1,80176,GRTYPE,2,"4-year institutions, Adjusted cohort (revised ...",2288,4.75
2,80176,GRTYPE,3,"4-year institutions, Completers within 150% of...",2242,4.65
3,80176,GRTYPE,4,"4-year institutions, Transfer-out students",1446,3.0
4,80176,GRTYPE,41,"4-year institutions, noncompleters still enrolled",1597,3.31


In [33]:
# use FrequenciesRV
grad_mapped = grad.copy()

og_cols_to_map = ["GRTYPE", "CHRTSTAT", "SECTION", "COHORT", "LINE"] 

for col in og_cols_to_map:
    # Create subset of mapping dictionary FrequenciesRV and convert dtype to string
    map_subset = cat_map[cat_map["varname"] == col][["codevalue", "valuelabel"]]
    map_subset["codevalue"] = map_subset["codevalue"].astype(str)

    # Convert dtype of column at hand to string as well
    grad_mapped[col] = grad_mapped[col].astype(str)

    # Mapping process
    grad_mapping = dict(zip(map_subset["codevalue"], map_subset["valuelabel"]))
    grad_mapped[col] = grad_mapped[col].map(grad_mapping)

### Mapping column names to more user-friendly names

In [34]:
# Load the data dictionary
grad_dict = pd.read_csv("../../Data/2024/Graduation-Rates-Dictionary/Varlist-Table 1.csv", encoding = 'latin1')

In [35]:
# Create a dictionary with varName and varTitle from data dictionary
grad_col_names = dict(zip(grad_dict["varname"], grad_dict["varTitle"]))

# Map the column names 
grad_mapped.rename(columns = grad_col_names, inplace=True)

### Remove irrelevant columns

In [36]:
# get rid of "X..." cols because they are imputation cols and are irrelevant
gcols_to_remove = ["XGRTOTLT", "XGRTOTLM", "XGRTOTLW", "XGRAIANT", "XGRAIANM", "XGRAIANW", "XGRASIAT", "XGRASIAM", "XGRASIAW", "XGRBKAAT", "XGRBKAAM", "XGRBKAAW", "XGRHISPT", 
"XGRHISPM", "XGRHISPW", "XGRNHPIT", "XGRNHPIM", "XGRNHPIW", "XGRWHITT", "XGRWHITM", "XGRWHITW", "XGR2MORT", "XGR2MORM", "XGR2MORW", "XGRUNKNT", "XGRUNKNM", "XGRUNKNW", "XGRNRALT", 
"XGRNRALM", "XGRNRALW"]

grad_mapped = grad_mapped.drop(columns = gcols_to_remove)

In [76]:
grad_mapped.isnull().values.any()

np.True_

In [37]:
grad_mapped.head()

Unnamed: 0,Unique identification number of the institution,Cohort data,Graduation rate status in cohort,Section of survey form,Cohort,Original line number of survey form,Grand total,Total men,Total women,American Indian or Alaska Native total,American Indian or Alaska Native men,American Indian or Alaska Native women,Asian total,Asian men,Asian women,Black or African American total,Black or African American men,Black or African American women,Hispanic total,Hispanic men,Hispanic women,Native Hawaiian or Other Pacific Islander total,Native Hawaiian or Other Pacific Islander men,Native Hawaiian or Other Pacific Islander women,White total,White men,White women,Two or more races total,Two or more races men,Two or more races women,Race/ethnicity unknown total,Race/ethnicity unknown men,Race/ethnicity unknown women,U.S. Nonresident total,U.S. Nonresident men,GRNRALW,publication_yr
0,100654,"4-year institutions, Adjusted cohort (revised ...",Adjusted cohort (revised cohort minus exclusions),Bachelor's/ equiv + other degree/certif-seeki...,Bachelor's/ equiv + other degree/certif-seeki...,Generated record not on original survey form,756,371.0,385.0,0.0,0.0,0.0,0.0,0.0,0.0,731.0,363.0,368.0,4.0,1.0,3.0,1.0,0.0,1.0,8.0,3.0,5.0,4.0,1.0,3.0,8.0,3.0,5.0,0.0,0.0,0.0,2018
1,100654,"4-year institutions, Completers within 150% of...",Completers within 150% of normal time,Bachelor's/ equiv + other degree/certif-seeki...,Bachelor's/ equiv + other degree/certif-seeki...,Generated record not on original survey form,203,85.0,118.0,0.0,0.0,0.0,0.0,0.0,0.0,196.0,82.0,114.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,3.0,1.0,2.0,0.0,0.0,0.0,2018
2,100654,"4-year institutions, Transfer-out students",Transfer-out students,Bachelor's/ equiv + other degree/certif-seeki...,Bachelor's/ equiv + other degree/certif-seeki...,Generated record not on original survey form,318,154.0,164.0,0.0,0.0,0.0,0.0,0.0,0.0,302.0,149.0,153.0,3.0,1.0,2.0,1.0,0.0,1.0,5.0,2.0,3.0,3.0,0.0,3.0,4.0,2.0,2.0,0.0,0.0,0.0,2018
3,100654,Bachelor's or equiv subcohort (4-yr institution),Revised cohort,Bachelor's or equiv 2016 subcohort (4-yr insti...,Bachelor's or equiv 2016 subcohort (4-yr inst...,,757,372.0,385.0,0.0,0.0,0.0,0.0,0.0,0.0,732.0,364.0,368.0,4.0,1.0,3.0,1.0,0.0,1.0,8.0,3.0,5.0,4.0,1.0,3.0,8.0,3.0,5.0,0.0,0.0,0.0,2018
4,100654,Bachelor's or equiv subcohort (4-yr institutio...,Exclusions,Bachelor's or equiv 2016 subcohort (4-yr insti...,Bachelor's or equiv 2016 subcohort (4-yr inst...,,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018


### Export dataset

In [38]:
grad_mapped.to_csv("../Cleaned data/institutional/graduation-rates.csv", index=False)

## Section 4: Total Cost & Attendance

In [39]:
# Load datasets
cost_23 = pd.read_csv("../../Data/2024/Total-Cost-Attendance-22-23.csv", encoding = 'latin1')
cost_22 = pd.read_csv("../../Data/2024/Total-Cost-Attendance-23-24.csv", encoding = 'latin1')

In [50]:
print(f"Shape of 2022 Dataframe:{cost_22.shape}")
print(f"Shape of 2023 Dataframe:{cost_23.shape}")

Shape of 2022 Dataframe:(3419, 14)
Shape of 2023 Dataframe:(3465, 14)


In [40]:
cost_23.head()

Unnamed: 0,UNITID,TUFEYR0,TUFEYR1,TUFEYR2,TUFEYR3,CINDON,CINSON,COTSON,CINDOFF,CINSOFF,COTSOFF,CINDFAM,CINSFAM,COTSFAM
0,100654,10024,10024,10024,10024,24234,24234,32844,24234,24234,32844,15064,15064,23674
1,100663,8568,8568,8568,8832,28909,28909,41293,30369,30369,42753,16039,16039,28423
2,100706,11122,11338,11488,11878,29878,29878,42770,29878,29878,42770,18756,18756,31648
3,100724,11068,11068,11068,11068,22110,22110,30438,25118,25118,33446,17798,17798,26126
4,100751,10780,11620,11620,11940,32024,32024,52384,32024,32024,52384,18768,18768,39128


### Map column names to more user-friendly ones using data dictionary

In [43]:
# Load most recent data dictionaries
cost_dict_22 = pd.read_csv("../../Data/2024/Total-Cost-Attendance-Dict-22/varlist-Table 1.csv", encoding = 'latin1')
cost_dict_23 = pd.read_csv("../../Data/2024/Total-Cost-Attendance-Dict-23/varlist-Table 1.csv", encoding = 'latin1')

In [66]:
# Create a dictionary with varName and varTitle from data dictionary
cost_col_names_23 = dict(zip(cost_dict_23["varName"], cost_dict_23["varTitle"]))
cost_col_names_22 = dict(zip(cost_dict_22["varname"], cost_dict_22["varTitle"]))

# Map the column names 
cost_23.rename(columns = cost_col_names_23, inplace = True)
cost_22.rename(columns = cost_col_names_22, inplace = True)

cost_23.columns = cost_23.columns.str.replace('\n', '')
cost_22.columns = cost_22.columns.str.replace('\n', '')

# Rename COTSFAM col isnce that didn't get mapped in the above method
cost_23.rename(columns = {"COTSFAM ": "Total price for out-of-state students living off campus (with family)  2022-23"}, inplace = True)
cost_22.rename(columns = {"COTSFAM ": "Total price for out-of-state students living off campus (with family)  2022-23"}, inplace = True)

In [67]:
cost_23.head()

Unnamed: 0,Unique identification number of the institution,"Tuition and fees, 2020-21","Tuition and fees, 2021-22","Tuition and fees, 2022-23","Tuition and fees, 2023-24",Total price for in-district students living on campus 2023-24,Total price for in-state students living on campus 2023-24,Total price for out-of-state students living on campus 2023-24,Total price for in-district students living off campus (not with family) 2023-24,Total price for in-state students living off campus (not with family) 2023-24,Total price for out-of-state students living off campus (not with family) 2023-24,Total price for in-district students living off campus (with family) 2023-24,Total price for in-state students living off campus (with family) 2023-24,Total price for out-of-state students living off campus (with family) 2022-23
0,100654,10024,10024,10024,10024,24234,24234,32844,24234,24234,32844,15064,15064,23674
1,100663,8568,8568,8568,8832,28909,28909,41293,30369,30369,42753,16039,16039,28423
2,100706,11122,11338,11488,11878,29878,29878,42770,29878,29878,42770,18756,18756,31648
3,100724,11068,11068,11068,11068,22110,22110,30438,25118,25118,33446,17798,17798,26126
4,100751,10780,11620,11620,11940,32024,32024,52384,32024,32024,52384,18768,18768,39128


In [68]:
cost_22.head()

Unnamed: 0,Unique identification number of the institution,"Tuition and fees, 2019-20","Tuition and fees, 2020-21","Tuition and fees, 2021-22","Tuition and fees, 2022-23",Total price for in-district students living on campus 2022-23,Total price for in-state students living on campus 2022-23,Total price for out-of-state students living on campus 2022-23,Total price for in-district students living off campus (not with family) 2022-23,Total price for in-state students living off campus (not with family) 2022-23,Total price for out-of-state students living off campus (not with family) 2022-23,Total price for in-district students living off campus (with family) 2022-23,Total price for in-state students living off campus (with family) 2022-23,Total price for out-of-state students living off campus (with family) 2022-23
0,100654,10024,10024,10024,10024,27482,27482,36092,27482,27482,36092,16487,16487,25097
1,100663,8568,8568,8832,8832,29379,29379,42411,30619,30619,43651,16039,16039,29071
2,100706,11338,11488,11878,11770,29770,29770,42662,29770,29770,42662,18648,18648,31540
3,100724,11068,11068,11068,11248,23930,23930,32258,25298,25298,33626,17978,17978,26306
4,100751,11620,11620,11940,11900,33382,33382,54682,33862,33862,55162,19879,19879,41179


### Combine the datasets

In [69]:
# Subset of cost_22 dataset to have only columns needed to merge
cost_22_subset = cost_22[["Unique identification number of the institution", "Tuition and fees, 2019-20"]]

# Merging datasets
costs = pd.merge(cost_23, cost_22_subset, on= "Unique identification number of the institution")

# Reorder the columns so the newly added column is correctly organized within the dataset
col_list = costs.columns.tolist()
col_list.insert(1, col_list.pop(col_list.index("Tuition and fees, 2019-20")))
costs = costs[col_list]

In [70]:
costs.head()

Unnamed: 0,Unique identification number of the institution,"Tuition and fees, 2019-20","Tuition and fees, 2020-21","Tuition and fees, 2021-22","Tuition and fees, 2022-23","Tuition and fees, 2023-24",Total price for in-district students living on campus 2023-24,Total price for in-state students living on campus 2023-24,Total price for out-of-state students living on campus 2023-24,Total price for in-district students living off campus (not with family) 2023-24,Total price for in-state students living off campus (not with family) 2023-24,Total price for out-of-state students living off campus (not with family) 2023-24,Total price for in-district students living off campus (with family) 2023-24,Total price for in-state students living off campus (with family) 2023-24,Total price for out-of-state students living off campus (with family) 2022-23
0,100654,10024,10024,10024,10024,10024,24234,24234,32844,24234,24234,32844,15064,15064,23674
1,100663,8568,8568,8568,8568,8832,28909,28909,41293,30369,30369,42753,16039,16039,28423
2,100706,11338,11122,11338,11488,11878,29878,29878,42770,29878,29878,42770,18756,18756,31648
3,100724,11068,11068,11068,11068,11068,22110,22110,30438,25118,25118,33446,17798,17798,26126
4,100751,11620,10780,11620,11620,11940,32024,32024,52384,32024,32024,52384,18768,18768,39128


### Deduplication check

In [None]:
# Dedup logic
for i in costs["Unique identification number of the institution"].unique():
    i_count = (costs["Unique identification number of the institution"] == i).sum()
    if i_count > 2:
        print(f"There are duplicate institution IDs. Note the following UNIT IDs: {i}")

In [75]:
costs.isnull().values.any()

np.False_

### Export dataset

In [77]:
costs.to_csv("../Cleaned data/institutional/cost-of-attendance.csv", index=False)

## Section 8: Student Charges for Vocational Schools

## Section 6: Financial Aid & Net Price

## Section 4: Awards Demographics