In [2]:
import requests
from tqdm import tqdm
import os
import zipfile
import pandas as pd

In [3]:
# US News Historical Data
# Download the historical US News rankings workbook into ./downloads/usnews.xlsx.
usnews = "https://andyreiter.com/wp-content/uploads/2022/09/US-News-Rankings-Universities-Through-2023.xlsx"
r = requests.get(usnews, allow_redirects=True)
# Fail loudly on an HTTP error instead of saving an error page as a workbook.
r.raise_for_status()
# open() does not create parent directories, so make sure the target exists.
os.makedirs('./downloads', exist_ok=True)
# The context manager guarantees the file is flushed and closed.
with open('./downloads/usnews.xlsx', 'wb') as usnews_file:
    usnews_file.write(r.content)

37656

#### Initial feature selection

**Features discussed in the Reed paper and their corresponding data sources (2021 table names):**
- Rank (US News historical data)
- Peer Assessment Score (Historical/Twitter/Page-Rank)
- School (HD2021)
- State (HD2021)
- Public/Private (HD2021)
- Average Freshman Retention Rate (EF2021D)
- Student/Faculty ratio (EF2021D)
- Actual Graduation Rate (GR2021)
- Graduation Rate among PELL recipients (GR2021_PELL_SSL)
- % of Full-time Faculty (EAP2021)
- Faculty salaries (SAL2021_IS)
- SAT/ACT 25th-75th percentile (ADM2021)
- Acceptance Rate (ADM2021)
- Expenditure per FTE student (F2021_F1A, F2021_F2)
- Endowment size per FTE student (F2021_F1A, F2021_F2)

<br>  

**IPEDS Tables to download in 2021:**
- HD2021
- EF2021D
- GR2021
- GR2021_PELL_SSL
- EAP2021
- SAL2021_IS
- ADM2021
- F2021_F1A
- F2021_F2

In [4]:
start_year = 2011
end_year = 2021


def ipeds_file_names(year):
    """Return the IPEDS zip file names requested for one survey year.

    The order of the returned list is significant: later cells look the
    tables up by position (HD, EF, GR, GR_PELL, EAP, SAL, ADM, F1A, F2).

    Args:
        year: Four-digit survey year, e.g. 2021.

    Returns:
        A list of nine zip file names for that year.
    """
    # Finance tables are tagged with the two-digit previous year followed by
    # the two-digit current year (2021 -> "2021", 2011 -> "1011"). Zero-padding
    # keeps the tag four characters wide for years before 2011 (2009 -> "0809").
    fiscal_tag = f"{(year - 1) % 100:02d}{year % 100:02d}"
    return [
        f"HD{year}.zip",
        f"EF{year}D.zip",
        f"GR{year}.zip",
        f"GR{year}_PELL_SSL.zip",
        f"EAP{year}.zip",
        f"SAL{year}_IS.zip",
        f"ADM{year}.zip",
        f"F{fiscal_tag}_F1A.zip",
        f"F{fiscal_tag}_F2.zip",
    ]


# Maps each survey year in [start_year, end_year] to its list of zip file names.
IPEDS_files = {}
for current_year in range(start_year, end_year + 1):
    current_files = ipeds_file_names(current_year)
    IPEDS_files[current_year] = current_files

#### IPEDS File Downloading

In [312]:
# Download every requested IPEDS archive into ./downloads/<year>/, skipping
# files that are already on disk so the cell can be re-run cheaply.
url_prefix = "https://nces.ed.gov/ipeds/datacenter/data/"
for current_year in tqdm(range(start_year, end_year + 1), desc="Years"):
    year_dir = f"./downloads/{current_year}/"
    os.makedirs(year_dir, exist_ok=True)
    for file_name in tqdm(IPEDS_files[current_year], desc=f"{current_year} files"):
        file_path = year_dir + file_name
        if os.path.exists(file_path):
            continue
        r = requests.get(url_prefix + file_name, allow_redirects=True)
        # A 404 is tolerated: the file is simply not downloaded for that year.
        if r.status_code == 404:
            continue
        # Any other HTTP error must not be saved as a ".zip": the bogus file
        # would block re-download (it exists) and break the unzip step.
        r.raise_for_status()
        with open(file_path, 'wb') as zip_out:
            zip_out.write(r.content)

2011 files: 100%|██████████| 9/9 [01:39<00:00, 11.03s/it]
2012 files: 100%|██████████| 9/9 [01:38<00:00, 10.97s/it]
2013 files: 100%|██████████| 9/9 [00:38<00:00,  4.23s/it]
2014 files: 100%|██████████| 9/9 [00:45<00:00,  5.11s/it]
2015 files: 100%|██████████| 9/9 [00:51<00:00,  5.71s/it]
2016 files: 100%|██████████| 9/9 [00:00<00:00, 3198.23it/s]
2017 files: 100%|██████████| 9/9 [00:00<00:00, 3851.13it/s]
2018 files: 100%|██████████| 9/9 [00:00<00:00, 3533.53it/s]
2019 files: 100%|██████████| 9/9 [00:00<00:00, 7915.44it/s]
2020 files: 100%|██████████| 9/9 [00:00<00:00, 5139.38it/s]
2021 files: 100%|██████████| 9/9 [00:00<00:00, 7002.18it/s]
Years: 100%|██████████| 11/11 [05:33<00:00, 30.31s/it]


#### IPEDS File Unzipping

In [313]:
# Extract each downloaded archive into its year directory. An archive is
# skipped when its lower-cased "<name>.csv" is already present, or when the
# zip itself was never downloaded.
for current_year in tqdm(range(start_year, end_year + 1), desc="Years"):
    year_dir = f"./downloads/{current_year}/"
    for zip_file_name in tqdm(IPEDS_files[current_year], desc=f"{current_year} files"):
        csv_file_name = zip_file_name.split(".")[0].lower() + ".csv"
        already_extracted = os.path.exists(year_dir + csv_file_name)
        if already_extracted:
            continue
        archive_path = year_dir + zip_file_name
        if not os.path.exists(archive_path):
            continue
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(year_dir)

2011 files: 100%|██████████| 9/9 [00:00<00:00, 50.15it/s]
2012 files: 100%|██████████| 9/9 [00:00<00:00, 48.58it/s]
2013 files: 100%|██████████| 9/9 [00:00<00:00, 39.83it/s]
2014 files: 100%|██████████| 9/9 [00:00<00:00, 39.01it/s]
2015 files: 100%|██████████| 9/9 [00:00<00:00, 41.05it/s]
2016 files: 100%|██████████| 9/9 [00:00<00:00, 3751.61it/s]
2017 files: 100%|██████████| 9/9 [00:00<00:00, 5836.23it/s]
2018 files: 100%|██████████| 9/9 [00:00<00:00, 4021.81it/s]
2019 files: 100%|██████████| 9/9 [00:00<00:00, 5905.62it/s]
2020 files: 100%|██████████| 9/9 [00:00<00:00, 11349.59it/s]
2021 files: 100%|██████████| 9/9 [00:00<00:00, 7252.40it/s]
Years: 100%|██████████| 11/11 [00:01<00:00, 10.21it/s]


#### IPEDS Feature selection

In [5]:
def latest_csv(file_name_no_ext, csv_set):
    """Choose which CSV to read for a table, preferring the "_rv" variant.

    Args:
        file_name_no_ext: Table file name without extension, e.g. "hd2021".
        csv_set: Collection of CSV file names available in the year directory.

    Returns:
        None when "<name>.csv" is not in csv_set (even if "<name>_rv.csv" is);
        otherwise "<name>_rv.csv" when it is available, else "<name>.csv".
    """
    original_name = file_name_no_ext + ".csv"
    if original_name not in csv_set:
        return None
    revised_name = file_name_no_ext + "_rv.csv"
    return revised_name if revised_name in csv_set else original_name

In [6]:
def get_csv_mapping(year_dir, zip_list):
    """Map short table keys to the CSV path to read for one year.

    Args:
        year_dir: Directory holding the extracted CSVs; it is prepended to the
            file name as-is, so it is expected to end with a path separator.
        zip_list: Zip file names for the year, in the positional order
            HD, EF, GR, GR_PELL, EAP, SAL, ADM, F1A, F2.

    Returns:
        Dict from table key to CSV path, or None for a table whose CSV is not
        present in year_dir.
    """
    extracted_csv_files = {entry for entry in os.listdir(year_dir) if entry.endswith(".csv")}

    csv_path_list = []
    for zip_name in zip_list:
        table_name = zip_name.split(".")[0].lower()
        chosen_csv = latest_csv(table_name, extracted_csv_files)
        csv_path_list.append(year_dir + chosen_csv if chosen_csv else None)

    # Keys are bound to positions in zip_list; indexing (rather than zip())
    # keeps the IndexError when fewer than nine entries are supplied.
    table_keys = ("HD", "EF", "GR", "GR_PELL", "EAP", "SAL", "ADM", "F1A", "F2")
    return {key: csv_path_list[position] for position, key in enumerate(table_keys)}

In [20]:
# Build one feature table per survey year: start from the US News rank column
# for that year, join the IPEDS tables on UNITID, and write the result to
# ./by_year/<year>.csv.
usnews = pd.read_excel("./downloads/usnews.xlsx", header=1).rename(columns={"IPEDS ID": "UNITID"})

# Finance columns that are divided by FTCT once everything is merged.
# NOTE(review): FTCT is EF's RRFTCT column; confirm it is the intended
# "per FTE student" denominator.
per_student_columns = [
    "EINSTRUCTIONAL",
    "ERESEARCH",
    "EPUBLIC",
    "EACADEMIC",
    "ESTUDENT",
    "EINSTITUTIONAL",
    "ENDOWMENT",
]

# DataFrame.to_csv does not create missing directories.
os.makedirs("./by_year", exist_ok=True)

for current_year in tqdm(range(start_year, end_year + 1), desc="Years"):
    year_dir = f"./downloads/{current_year}/"
    csv_mapping = get_csv_mapping(year_dir, IPEDS_files[current_year])

    # pd.merge defaults to an inner join, so every merge below keeps only the
    # institutions that appear in both frames.
    year_df = usnews[["UNITID", current_year]].rename(columns={current_year: "USNEWSRANK"})

    # HD, EF, GR, F1A and F2 are read without a None guard, unlike the
    # optional tables further down.
    HD = pd.read_csv(csv_mapping["HD"], encoding='latin-1')
    HD = HD[["UNITID", "INSTNM", "IALIAS", "COUNTYNM", "CITY", "STABBR", "ZIP", "WEBADDR", "CONTROL"]]
    year_df = pd.merge(year_df, HD, on='UNITID')

    # Column headers are stripped because some files pad them with whitespace.
    EF = pd.read_csv(csv_mapping["EF"], encoding='latin-1').rename(columns=lambda x: x.strip())
    EF = EF[["UNITID", "RET_PCF", "STUFACR", "RRFTCT"]].rename(columns={"RET_PCF": "RETENTION", "RRFTCT": "FTCT"})
    year_df = pd.merge(year_df, EF, on='UNITID')

    # Graduation rate = GRADS / COHORT, each taken from a different row type
    # of the GR table (selected by the CHRTSTAT / GRTYPE codes).
    GR = pd.read_csv(csv_mapping["GR"], encoding='latin-1')
    totals = GR[(GR["CHRTSTAT"] == 12) & (GR["GRTYPE"] == 2)][["UNITID", "GRTOTLT"]].rename(columns={"GRTOTLT": "COHORT"})
    grads = GR[(GR["CHRTSTAT"] == 13) & (GR["GRTYPE"] == 3)][["UNITID", "GRTOTLT"]].rename(columns={"GRTOTLT": "GRADS"})
    GR = pd.merge(totals, grads, on='UNITID')
    GR["GRRATE"] = GR["GRADS"] / GR["COHORT"]
    year_df = pd.merge(year_df, GR, on='UNITID')

    # Optional table: skipped for years where its CSV is not on disk.
    if csv_mapping["GR_PELL"] is not None:
        GR_PELL = pd.read_csv(csv_mapping["GR_PELL"], encoding='latin-1')
        GR_PELL = GR_PELL[GR_PELL["PSGRTYPE"] == 2][["UNITID", "PGADJCT", "PGCMBAC"]].rename(columns={"PGADJCT": "PELLCOHORT", "PGCMBAC": "PELLGRADS",})
        GR_PELL["PELLGRRATE"] = GR_PELL["PELLGRADS"] / GR_PELL["PELLCOHORT"]
        year_df = pd.merge(year_df, GR_PELL, on='UNITID')

    # Optional table with two file layouts, told apart by a FACSTAT column.
    if csv_mapping["EAP"] is not None:
        EAP = pd.read_csv(csv_mapping["EAP"], encoding='latin-1').rename(columns=lambda x: x.strip())
        if "FACSTAT" in EAP.columns:
            # Full-time and total counts sit side by side in one row.
            EAP = EAP[(EAP["FACSTAT"] == 10) & (EAP["OCCUPCAT"] == 100)][["UNITID", "EAPFT", "EAPTOT"]]
            EAP = EAP.rename(columns={"EAPFT": "FACFT", "EAPTOT": "FACTOT"})
        else:
            # Full-time and overall counts are separate rows keyed by FTPT;
            # pivot them into FACFT / FACTOT columns with a self-merge.
            EAP = EAP[(EAP["FSTAT"] == 1) & (EAP["FUNCTCD"] == 10)][["UNITID", "FTPT", "EAPTOT"]]
            FULLTIME = EAP[EAP["FTPT"] == 2].rename(columns={"EAPTOT": "FACFT"}).drop(columns="FTPT")
            OVERALL = EAP[EAP["FTPT"] == 1].rename(columns={"EAPTOT": "FACTOT"}).drop(columns="FTPT")
            EAP = pd.merge(FULLTIME, OVERALL, on='UNITID')
        EAP["FTPCT"] = EAP["FACFT"] / EAP["FACTOT"]
        year_df = pd.merge(year_df, EAP, on='UNITID')

    if csv_mapping["SAL"] is not None:
        SAL = pd.read_csv(csv_mapping["SAL"], encoding='latin-1')
        # .copy() so the column assignment below writes to an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        SAL = SAL[SAL["ARANK"] == 7].copy()
        if "SAEQ9AT" in SAL.columns:
            SAL = SAL[["UNITID", "SAEQ9AT"]].rename(columns={"SAEQ9AT":"AVGSAL"})
        else:
            # Presumably a monthly average scaled to nine months to match
            # SAEQ9AT; confirm against the IPEDS data dictionary.
            SAL["AVGSAL"] = SAL["SAAVMNT"] * 9
            SAL = SAL[["UNITID", "AVGSAL"]]
        year_df = pd.merge(year_df, SAL, on='UNITID')

    if csv_mapping["ADM"] is not None:
        ADM = pd.read_csv(csv_mapping["ADM"], encoding='latin-1')
        # .copy() for the same reason as SAL: ACPTRT is added right after.
        ADM = ADM[["UNITID", "SATVR25", "SATVR75", "SATMT25", "SATMT75", "ACTCM25", "ACTCM75", "ADMSSN", "APPLCN"]].copy()
        ADM["ACPTRT"] = ADM["ADMSSN"] / ADM["APPLCN"]
        year_df = pd.merge(year_df, ADM, on='UNITID')

    # The two finance tables use different column codes; both are renamed to
    # one schema and stacked so a single merge covers both.
    F1A = pd.read_csv(csv_mapping["F1A"], encoding='latin-1').rename(columns=lambda x: x.strip())
    F1A = F1A[["UNITID", "F1C011", "F1C021", "F1C031", "F1C051", "F1C061", "F1C071", "F1H02"]].rename(columns={
        "F1C011": "EINSTRUCTIONAL",
        "F1C021": "ERESEARCH",
        "F1C031": "EPUBLIC",
        "F1C051": "EACADEMIC",
        "F1C061": "ESTUDENT",
        "F1C071": "EINSTITUTIONAL",
        "F1H02": "ENDOWMENT"
    })
    F2 = pd.read_csv(csv_mapping["F2"], encoding='latin-1').rename(columns=lambda x: x.strip())
    F2 = F2[["UNITID", "F2E011", "F2E021", "F2E031", "F2E041", "F2E051", "F2E061", "F2H02"]].rename(columns={
        "F2E011": "EINSTRUCTIONAL",
        "F2E021": "ERESEARCH",
        "F2E031": "EPUBLIC",
        "F2E041": "EACADEMIC",
        "F2E051": "ESTUDENT",
        "F2E061": "EINSTITUTIONAL",
        "F2H02": "ENDOWMENT"
    })
    FINANCE = pd.concat([F1A, F2], ignore_index=True, axis=0)
    year_df = pd.merge(year_df, FINANCE, on='UNITID')

    # Convert the raw finance totals to per-student figures in one row-wise
    # division (axis=0 aligns FTCT with the rows).
    year_df[per_student_columns] = year_df[per_student_columns].div(year_df["FTCT"], axis=0)

    output_path = f"./by_year/{current_year}.csv"
    year_df.to_csv(output_path, index=False)

Years: 100%|██████████| 11/11 [00:10<00:00,  1.05it/s]


In [22]:
year_df

Unnamed: 0,UNITID,USNEWSRANK,INSTNM,IALIAS,COUNTYNM,CITY,STABBR,ZIP,WEBADDR,CONTROL,...,ADMSSN,APPLCN,ACPTRT,EINSTRUCTIONAL,ERESEARCH,EPUBLIC,EACADEMIC,ESTUDENT,EINSTITUTIONAL,ENDOWMENT
0,186131,1.0,Princeton University,,Mercer County,Princeton,NJ,08544-0070,www.princeton.edu/,2,...,1647.0,37601,0.043802,4.875785e+05,3.059520e+05,8714.659686,1.834974e+05,109024.432810,245560.209424,3.230929e+07
1,166683,4.0,Massachusetts Institute of Technology,"MIT, M.I.T.",Middlesex County,Cambridge,MA,02139-4307,web.mit.edu/,2,...,1365.0,33240,0.041065,8.403327e+05,1.550865e+06,961.682243,3.678280e+05,78700.000000,617193.457944,2.560191e+07
2,166027,2.0,Harvard University,,Middlesex County,Cambridge,MA,02138,www.harvard.edu/,2,...,2318.0,57786,0.040114,8.295239e+05,8.320778e+05,0.000000,6.845689e+05,122858.672377,597399.714490,3.794843e+07
3,130794,4.0,Yale University,,New Haven County,New Haven,CT,06520,https://www.yale.edu/,2,...,2509.0,47240,0.053112,7.859823e+05,5.333346e+05,73626.544304,1.346352e+06,305316.738133,250423.164557,3.345162e+07
4,243744,6.0,Stanford University,,Santa Clara County,Stanford,CA,94305,www.stanford.edu/,2,...,2190.0,55471,0.039480,1.267870e+06,5.893200e+05,0.000000,1.218337e+05,154247.820672,422262.141968,2.352938e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,201885,,University of Cincinnati-Main Campus,,Hamilton County,Cincinnati,OH,45221-0063,www.uc.edu/,1,...,21865.0,25775,0.848303,7.784164e+04,4.087394e+04,17637.095376,2.462772e+04,10723.743858,32570.532792,3.501580e+05
162,176017,,University of Mississippi,Ole Miss,Lafayette County,University,MS,38677-1848,www.olemiss.edu/,1,...,17512.0,19531,0.896626,1.147974e+05,3.699894e+04,10099.907389,2.054947e+04,8564.382146,55784.071214,2.873557e+05
163,204857,,Ohio University-Main Campus,,Athens County,Athens,OH,45701-2979,https://www.ohio.edu/,1,...,19245.0,21733,0.885520,8.488791e+04,1.470852e+04,21517.585350,2.301068e+04,15501.175218,18410.507906,2.410312e+05
164,106397,,University of Arkansas,University of Arkansas|Arkansas,Washington County,Fayetteville,AR,72701,https://www.uark.edu/,1,...,17743.0,21462,0.826717,5.938287e+04,3.600236e+04,19551.583778,1.285435e+04,8612.124653,11483.937673,3.590190e+05


In [25]:
# Reed College
year_df[year_df["UNITID"] == 209922]

Unnamed: 0,UNITID,USNEWSRANK,INSTNM,IALIAS,COUNTYNM,CITY,STABBR,ZIP,WEBADDR,CONTROL,...,ADMSSN,APPLCN,ACPTRT,EINSTRUCTIONAL,ERESEARCH,EPUBLIC,EACADEMIC,ESTUDENT,EINSTITUTIONAL,ENDOWMENT


In [24]:
# Rice
year_df[year_df["UNITID"] == 227757]

Unnamed: 0,UNITID,USNEWSRANK,INSTNM,IALIAS,COUNTYNM,CITY,STABBR,ZIP,WEBADDR,CONTROL,...,ADMSSN,APPLCN,ACPTRT,EINSTRUCTIONAL,ERESEARCH,EPUBLIC,EACADEMIC,ESTUDENT,EINSTITUTIONAL,ENDOWMENT
15,227757,16.0,Rice University,,Harris County,Houston,TX,77005-1827,www.rice.edu/,2,...,2802.0,29544,0.094842,385114.559919,166601.878147,0.0,45049.007049,86138.364552,37134.339376,8137253.0
