<a href="https://colab.research.google.com/github/dea1013/DS650-Project/blob/main/diversity_index_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and data

In [194]:
import pandas as pd
import numpy as np

In [195]:
# Input workbook with the merged rankings / Scorecard columns used below.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook -- confirm.
in_path = "/content/drive/MyDrive/Spring 2023/Data Visualization/Project/college_rankings_merged_with_ScoreCard_data.xlsx"
df = pd.read_excel(io=in_path, sheet_name="Sheet1")

# Base columns

In [196]:
# Columns that identify a record in the selection and merge steps below.
key_cols = [
    'year',
    'INSTNM',
]

# World-ranking positions and scores carried through to the output unchanged.
world_cols = [
    'world_rank_THE',
    'world_rank_RUR',
    'world_rank_CWUR',
    'score_THE',
    'score_RUR',
    'score_CWUR',
]

# Feature Engineering

## UGDS Race Score

In [197]:
# UGDS_* race/ethnicity columns that feed the race score.
UGDS_race_cols = [
    'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP',
    'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
    'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN',
]

# Row-wise variance across the nine columns: a low variance means the values
# are spread evenly across categories.
df['UGDS_race_score'] = df.loc[:, UGDS_race_cols].var(axis='columns')

# Keep the derived score together with its inputs for the later column selection.
UGDS_race_cols += ['UGDS_race_score']

## UGDS Gender Score

In [198]:
# UGDS_* gender columns that feed the gender score.
UGDS_gender_cols = [
    'UGDS_MEN',
    'UGDS_WOMEN',
]

# Row-wise variance across the two columns: a low variance means the two
# values are close to each other.
df['UGDS_gender_score'] = df[UGDS_gender_cols].var(axis=1)

# Track the derived score with the *gender* columns. Appending it to
# UGDS_race_cols would grow that list every time this cell is re-run and lead
# to duplicate 'UGDS_gender_score' columns in the later column selection.
UGDS_gender_cols.append('UGDS_gender_score')

## First Gen Score

In [199]:
# Derive the complement column first so that both inputs exist before scoring.
df['PAR_ED_PCT_NOT_1STGEN'] = 1 - df['PAR_ED_PCT_1STGEN']

first_gen_cols = ['PAR_ED_PCT_1STGEN', 'PAR_ED_PCT_NOT_1STGEN']

# Row-wise variance of the value and its complement.
df['first_gen_score'] = df.loc[:, first_gen_cols].var(axis='columns')

# Keep the derived score together with its inputs for the later column selection.
first_gen_cols += ['first_gen_score']

## Income Score

In [200]:
# INC_PCT_* income-bracket columns that feed the income score.
income_cols = ['INC_PCT_LO', 'INC_PCT_M1', 'INC_PCT_M2', 'INC_PCT_H1', 'INC_PCT_H2']

# Row-wise variance across the five bracket columns.
df['income_score'] = df.loc[:, income_cols].var(axis='columns')

# Keep the derived score together with its inputs for the later column selection.
income_cols += ['income_score']

## Female Male Score (Unused)

In [201]:
# Disabled: would take the two-digit number after the colon in
# 'female_male_ratio_THE', store it as male_proportion, derive
# female_proportion as 100 minus that, and score the pair by row-wise variance.
# female_male_cols = ['male_proportion', 'female_proportion']
# df['male_proportion'] = df['female_male_ratio_THE'].astype("string").str.extract(r'\d\d[ ]*:[ ]*(\d\d)')
# df['male_proportion'] = pd.to_numeric(df['male_proportion'])
# df['female_proportion'] = 100 - df['male_proportion']
# df['female_male_score'] = df[female_male_cols].var(axis=1)

## Final Feature Columns

In [202]:
# Scores whose sign is flipped so that a larger value means a more even
# distribution (each score is a row-wise variance, where lower = more even).
inverse_weight_cols = ['UGDS_race_score',
                       'UGDS_gender_score',
                       'first_gen_score',
                       'income_score']

# A variance is never negative, so -abs(x) equals x * -1 on the first run.
# Unlike an in-place `*= -1`, it gives the same result when the cell is
# re-run instead of flipping the scores back to positive. NaN stays NaN.
df[inverse_weight_cols] = -df[inverse_weight_cols].abs()

In [203]:
# The four derived scores that are scaled, weighted and summed further down.
feature_cols = ['UGDS_race_score', 'UGDS_gender_score', 'first_gen_score', 'income_score']

# Keep only the key, ranking, input and derived-score columns.
# dict.fromkeys drops repeated names while preserving order, so a column
# list that was extended more than once cannot produce duplicate columns.
selected_cols = list(dict.fromkeys(
    key_cols
    + world_cols
    + UGDS_race_cols
    + UGDS_gender_cols
    + first_gen_cols
    + income_cols
))

# Take an explicit copy: adding a column to the result of df[list] otherwise
# raises SettingWithCopyWarning.
df = df[selected_cols].copy()

# Tag these rows as the unscaled values; later stages append further row
# sets tagged 'scaled', 'weight' and 'score'.
df['coltype'] = "raw"

# Create min-max scaled columns

In [204]:
# Min-max scale every feature column on a copy of the frame.
df_copy = df.copy()
feature_min = df_copy[feature_cols].min()
feature_range = df_copy[feature_cols].max() - feature_min
df_copy[feature_cols] = (df_copy[feature_cols] - feature_min) / feature_range


In [205]:
# Tag the scaled rows and stack them underneath the existing rows with a
# fresh 0..n-1 index.
df_copy['coltype'] = 'scaled'
df = pd.concat([df, df_copy], ignore_index=True)

# Create weight columns

In [206]:
# Weight of a feature = its share of the total variance of the scaled features.
df_copy = df[df['coltype'] == 'scaled'].copy()
scaled_variance = df_copy[feature_cols].var()
feature_weights = scaled_variance / scaled_variance.sum()

# Every row of a feature column holds that feature's (constant) weight.
for col in feature_cols:
    df_copy[col] = feature_weights[col]
# Equal-weight alternative kept for reference:
# df[weight_cols] = 1/len(weight_cols)

In [207]:
# Tag the weight rows and stack them underneath the existing rows with a
# fresh 0..n-1 index.
df_copy['coltype'] = 'weight'
df = pd.concat([df, df_copy], ignore_index=True)

# Create score columns

In [208]:
# Start from the raw rows so all non-feature columns are carried over as-is.
df_copy = df[df['coltype'] == 'raw'].copy()

# Score = scaled value * weight, multiplied row-by-row in stacking order.
# The 'scaled' and 'weight' row sets are copies of the raw rows in the same
# order, so the i-th row of each refers to the same record. Plain arrays make
# this positional pairing explicit rather than relying on index labels
# happening to line up, and do the filtering once instead of once per column.
scaled_values = df.loc[df['coltype'] == 'scaled', feature_cols].to_numpy()
weight_values = df.loc[df['coltype'] == 'weight', feature_cols].to_numpy()
df_copy[feature_cols] = scaled_values * weight_values

In [209]:
# Tag the score rows and stack them underneath the existing rows with a
# fresh 0..n-1 index.
df_copy['coltype'] = 'score'
df = pd.concat([df, df_copy], ignore_index=True)

# Create diversity columns

In [210]:
# Work on the weighted score rows only, keeping keys and feature scores.
is_score_row = df['coltype'] == 'score'
df_copy = df.loc[is_score_row, key_cols + feature_cols].copy()

# Sum the weighted scores per row, then min-max scale the sum.
# NOTE(review): sum() skips NaN by default, so a row with missing feature
# scores still gets a (partial or zero) total -- confirm this is intended.
weighted_total = df_copy[feature_cols].sum(axis=1)
total_min = weighted_total.min()
total_max = weighted_total.max()
df_copy['diversity_index'] = (weighted_total - total_min) / (total_max - total_min)

# Rank 1 = highest index, computed over all rows and within each year.
df_copy['diversity_rank'] = df_copy['diversity_index'].rank(ascending=False)
df_copy['diversity_year_rank'] = (
    df_copy.groupby('year')['diversity_index'].rank(ascending=False)
)

In [211]:
# Keep just the keys plus the index/rank columns, then attach them to every
# row of the stacked frame that shares the same key.
diversity_cols = ['diversity_index', 'diversity_rank', 'diversity_year_rank']
df_copy = df_copy.loc[:, key_cols + diversity_cols]
df = df.merge(df_copy, on=key_cols)

In [212]:
# The merge copied diversity_index onto every row; keep it only on the
# 'score' rows and blank it (NaN) elsewhere. Series.where works on this one
# column directly instead of masking the entire frame to extract one column.
df['diversity_index'] = df['diversity_index'].where(df['coltype'] == 'score')

In [213]:
# Order by year, then by within-year diversity rank (best rank first).
sort_keys = ['year', 'diversity_year_rank']
df = df.sort_values(by=sort_keys)

In [214]:
# Display the final stacked frame: rows tagged by coltype
# (raw, scaled, weight, score) with the diversity columns attached.
df

Unnamed: 0,year,INSTNM,world_rank_THE,world_rank_RUR,world_rank_CWUR,score_THE,score_RUR,score_CWUR,UGDS_WHITE,UGDS_BLACK,...,INC_PCT_LO,INC_PCT_M1,INC_PCT_M2,INC_PCT_H1,INC_PCT_H2,income_score,coltype,diversity_index,diversity_rank,diversity_year_rank
512,2012,California State University-Long Beach,,630.0,,,12.196,,0.2527,0.0406,...,0.478711,0.149444,0.142386,0.107710,0.121749,-0.024549,raw,,8.0,1.0
513,2012,California State University-Long Beach,,630.0,,,12.196,,0.2527,0.0406,...,0.478711,0.149444,0.142386,0.107710,0.121749,0.687393,scaled,,8.0,1.0
514,2012,California State University-Long Beach,,630.0,,,12.196,,0.2527,0.0406,...,0.478711,0.149444,0.142386,0.107710,0.121749,0.149704,weight,,8.0,1.0
515,2012,California State University-Long Beach,,630.0,,,12.196,,0.2527,0.0406,...,0.478711,0.149444,0.142386,0.107710,0.121749,0.102906,score,0.994246,8.0,1.0
232,2012,University of California-Riverside,143,234.0,,47.5,62.353,,0.1571,0.0646,...,0.402910,0.181626,0.182096,0.113340,0.120028,-0.013934,raw,,9.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2159,2015,Yeshiva University,186,81.0,171.0,46.7,80.001,48.11,0.9364,0.0207,...,0.286676,0.110363,0.123822,0.145357,0.333782,0.131875,score,0.001836,682.0,207.0
2504,2015,Brigham Young University,,,403.0,,,45.15,0.8320,0.0050,...,0.619696,0.075460,0.093375,0.096777,0.114692,-0.055239,raw,,683.0,208.0
2505,2015,Brigham Young University,,,403.0,,,45.15,0.8320,0.0050,...,0.619696,0.075460,0.093375,0.096777,0.114692,0.262896,scaled,,683.0,208.0
2506,2015,Brigham Young University,,,403.0,,,45.15,0.8320,0.0050,...,0.619696,0.075460,0.093375,0.096777,0.114692,0.149704,weight,,683.0,208.0


# Export

In [215]:
# Write the final frame to CSV without the DataFrame index column.
out_path = '/content/drive/MyDrive/Spring 2023/Data Visualization/Project/diversity_df_final.csv'
df.to_csv(out_path, index=False)