<a href="https://colab.research.google.com/github/dea1013/DS650-Project/blob/main/diversity_index_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages and data

In [127]:
import pandas as pd
import numpy as np

In [128]:
# Input workbook; judging by its file name it holds the college rankings already
# merged with ScoreCard data.
in_path = "/content/drive/MyDrive/Spring 2023/Data Visualization/Project/college_rankings_merged_with_ScoreCard_data.xlsx"
# Load the worksheet named "Sheet1" into the working frame used by every later cell.
df = pd.read_excel(io=in_path, sheet_name="Sheet1")

# Base columns

In [129]:
# Columns that identify a row: ranking year and institution name.
key_cols = ['year', 'INSTNM']
# Rank and score columns for the three ranking sources (THE, RUR, CWUR),
# generated in the order: all world_rank_* first, then all score_*.
world_cols = [
    f'{metric}_{source}'
    for metric in ('world_rank', 'score')
    for source in ('THE', 'RUR', 'CWUR')
]

# Feature Engineering

## UGDS Score

In [130]:
# UGDS_* share columns, one per demographic group
# (presumably College Scorecard undergraduate enrollment shares - confirm against the data dictionary).
UGDS_cols = [
    f'UGDS_{group}'
    for group in ('WHITE', 'BLACK', 'HISP', 'ASIAN', 'AIAN', 'NHPI', '2MOR', 'NRA', 'UNKN')
]
# Row-wise variance across the share columns (pandas defaults: sample variance, NaNs skipped).
# A smaller variance means the shares are closer to each other.
df['UGDS_score'] = df.loc[:, UGDS_cols].var(axis='columns')

## Female Male Score

In [131]:
# Derive male/female percentages from the "<digits> : <digits>" ratio string in
# female_male_ratio_THE, then score gender balance as the row-wise variance of the
# two percentages (an exact 50/50 split gives 0).
female_male_cols = ['male_proportion', 'female_proportion']
# The number after the colon is stored as the male percentage; values that do not
# contain the pattern become missing.
# `\d+` (instead of exactly two digits) also accepts ratios such as "100 : 0" or
# "9 : 91", which would otherwise be dropped. expand=False returns a Series, so a
# single column is assigned rather than a one-column DataFrame.
df['male_proportion'] = (
    df['female_male_ratio_THE']
    .astype("string")
    .str.extract(r'\d+[ ]*:[ ]*(\d+)', expand=False)
)
df['male_proportion'] = pd.to_numeric(df['male_proportion'])
# The two percentages are assumed to sum to 100.
df['female_proportion'] = 100 - df['male_proportion']
df['female_male_score'] = df[female_male_cols].var(axis=1)

## Final Feature Columns

In [132]:
# Both scores are row-wise variances (always >= 0) where a LOWER value means a more
# even mix, so they are made negative: after min-max scaling, larger = more even.
# -abs() produces the same values as multiplying by -1 on the first run, but keeps
# the cell idempotent: re-running it does not flip the sign back to positive.
inverse_weight_cols = ['UGDS_score', 'female_male_score']
for col in inverse_weight_cols:
  df[col] = -df[col].abs()

In [133]:
# The three inputs that feed the diversity index.
feature_cols = ['international_students_THE', 'UGDS_score', 'female_male_score']

# Keep only the key, ranking, feature and supporting columns. Each engineered score
# is immediately followed by the columns it was computed from.
kept_cols = [
    *key_cols,
    *world_cols,
    'international_students_THE',
    'UGDS_score', *UGDS_cols,
    'female_male_score', *female_male_cols,
]
df = df[kept_cols]
# All rows hold unscaled values at this point; later cells append rows with other tags.
df['coltype'] = "raw"

# Create min-max scaled columns

In [134]:
# Min-max scale every feature to [0, 1], working on the 'raw' rows only.
# Selecting by coltype matches how the later stages pick their input rows and keeps
# the result correct if this cell is re-run after other row types were appended to df.
df_copy = df[df['coltype'] == 'raw'].copy()
feature_min = df_copy[feature_cols].min()
feature_range = df_copy[feature_cols].max() - feature_min
# NOTE: a feature with a single distinct value has range 0 and therefore scales to NaN.
df_copy[feature_cols] = (df_copy[feature_cols] - feature_min) / feature_range


In [135]:
# Tag the scaled copy, then stack it under the existing rows with a fresh 0..n-1 index.
df_copy = df_copy.assign(coltype='scaled')
df = pd.concat([df, df_copy], ignore_index=True)

# Create weight columns

In [136]:
# Weight each feature by its share of the summed variance of the scaled features,
# so the weights add up to 1. Every row of this copy receives the same weights.
df_copy = df[df['coltype'] == 'scaled'].copy()
feature_var = df_copy[feature_cols].var()
feature_weights = feature_var / feature_var.sum()
# Assign one column at a time using label lookups. This spells out the intended
# "one constant per column" broadcast instead of depending on how pandas unpacks
# a Series that is assigned to several columns at once.
for col in feature_cols:
  df_copy[col] = feature_weights[col]
# Alternative (not used): equal weights, i.e. 1/len(feature_cols) for every feature.

In [137]:
# Label these rows as weights, append them to the combined frame and renumber the index.
df_copy.loc[:, 'coltype'] = 'weight'
df = pd.concat(objs=[df, df_copy], axis=0, ignore_index=True)

# Create score columns

In [138]:
# Score = scaled value x feature weight, written onto a copy of the 'raw' rows so the
# non-feature columns are carried along unchanged.
df_copy = df[df['coltype'] == 'raw'].copy()
# Select the scaled and weight rows once rather than re-filtering df for each feature.
# Resetting both indexes pairs the rows by position.
scaled_rows = df[df['coltype'] == 'scaled'].reset_index(drop=True)
weight_rows = df[df['coltype'] == 'weight'].reset_index(drop=True)
for col in feature_cols:
  # Re-label the product with the target rows' own index so the assignment is
  # positional. Without this the values only line up when the raw rows are indexed
  # 0..n-1; any other index would silently yield NaN scores. A length mismatch
  # raises instead of being hidden.
  df_copy[col] = (scaled_rows[col] * weight_rows[col]).set_axis(df_copy.index)

In [139]:
# Mark the per-feature products as 'score' rows and add them to the combined frame,
# renumbering the index from 0.
df_copy = df_copy.assign(coltype='score')
df = pd.concat((df, df_copy), ignore_index=True)

# Create diversity columns

In [140]:
# Build the diversity index from the 'score' rows: keys plus the weighted features.
is_score_row = df['coltype'] == 'score'
df_copy = df.loc[is_score_row, key_cols + feature_cols].copy()
# Sum the weighted features per row (pandas skips NaNs here, so a missing feature
# contributes nothing), then min-max scale the total to [0, 1].
feature_total = df_copy[feature_cols].sum(axis=1)
total_min = feature_total.min()
total_max = feature_total.max()
df_copy['diversity_index'] = (feature_total - total_min) / (total_max - total_min)
# Rank 1 = highest index; ties share the average rank (pandas default).
# One ranking across all rows, one within each year.
df_copy['diversity_rank'] = df_copy['diversity_index'].rank(ascending=False)
df_copy['diversity_year_rank'] = (
    df_copy.groupby('year')['diversity_index'].rank(ascending=False)
)

In [141]:
# Reduce the helper frame to the keys and the three diversity columns, then attach
# those columns to every row (raw / scaled / weight / score) of the same year and
# institution via an inner join on the keys.
diversity_cols = ['diversity_index', 'diversity_rank', 'diversity_year_rank']
df_copy = df_copy.loc[:, key_cols + diversity_cols]
df = pd.merge(df, df_copy, how='inner', on=['year', 'INSTNM'])

In [142]:
# Keep diversity_index only on the 'score' rows; every other row type gets NaN.
# The two rank columns are left untouched on all rows.
# Series.where operates on this one column instead of masking the whole frame
# and then picking a single column out of the result.
df['diversity_index'] = df['diversity_index'].where(df['coltype'] == 'score')

In [143]:
# Order rows by year, then by the within-year diversity rank (both ascending).
df = df.sort_values(['year', 'diversity_year_rank'], ascending=True)

In [144]:
# Show the final frame as the cell output.
df

Unnamed: 0,year,INSTNM,world_rank_THE,world_rank_RUR,world_rank_CWUR,score_THE,score_RUR,score_CWUR,international_students_THE,UGDS_score,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,female_male_score,male_proportion,female_proportion,coltype,diversity_index,diversity_rank,diversity_year_rank
48,2012,Carnegie Mellon University,21,33.0,43.0,78.4,84.669,51.60,0.350000,-0.018775,...,0.0328,0.1212,0.0640,-242.000000,61.0,39.0,raw,,4.0,1.0
49,2012,Carnegie Mellon University,21,33.0,43.0,78.4,84.669,51.60,1.000000,0.916637,...,0.0328,0.1212,0.0640,0.806400,61.0,39.0,scaled,,4.0,1.0
50,2012,Carnegie Mellon University,21,33.0,43.0,78.4,84.669,51.60,0.353979,0.464275,...,0.0328,0.1212,0.0640,0.181746,61.0,39.0,weight,,4.0,1.0
51,2012,Carnegie Mellon University,21,33.0,43.0,78.4,84.669,51.60,0.353979,0.425572,...,0.0328,0.1212,0.0640,0.146560,61.0,39.0,score,0.980240,4.0,1.0
8,2012,Massachusetts Institute of Technology,7,4.0,2.0,92.3,95.174,91.67,0.330000,-0.014674,...,0.0335,0.0992,0.0403,-338.000000,63.0,37.0,raw,,5.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2359,2015,North Dakota State University-Main Campus,,481.0,792.0,,46.218,44.25,,0.078137,...,0.0200,0.0317,0.0151,,,,score,0.059744,674.0,207.0
2640,2015,Howard University,,,719.0,,,44.33,,-0.089351,...,0.0224,0.0419,0.0050,,,,raw,,682.0,208.0
2641,2015,Howard University,,,719.0,,,44.33,,0.069674,...,0.0224,0.0419,0.0050,,,,scaled,,682.0,208.0
2642,2015,Howard University,,,719.0,,,44.33,0.353979,0.464275,...,0.0224,0.0419,0.0050,0.181746,,,weight,,682.0,208.0


# Export

In [145]:
# Destination CSV for the finished table.
out_path = '/content/drive/MyDrive/Spring 2023/Data Visualization/Project/diversity_df_v2.csv'
# index=False keeps the DataFrame index out of the written file.
df.to_csv(out_path, index=False)