# Analysis of Disparate Impact based on gender and number of publications

In [26]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from metrics import disparate_impact
import matplotlib.pyplot as plt
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset preparation

In [30]:
def aggregate_cols(df, param, col_name, year):
  """Add `col_name` to `df`: the sum of every per-year `param` column up to `year`.

  A column takes part when its name contains `param` and the second
  '_'-separated token of its name parses as an integer <= `year`
  (e.g. 'papers_2017' for param='papers', year=2018).

  Args:
    df: DataFrame holding the per-year columns. It is modified in place.
    param: substring identifying the columns to aggregate (case-sensitive).
    col_name: name of the column that receives the total; overwritten if present.
    year: last year (inclusive) to include in the total.

  Returns:
    The same `df` object, with `col_name` set. When no column qualifies the
    new column is the constant 0.

  Notes:
    Columns are combined with `+`, so a missing value in any included column
    makes that row's total missing as well.
    Matching columns without a numeric year token (e.g. 'papers' or
    'papers_total') and the output column itself are skipped rather than
    raising IndexError/ValueError.
  """
  total = 0
  for c in df.columns:
    # Ignore non-string labels, unrelated columns, and our own output column
    # (relevant when the cell is re-run and col_name happens to contain param).
    if not isinstance(c, str) or c == col_name or param not in c:
      continue
    tokens = c.split('_')
    if len(tokens) < 2:
      continue
    try:
      column_year = int(tokens[1])
    except ValueError:
      # The name matches `param` but carries no year: nothing to aggregate.
      continue
    if column_year <= year:
      total = total + df[c]
  # Single assignment instead of rewriting the frame column on every iteration.
  df[col_name] = total
  return df

In [27]:
folder = 'data'

# Read both CSV files from `folder`, using their first column as the index.
ita_inf, se = (
  pd.read_csv(os.path.join(folder, csv_name), index_col=0)
  for csv_name in ('ita_informatics.csv', 'ita_software_eng.csv')
)

In [28]:
# Preview the informatics table: a wide layout with `Genere` plus one column
# per year for each measure (e.g. `fascia_id_<year>`, `citations_<year>`).
ita_inf.head()

Unnamed: 0,Genere,fascia_id_2012,fascia_id_2013,fascia_id_2014,fascia_id_2015,fascia_id_2016,fascia_id_2017,fascia_id_2018,fascia_id_2019,fascia_id_2020,...,citations_1947,citations_1950,citations_1951,citations_2024,citations_1948,citations_1937,citations_1902,citations_1904,citations_1908,citations_1939
0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Preview the software-engineering table; it has the same column layout as `ita_inf`.
se.head()

Unnamed: 0,Genere,fascia_id_2012,fascia_id_2013,fascia_id_2014,fascia_id_2015,fascia_id_2016,fascia_id_2017,fascia_id_2018,fascia_id_2019,fascia_id_2020,...,citations_1947,citations_1950,citations_1951,citations_2024,citations_1948,citations_1937,citations_1902,citations_1904,citations_1908,citations_1939
0,1,0.0,0.0,0.0,0.0,1.0,1.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Aggregate papers

In [31]:
# Add cumulative paper totals PapersTo2018 ... PapersTo2023 to both datasets
# (the range end is exclusive).
for i in range(2018, 2024):
  ita_inf, se = [
    aggregate_cols(frame, 'papers', f'PapersTo{i}', i)
    for frame in (ita_inf, se)
  ]

### Aggregate citations

In [32]:
# Add cumulative citation totals CitationsTo2018 ... CitationsTo2023 to both
# datasets (the range end is exclusive).
for i in range(2018, 2024):
  ita_inf, se = [
    aggregate_cols(frame, 'citations', f'CitationsTo{i}', i)
    for frame in (ita_inf, se)
  ]

## Disparate Impact on number of citations and publications

In [33]:
def create_entry(year, group, paper_class, discard_role, median_papers, median_citations):
    """Build one result row describing gender balance and disparate impact in `group`.

    The role column is always `fascia_id_<year>`, taken from the `year`
    argument (never from a notebook-level loop variable).

    Args:
        year: year of the `fascia_id_<year>` role column to evaluate.
        group: DataFrame with a `Genere` column (1 is counted as men, 0 as
            women) and the `fascia_id_<year>` column.
        paper_class: label stored under 'class' (callers here pass 'low' or 'high').
        discard_role: role code left out of the comparison. The positive label
            is 2 when this is 1, otherwise 1.
        median_papers: value stored under 'q3_papers'. Despite the parameter
            name, the caller in this notebook passes the 0.75 quantile.
        median_citations: value stored under 'q3_citations' (same remark).

    Returns:
        dict with keys 'year', 'class', 'group_size_men', 'group_size_women',
        'pos_class_men', 'pos_class_women', 'q3_papers', 'q3_citations', 'DI'.

    Notes:
        'DI' is whatever `disparate_impact` returns for the rows whose role
        differs from `discard_role`, called with {'Genere': 0} -- presumably
        the protected group; confirm against the `metrics` module.
        If that call raises ZeroDivisionError a message is printed and 'DI' is
        recorded as 0. NOTE(review): that 0 is a fallback value, not a
        measured ratio.
    """
    role_col = f'fascia_id_{year}'
    positive_label = 2 if discard_role == 1 else 1

    # Boolean masks computed once and reused for the four counts below.
    men = group['Genere'] == 1
    women = group['Genere'] == 0
    positive = group[role_col] == positive_label

    entry = {
        'year': year,
        'class': paper_class,
        'group_size_men': int(men.sum()),
        'group_size_women': int(women.sum()),
        'pos_class_men': int((men & positive).sum()),
        'pos_class_women': int((women & positive).sum()),
        'q3_papers': median_papers,
        'q3_citations': median_citations,
    }
    try:
        entry['DI'] = disparate_impact(group[group[role_col] != discard_role], {'Genere': 0}, role_col, positive_label)
    except ZeroDivisionError:
        print(f'ZeroDivisionError for year {year} and role {discard_role}')
        entry['DI'] = 0
    return entry


def compute_di(df, year, ris, role):
    """Append the 'low' and 'high' productivity rows for `year` to `ris`.

    Rows whose `fascia_id_<year>` equals `role` are dropped first. The 0.75
    quantiles of `CitationsTo<year>` and `PapersTo<year>` are then computed on
    the remaining rows and used as thresholds:

    * low group: citations <= q3 AND papers <= q3
    * high group: citations > q3 AND papers > q3

    Rows that are above one threshold but not the other belong to neither group.

    Args:
        df: DataFrame with `Genere`, `fascia_id_<year>`, `PapersTo<year>` and
            `CitationsTo<year>` columns. It is not modified.
        year: year to evaluate; selects the three per-year columns above.
        ris: DataFrame of previously accumulated rows (may be empty).
        role: role code excluded from the comparison.

    Returns:
        A new DataFrame: `ris` followed by the 'low' row and then the 'high'
        row produced by `create_entry`, with a fresh 0..n-1 index.
    """
    role_col = f'fascia_id_{year}'
    papers_col = f'PapersTo{year}'
    citations_col = f'CitationsTo{year}'

    # Use the `year` argument, not a notebook-level loop variable, to pick the role column.
    df = df[df[role_col] != role]
    q3_citations = df[citations_col].quantile(0.75)
    q3_papers = df[papers_col].quantile(0.75)

    low_group = df[(df[citations_col] <= q3_citations) & (df[papers_col] <= q3_papers)]
    high_group = df[(df[citations_col] > q3_citations) & (df[papers_col] > q3_papers)]

    # Build both rows at once and concatenate a single time.
    new_rows = pd.DataFrame([
        create_entry(year, low_group, 'low', role, q3_papers, q3_citations),
        create_entry(year, high_group, 'high', role, q3_papers, q3_citations),
    ])
    return pd.concat([ris, new_rows], ignore_index=True)

### Researchers vs Associate Professors on full Informatics

In [34]:
# Build the table for the full informatics dataset with role code 2 excluded:
# compute_di appends one 'low' and one 'high' row per year, for 2018..2022
# (the range end is exclusive).
# NOTE(review): before renaming the loop variable `i`, check that
# compute_di/create_entry do not read a notebook-level `i`.
res_vs_ass_full = pd.DataFrame()

for i in range(2018, 2023):
  res_vs_ass_full = compute_di(ita_inf, i, res_vs_ass_full, 2)

In [35]:
# Display the accumulated table (two rows per year: 'low' then 'high').
res_vs_ass_full

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2018,low,10123,6994,2382,1376,73.0,1971.0,0.836104
1,2018,high,3318,1238,1606,703,73.0,1971.0,0.852382
2,2019,low,10057,6948,2596,1618,78.0,2036.0,0.902158
3,2019,high,3285,1219,1540,705,78.0,2036.0,0.810588
4,2020,low,9984,6878,2737,1764,83.0,2106.25,0.935548
5,2020,high,3290,1210,1474,698,83.0,2106.25,0.776661
6,2021,low,9819,6755,2902,2002,88.25,2134.0,0.997221
7,2021,high,3254,1186,1377,686,88.25,2134.0,0.731606
8,2022,low,9784,6698,3352,2428,94.0,2131.0,0.945114
9,2022,high,3232,1158,1360,665,94.0,2131.0,0.732748


### Associate vs Full Professors on full Informatics

In [36]:
# Same procedure on the full informatics dataset, this time with role code 1
# excluded; years 2018..2022 (the range end is exclusive).
# NOTE(review): before renaming the loop variable `i`, check that
# compute_di/create_entry do not read a notebook-level `i`.
ass_full = pd.DataFrame()
for i in range(2018, 2023):
  ass_full = compute_di(ita_inf, i, ass_full, 1)

In [37]:
# Display the accumulated table (two rows per year: 'low' then 'high').
ass_full

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2018,low,9249,6110,1185,285,81.0,2191.75,0.364066
1,2018,high,3477,840,1964,420,81.0,2191.75,0.885183
2,2019,low,9127,5908,1248,330,90.0,2368.25,0.408495
3,2019,high,3424,809,1950,433,90.0,2368.25,0.939807
4,2020,low,9068,5783,1314,372,99.0,2540.0,0.443921
5,2020,high,3398,798,1936,451,99.0,2540.0,0.991954
6,2021,low,9027,5552,1513,459,110.0,2723.0,0.493251
7,2021,high,3360,767,1932,461,110.0,2723.0,0.95667
8,2022,low,8748,5158,1643,538,122.0,2913.25,0.555357
9,2022,high,3196,731,1830,443,122.0,2913.25,0.944839


### Researchers vs Associate Professors in Software Engineering (SE)

In [38]:
# Software-engineering subset with role code 2 excluded.
# NOTE(review): this loop starts at 2019, whereas the other three comparisons
# start at 2018 -- confirm that skipping 2018 is intentional.
# NOTE(review): before renaming the loop variable `i`, check that
# compute_di/create_entry do not read a notebook-level `i`.
rs_ass_se = pd.DataFrame()
for i in range(2019, 2023):
  rs_ass_se = compute_di(se, i, rs_ass_se, 2)

ZeroDivisionError for year 2022 and role 2


In [39]:
# Display the accumulated table (two rows per year: 'low' then 'high').
# NOTE(review): create_entry records DI = 0 when disparate_impact raises
# ZeroDivisionError, so a 0.0 here may be that fallback rather than a
# measured ratio.
rs_ass_se

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2019,low,52,14,11,7,102.5,1907.5,0.423077
1,2019,high,16,2,7,1,102.5,1907.5,0.875
2,2020,low,50,13,8,6,102.25,1994.5,0.346667
3,2020,high,16,1,6,0,102.25,1994.5,0.0
4,2021,low,48,12,9,5,107.5,1913.75,0.45
5,2021,high,15,1,5,0,107.5,1913.75,0.0
6,2022,low,46,11,11,5,108.0,1705.0,0.526087
7,2022,high,14,0,6,0,108.0,1705.0,0.0


### Associate vs Full Professors in Software Engineering (SE)

In [40]:
# Software-engineering subset with role code 1 excluded; years 2018..2022
# (the range end is exclusive).
# NOTE(review): before renaming the loop variable `i`, check that
# compute_di/create_entry do not read a notebook-level `i`.
ass_full_se = pd.DataFrame()
for i in range(2018, 2023):
  ass_full_se = compute_di(se, i, ass_full_se, 1)

In [41]:
# Display the accumulated table (two rows per year: 'low' then 'high').
# NOTE(review): create_entry records DI = 0 when disparate_impact raises
# ZeroDivisionError, so a 0.0 here may be that fallback rather than a
# measured ratio.
ass_full_se

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2018,low,52,8,5,0,120.0,2035.0,0.0
1,2018,high,14,2,7,1,120.0,2035.0,1.0
2,2019,low,51,8,5,1,133.5,2275.0,0.784314
3,2019,high,15,2,7,1,133.5,2275.0,0.933333
4,2020,low,52,9,8,2,149.0,2461.5,0.692308
5,2020,high,17,1,8,0,149.0,2461.5,0.0
6,2021,low,52,9,10,2,167.75,2495.75,0.865385
7,2021,high,16,1,8,0,167.75,2495.75,0.0
8,2022,low,51,8,11,2,175.0,2532.0,0.862745
9,2022,high,15,1,9,1,175.0,2532.0,0.6
