# Analysis of Disparate Impact based on gender and number of publications

In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from metrics import disparate_impact
import matplotlib.pyplot as plt
from icecream import ic
import warnings
import matplotlib

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset preparation

In [26]:
# Load the processed researchers table; the first CSV column becomes the row index.
ita_prof = pd.read_csv('../processed_data/italian_researchers.csv', index_col = 0)

In [27]:
# Preview the first rows to check the loaded columns (gender, per-year role ids, per-year counts).
ita_prof.head()

Unnamed: 0,Genere,Surname,First_Name,fascia_id,fascia_id_2012,fascia_id_2013,fascia_id_2014,fascia_id_2015,fascia_id_2016,fascia_id_2017,...,citations_1947,citations_1950,citations_1951,citations_2024,citations_1948,citations_1937,citations_1902,citations_1904,citations_1908,citations_1939
2,1,AMORENA,Michele,2,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,ANGELINO,Donato,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,ARFELLI,Giuseppe,1,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,ASTE,Giovanni,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0,BARBONI,Barbara,2,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Aggregate papers

In [28]:
def aggregate_cols(df, param, col_name, year):
    """Add a cumulative column summing the per-year columns up to `year`.

    Every column whose name contains `param` is expected to look like
    '<prefix>_<year>'; the token after the first underscore is parsed as an
    integer year. Columns with a year less than or equal to `year` are added
    together, row by row, into `df[col_name]`.

    Args:
        df: DataFrame holding the per-year columns. It is modified in place.
        param: Substring that selects the per-year columns (e.g. 'papers').
        col_name: Name of the column that receives the running total.
        year: Last year (inclusive) to include in the total.

    Returns:
        The same `df` object, with `col_name` added or overwritten.
    """
    # Start from zero so the total exists even if no column qualifies.
    df[col_name] = 0
    for column in df.columns:
        if param not in column:
            continue
        # The year is the second '_'-separated token of the column name.
        column_year = int(column.split('_')[1])
        if column_year > year:
            continue
        df[col_name] = df[col_name] + df[column]
    return df

In [29]:
# Build one cumulative 'PapersTo<year>' column for each year from 2014 to 2023.
for i in range(2014, 2024):
    ita_prof = aggregate_cols(
        df=ita_prof,
        param='papers',
        col_name=f'PapersTo{i}',
        year=i,
    )

### Aggregate citations

In [30]:
# Build one cumulative 'CitationsTo<year>' column for each year from 2014 to 2023.
for i in range(2014, 2024):
    ita_prof = aggregate_cols(
        df=ita_prof,
        param='citations',
        col_name=f'CitationsTo{i}',
        year=i,
    )

## Disparate Impact on number of publications

In [31]:
def create_entry(year, group, paper_class, discard_role, median):
    """Build one result row with group sizes, positive counts and DI.

    The role column is selected with the `year` argument. (The previous
    version read the notebook-global loop variable `i`, so it only worked
    when called from a loop whose variable was named `i` and equal to `year`.)

    Args:
        year: Year used to pick the 'fascia_id_<year>' role column and stored
            in the row.
        group: DataFrame with a 'Genere' column (1 is counted as men, 0 as
            women) and the 'fascia_id_<year>' column.
        paper_class: Label stored under 'paper_class' (callers in this
            notebook pass 'low' or 'high').
        discard_role: Role code excluded from the comparison. The positive
            label is 2 when this is 1, otherwise 1.
        median: Threshold stored under 'q3'. Despite the name, the caller in
            this notebook passes the 0.75 quantile.

    Returns:
        A dict with keys 'year', 'paper_class', 'group_size_men',
        'group_size_women', 'pos_class_men', 'pos_class_women', 'q3', 'DI'.
    """
    role_col = f'fascia_id_{year}'
    positive_label = 2 if discard_role == 1 else 1

    is_man = group['Genere'] == 1
    is_woman = group['Genere'] == 0
    is_positive = group[role_col] == positive_label

    entry = {
        'year': year,
        'paper_class': paper_class,
        'group_size_men': int(is_man.sum()),
        'group_size_women': int(is_woman.sum()),
        'pos_class_men': int((is_man & is_positive).sum()),
        'pos_class_women': int((is_woman & is_positive).sum()),
        'q3': median,
    }
    try:
        # disparate_impact comes from the project's `metrics` module (not
        # visible here). The outputs printed in this notebook are consistent
        # with (women positive rate) / (men positive rate) - confirm there.
        entry['DI'] = disparate_impact(
            group[group[role_col] != discard_role],
            {'Genere': 0},
            role_col,
            positive_label,
        )
    except ZeroDivisionError:
        # Keep the row but record DI as 0 when the ratio is undefined.
        print(f'ZeroDivisionError for year {year} and role {discard_role}')
        entry['DI'] = 0
    return entry


def compute_di(df, year, ris, role):
    """Append the 'low' and 'high' publication-class rows for `year` to `ris`.

    Rows whose role in `year` equals `role` are dropped. The remaining rows
    are split at the 0.75 quantile of 'PapersTo<year>': values less than or
    equal to it form the 'low' group, values above it the 'high' group.

    The role filter uses the `year` argument. (The previous version read the
    notebook-global loop variable `i` instead.)

    Args:
        df: DataFrame with 'fascia_id_<year>' and 'PapersTo<year>' columns,
            plus whatever `create_entry` reads.
        year: Year to analyse.
        ris: DataFrame of rows accumulated so far (may be empty).
        role: Role code to exclude from the comparison.

    Returns:
        A new DataFrame: `ris` followed by the 'low' row and the 'high' row,
        with a fresh integer index.
    """
    df = df[df[f'fascia_id_{year}'] != role]
    papers = df[f'PapersTo{year}']
    q3 = papers.quantile(0.75)
    low_group = df[papers <= q3]
    high_group = df[papers > q3]
    # Build both rows first so `ris` is concatenated once per call.
    new_rows = pd.DataFrame([
        create_entry(year, low_group, 'low', role, q3),
        create_entry(year, high_group, 'high', role, q3),
    ])
    return pd.concat([ris, new_rows], ignore_index=True)

### Researchers vs Associate Professors on full Informatics

In [32]:
# Start from an empty frame; each compute_di call returns it with a 'low' and a 'high' row appended.
res_vs_ass_full = pd.DataFrame()

# Years 2014-2022 (the range end is exclusive); role code 2 is the role excluded from the comparison.
for i in range(2014, 2023):
  res_vs_ass_full = compute_di(ita_prof, i, res_vs_ass_full, 2)

In [33]:
# Display the per-year table (the captured output below lists only the rows for 2014-2018).
res_vs_ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2014,low,11043,7892,2176,1197,54.0,0.769724
1,2014,high,4611,1669,2197,811,54.0,0.980552
2,2015,low,11114,7915,2503,1428,59.0,0.8011
3,2015,high,4601,1643,2425,917,59.0,0.944339
4,2016,low,11113,7905,2519,1529,64.0,0.853314
5,2016,high,4548,1601,2310,877,64.0,0.927221
6,2017,low,11133,7886,2581,1573,69.0,0.860392
7,2017,high,4586,1598,2233,863,69.0,0.901614
8,2018,low,10996,7813,2716,1697,73.0,0.879365
9,2018,high,4616,1588,2208,873,73.0,0.870101


### Associate vs Full Professors on full Informatics

In [34]:
# Same procedure with role code 1 excluded, so the positive label inside create_entry becomes 2.
ass_full = pd.DataFrame()
# Years 2014-2022 (the range end is exclusive).
for i in range(2014, 2023):
  ass_full = compute_di(ita_prof, i, ass_full, 1)

In [35]:
# Display the per-year table (the captured output below lists only the rows for 2014-2018).
ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2014,low,10420,7072,1337,245,58.0,0.269998
1,2014,high,4537,1158,2339,432,58.0,0.723625
2,2015,low,9957,6801,1259,255,61.0,0.296531
3,2015,high,4445,1092,2356,425,61.0,0.734282
4,2016,low,10052,6756,1291,287,68.0,0.330764
5,2016,high,4449,1076,2378,445,68.0,0.773746
6,2017,low,10023,6718,1297,302,74.0,0.347396
7,2017,high,4493,1084,2314,452,74.0,0.809622
8,2018,low,9941,6620,1358,349,81.0,0.38592
9,2018,high,4465,1048,2360,488,81.0,0.880984


## Disparate Impact on number of citations and publications

In [36]:
def create_entry(year, group, paper_class, discard_role, median_papers, median_citations):
    """Build one result row with group sizes, positive counts, thresholds and DI.

    The role column is selected with the `year` argument. (The previous
    version read the notebook-global loop variable `i`, so it only worked
    when called from a loop whose variable was named `i` and equal to `year`.)

    Args:
        year: Year used to pick the 'fascia_id_<year>' role column and stored
            in the row.
        group: DataFrame with a 'Genere' column (1 is counted as men, 0 as
            women) and the 'fascia_id_<year>' column.
        paper_class: Label stored under 'class' (callers in this notebook
            pass 'low' or 'high').
        discard_role: Role code excluded from the comparison. The positive
            label is 2 when this is 1, otherwise 1.
        median_papers: Threshold stored under 'q3_papers'. Despite the name,
            the caller in this notebook passes the 0.75 quantile.
        median_citations: Threshold stored under 'q3_citations'; likewise the
            0.75 quantile in this notebook.

    Returns:
        A dict with keys 'year', 'class', 'group_size_men',
        'group_size_women', 'pos_class_men', 'pos_class_women', 'q3_papers',
        'q3_citations', 'DI'.
    """
    role_col = f'fascia_id_{year}'
    positive_label = 2 if discard_role == 1 else 1

    is_man = group['Genere'] == 1
    is_woman = group['Genere'] == 0
    is_positive = group[role_col] == positive_label

    entry = {
        'year': year,
        'class': paper_class,
        'group_size_men': int(is_man.sum()),
        'group_size_women': int(is_woman.sum()),
        'pos_class_men': int((is_man & is_positive).sum()),
        'pos_class_women': int((is_woman & is_positive).sum()),
        'q3_papers': median_papers,
        'q3_citations': median_citations,
    }
    try:
        # disparate_impact comes from the project's `metrics` module (not
        # visible here). The outputs printed in this notebook are consistent
        # with (women positive rate) / (men positive rate) - confirm there.
        entry['DI'] = disparate_impact(
            group[group[role_col] != discard_role],
            {'Genere': 0},
            role_col,
            positive_label,
        )
    except ZeroDivisionError:
        # Keep the row but record DI as 0 when the ratio is undefined.
        print(f'ZeroDivisionError for year {year} and role {discard_role}')
        entry['DI'] = 0
    return entry


def compute_di(df, year, ris, role):
    """Append the 'low' and 'high' rows for `year`, split on papers AND citations.

    Rows whose role in `year` equals `role` are dropped. The 0.75 quantiles
    of 'PapersTo<year>' and 'CitationsTo<year>' are computed on the remaining
    rows. The 'low' group is at or below both thresholds and the 'high' group
    is above both. Rows above one threshold but not the other belong to
    neither group.

    The role filter uses the `year` argument. (The previous version read the
    notebook-global loop variable `i` instead.)

    Args:
        df: DataFrame with 'fascia_id_<year>', 'PapersTo<year>' and
            'CitationsTo<year>' columns, plus whatever `create_entry` reads.
        year: Year to analyse.
        ris: DataFrame of rows accumulated so far (may be empty).
        role: Role code to exclude from the comparison.

    Returns:
        A new DataFrame: `ris` followed by the 'low' row and the 'high' row,
        with a fresh integer index.
    """
    df = df[df[f'fascia_id_{year}'] != role]
    citations = df[f'CitationsTo{year}']
    papers = df[f'PapersTo{year}']
    q3_citations = citations.quantile(0.75)
    q3_papers = papers.quantile(0.75)
    low_group = df[(citations <= q3_citations) & (papers <= q3_papers)]
    high_group = df[(citations > q3_citations) & (papers > q3_papers)]
    # Build both rows first so `ris` is concatenated once per call.
    new_rows = pd.DataFrame([
        create_entry(year, low_group, 'low', role, q3_papers, q3_citations),
        create_entry(year, high_group, 'high', role, q3_papers, q3_citations),
    ])
    return pd.concat([ris, new_rows], ignore_index=True)

### Researchers vs Associate Professors on full Informatics

In [37]:
# Reset the accumulator; this run uses the papers-and-citations versions of the helpers defined in the cell above.
res_vs_ass_full = pd.DataFrame()

# Years 2014-2022 (the range end is exclusive); role code 2 is the role excluded from the comparison.
for i in range(2014, 2023):
  res_vs_ass_full = compute_di(ita_prof, i, res_vs_ass_full, 2)

In [38]:
# Display the per-year table (the captured output below lists only the rows for 2014-2018).
res_vs_ass_full

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2014,low,10193,7088,1976,1019,54.0,1555.0,0.741593
1,2014,high,3359,1290,1617,651,54.0,1555.0,0.953913
2,2015,low,10240,7102,2238,1171,59.0,1680.0,0.754425
3,2015,high,3346,1284,1791,744,59.0,1680.0,0.923765
4,2016,low,10220,7086,2234,1249,64.0,1776.0,0.80636
5,2016,high,3308,1270,1708,726,64.0,1776.0,0.903212
6,2017,low,10235,7080,2271,1276,69.0,1897.0,0.812247
7,2017,high,3331,1260,1649,706,69.0,1897.0,0.883511
8,2018,low,10123,6994,2382,1376,73.0,1971.0,0.836104
9,2018,high,3318,1238,1606,703,73.0,1971.0,0.852382


### Associate vs Full Professors on full Informatics

In [39]:
# Same procedure with role code 1 excluded, so the positive label inside create_entry becomes 2.
ass_full = pd.DataFrame()
# Years 2014-2022 (the range end is exclusive).
for i in range(2014, 2023):
  ass_full = compute_di(ita_prof, i, ass_full, 1)

In [40]:
# Display the per-year table (the captured output below lists only the rows for 2014-2018).
ass_full

Unnamed: 0,year,class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3_papers,q3_citations,DI
0,2014,low,9644,6416,1209,218,58.0,1643.5,0.271034
1,2014,high,3459,906,1857,350,58.0,1643.5,0.719579
2,2015,low,9226,6206,1138,223,61.0,1720.5,0.291316
3,2015,high,3403,845,1913,345,61.0,1720.5,0.726289
4,2016,low,9285,6199,1142,239,68.0,1882.0,0.313467
5,2016,high,3415,844,1946,367,68.0,1882.0,0.763082
6,2017,low,9303,6189,1147,250,74.0,2026.0,0.327627
7,2017,high,3469,860,1922,382,74.0,2026.0,0.801707
8,2018,low,9249,6110,1185,285,81.0,2191.75,0.364066
9,2018,high,3477,840,1964,420,81.0,2191.75,0.885183
