# Analysis of Disparate Impact based on gender and number of publications

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from metrics import disparate_impact
import matplotlib.pyplot as plt
import matplotlib

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Dataset preparation

In [8]:
# Load the pre-processed researcher table; the first CSV column is used as the row index.
ita_prof = pd.read_csv(
    '../processed_data/ita_researchers.csv',
    index_col=0,
)

In [6]:
# Preview the first rows of the loaded table as a quick sanity check of its columns.
ita_prof.head()

Unnamed: 0,Genere,fascia_id_2012,fascia_id_2013,fascia_id_2014,fascia_id_2015,fascia_id_2016,fascia_id_2017,fascia_id_2018,fascia_id_2019,fascia_id_2020,...,citations_1947,citations_1950,citations_1951,citations_2024,citations_1948,citations_1937,citations_1902,citations_1904,citations_1908,citations_1939
0,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Aggregate papers

In [9]:
def aggregate_cols(df, param, col_name, year):
    """Add a cumulative total over the yearly ``<param>_<year>`` columns.

    A column contributes to the total when its name contains ``param`` and the
    second underscore-separated token of its name parses as an integer that is
    not greater than ``year``. Columns that contain ``param`` but carry no
    integer year token (for instance a previously created total column) are
    skipped instead of raising.

    Args:
        df: DataFrame holding the yearly columns. It is modified in place.
        param: Substring identifying the columns to aggregate (e.g. 'papers').
        col_name: Name of the column receiving the total. It is created, or
            overwritten if it already exists, and never counted as an input.
        year: Last year (inclusive) whose column is included.

    Returns:
        The same ``df`` object, with ``col_name`` set. If no column qualifies
        the new column is filled with the integer 0.
    """
    total = 0
    # Snapshot the column labels so the assignment below cannot interfere
    # with the iteration.
    for c in list(df.columns):
        if c == col_name or param not in c:
            continue
        tokens = c.split('_')
        if len(tokens) < 2:
            # Name contains `param` but has no '_<year>' suffix.
            continue
        try:
            current_year = int(tokens[1])
        except ValueError:
            # Second token is not a year (e.g. 'papers_total'); not a yearly column.
            continue
        if current_year <= year:
            # Plain `+` keeps the original NaN-propagating, row-wise semantics.
            total = total + df[c]
    df[col_name] = total
    return df

In [15]:
# One cumulative column per cut-off year 2018..2023: PapersTo<year> sums the
# yearly paper columns up to and including that year.
for i in range(2018, 2024):
    ita_prof = aggregate_cols(
        df=ita_prof,
        param='papers',
        col_name=f'PapersTo{i}',
        year=i,
    )

### Aggregate citations

In [16]:
# One cumulative column per cut-off year 2018..2023: CitationsTo<year> sums the
# yearly citation columns up to and including that year.
for i in range(2018, 2024):
    ita_prof = aggregate_cols(
        df=ita_prof,
        param='citations',
        col_name=f'CitationsTo{i}',
        year=i,
    )

## Disparate Impact on number of publications

In [17]:
def create_entry(year, group, paper_class, discard_role, median):
    """Build one result row (group counts and disparate impact) for a group.

    Args:
        year: Year analysed; selects the role column ``fascia_id_<year>``.
        group: DataFrame with a ``Genere`` column (1 is counted as men, 0 as
            women) and the role column for ``year``.
        paper_class: Label stored in the row ('low' or 'high' in this notebook).
        discard_role: Role id excluded from the comparison. The positive label
            is 2 when ``discard_role`` is 1, otherwise 1.
        median: Threshold stored under the ``q3`` key. Despite the parameter
            name, the caller in this notebook passes the 0.75 quantile.

    Returns:
        dict with keys ``year``, ``paper_class``, ``group_size_men``,
        ``group_size_women``, ``pos_class_men``, ``pos_class_women``, ``q3``
        and ``DI``. ``DI`` is set to 0 (and a message is printed) when
        ``disparate_impact`` raises ``ZeroDivisionError``.
    """
    positive_label = 2 if discard_role == 1 else 1
    # Derive the role column from the `year` argument. The previous version
    # read a notebook-level loop variable `i`, which only worked when the
    # calling loop happened to use that exact name.
    role_col = f'fascia_id_{year}'

    is_man = group['Genere'] == 1
    is_woman = group['Genere'] == 0
    is_positive = group[role_col] == positive_label

    entry = {
        'year': year,
        'paper_class': paper_class,
        'group_size_men': group[is_man].shape[0],
        'group_size_women': group[is_woman].shape[0],
        'pos_class_men': group[is_man & is_positive].shape[0],
        'pos_class_women': group[is_woman & is_positive].shape[0],
        'q3': median,
    }
    try:
        # presumably {'Genere': 0} marks the protected group — confirm against metrics.disparate_impact
        entry['DI'] = disparate_impact(group[group[role_col] != discard_role], {'Genere': 0}, role_col, positive_label)
    except ZeroDivisionError:
        print(f'ZeroDivisionError for year {year} and role {discard_role}')
        # NOTE(review): 0 is a placeholder here, not a measured disparate impact.
        entry['DI'] = 0
    return entry


def compute_di(df, year, ris, role):
    """Append the 'low' and 'high' publication-class rows for one year.

    Rows whose ``fascia_id_<year>`` equals ``role`` are dropped, the remainder
    is split at the 0.75 quantile of ``PapersTo<year>`` (``<=`` goes to 'low',
    ``>`` to 'high'), and one ``create_entry`` row per class is appended.

    Args:
        df: DataFrame with ``fascia_id_<year>``, ``PapersTo<year>`` and the
            columns ``create_entry`` reads. It is not modified.
        year: Year to analyse.
        ris: Accumulated results DataFrame (may be empty).
        role: Role id to discard before the comparison.

    Returns:
        A new DataFrame: ``ris`` followed by the 'low' row and then the 'high'
        row, with a fresh 0..n-1 index.
    """
    # Filter on the `year` argument rather than a notebook-level `i`, so the
    # function no longer depends on the name of the caller's loop variable.
    kept = df[df[f'fascia_id_{year}'] != role]
    papers = kept[f'PapersTo{year}']
    q3 = papers.quantile(0.75)
    low_group = kept[papers <= q3]
    high_group = kept[papers > q3]
    low_row = pd.DataFrame(create_entry(year, low_group, 'low', role, q3), index=[0])
    high_row = pd.DataFrame(create_entry(year, high_group, 'high', role, q3), index=[0])
    return pd.concat([ris, low_row, high_row], ignore_index=True)

### Researchers vs Associate Professors on full Informatics

In [18]:
# Publications-only analysis for years 2018..2022, discarding role id 2
# (presumably full professors, per the section heading — confirm the encoding).
res_vs_ass_full = pd.DataFrame()

# NOTE(review): keep the loop variable named `i` unless create_entry/compute_di
# derive the role column from their `year` argument; versions that index
# `fascia_id_{i}` read this notebook-level variable.
for i in range(2018, 2023):
  res_vs_ass_full = compute_di(ita_prof, i, res_vs_ass_full, 2)

In [19]:
# Display the accumulated table: one 'low' and one 'high' row per year.
res_vs_ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2018,low,10996,7813,2716,1697,73.0,0.879365
1,2018,high,4616,1588,2208,873,73.0,0.870101
2,2019,low,10942,7765,2998,1967,78.0,0.924545
3,2019,high,4572,1549,2161,878,78.0,0.833884
4,2020,low,10841,7686,3148,2159,83.0,0.967357
5,2020,high,4577,1556,2134,880,83.0,0.824405
6,2021,low,10640,7555,3327,2440,88.25,0.968179
7,2021,high,4515,1550,2065,879,88.25,0.806501
8,2022,low,10615,7495,3811,2910,94.0,0.924693
9,2022,high,4454,1513,2072,879,94.0,0.800736


### Associate vs Full Professors on full Informatics

In [20]:
# Publications-only analysis for years 2018..2022, discarding role id 1
# (create_entry then uses 2 as the positive label).
ass_full = pd.DataFrame()
# NOTE(review): keep the loop variable named `i` unless create_entry/compute_di
# derive the role column from their `year` argument; versions that index
# `fascia_id_{i}` read this notebook-level variable.
for i in range(2018, 2023):
  ass_full = compute_di(ita_prof, i, ass_full, 1)

In [21]:
# Display the accumulated table: one 'low' and one 'high' row per year.
ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2018,low,9941,6620,1358,349,81.0,0.38592
1,2018,high,4465,1048,2360,488,81.0,0.880984
2,2019,low,9805,6388,1456,416,90.0,0.438545
3,2019,high,4366,1005,2360,508,90.0,0.935124
4,2020,low,9747,6217,1560,467,99.0,0.469334
5,2020,high,4301,982,2352,529,99.0,0.985091
6,2021,low,9702,5964,1796,575,110.0,0.520817
7,2021,high,4236,955,2379,558,110.0,0.961187
8,2022,low,9412,5541,1958,690,122.0,0.598591
9,2022,high,4035,908,2303,540,122.0,0.959715


## Disparate Impact on number of citations and publications

In [12]:
def create_entry(year, group, paper_class, discard_role, median_papers, median_citations):
    """Build one result row for the combined citations/publications analysis.

    NOTE: this redefines the ``create_entry`` from the publications-only
    section with a different signature and different output keys; whichever
    definition was executed last is the one in effect.

    Args:
        year: Year analysed; selects the role column ``fascia_id_<year>``.
        group: DataFrame with a ``Genere`` column (1 is counted as men, 0 as
            women) and the role column for ``year``.
        paper_class: Label stored under the ``class`` key ('low' or 'high' in
            this notebook).
        discard_role: Role id excluded from the comparison. The positive label
            is 2 when ``discard_role`` is 1, otherwise 1.
        median_papers: Threshold stored under ``q3_papers``. Despite the
            name, the caller in this notebook passes the 0.75 quantile.
        median_citations: Threshold stored under ``q3_citations``; likewise
            the 0.75 quantile in this notebook.

    Returns:
        dict with keys ``year``, ``class``, ``group_size_men``,
        ``group_size_women``, ``pos_class_men``, ``pos_class_women``,
        ``q3_papers``, ``q3_citations`` and ``DI``. ``DI`` is set to 0 (and a
        message is printed) when ``disparate_impact`` raises
        ``ZeroDivisionError``.
    """
    positive_label = 2 if discard_role == 1 else 1
    # Derive the role column from the `year` argument. The previous version
    # read a notebook-level loop variable `i`, which only worked when the
    # calling loop happened to use that exact name.
    role_col = f'fascia_id_{year}'

    is_man = group['Genere'] == 1
    is_woman = group['Genere'] == 0
    is_positive = group[role_col] == positive_label

    entry = {
        'year': year,
        'class': paper_class,
        'group_size_men': group[is_man].shape[0],
        'group_size_women': group[is_woman].shape[0],
        'pos_class_men': group[is_man & is_positive].shape[0],
        'pos_class_women': group[is_woman & is_positive].shape[0],
        'q3_papers': median_papers,
        'q3_citations': median_citations,
    }
    try:
        # presumably {'Genere': 0} marks the protected group — confirm against metrics.disparate_impact
        entry['DI'] = disparate_impact(group[group[role_col] != discard_role], {'Genere': 0}, role_col, positive_label)
    except ZeroDivisionError:
        print(f'ZeroDivisionError for year {year} and role {discard_role}')
        # NOTE(review): 0 is a placeholder here, not a measured disparate impact.
        entry['DI'] = 0
    return entry


def compute_di(df, year, ris, role):
    """Append the 'low' and 'high' rows of the citations+publications split.

    NOTE: this redefines the ``compute_di`` from the publications-only
    section; whichever definition was executed last is the one in effect.

    Rows whose ``fascia_id_<year>`` equals ``role`` are dropped. The 0.75
    quantiles of ``CitationsTo<year>`` and ``PapersTo<year>`` are computed on
    the remainder. 'low' holds rows at or below BOTH thresholds, 'high' holds
    rows above BOTH; rows that are above one threshold but not the other fall
    in neither group.

    Args:
        df: DataFrame with ``fascia_id_<year>``, ``PapersTo<year>``,
            ``CitationsTo<year>`` and the columns ``create_entry`` reads. It
            is not modified.
        year: Year to analyse.
        ris: Accumulated results DataFrame (may be empty).
        role: Role id to discard before the comparison.

    Returns:
        A new DataFrame: ``ris`` followed by the 'low' row and then the 'high'
        row, with a fresh 0..n-1 index.
    """
    # Filter on the `year` argument rather than a notebook-level `i`, so the
    # function no longer depends on the name of the caller's loop variable.
    kept = df[df[f'fascia_id_{year}'] != role]
    citations = kept[f'CitationsTo{year}']
    papers = kept[f'PapersTo{year}']
    q3_citations = citations.quantile(0.75)
    q3_papers = papers.quantile(0.75)
    low_group = kept[(citations <= q3_citations) & (papers <= q3_papers)]
    high_group = kept[(citations > q3_citations) & (papers > q3_papers)]
    low_row = pd.DataFrame(create_entry(year, low_group, 'low', role, q3_papers, q3_citations), index=[0])
    high_row = pd.DataFrame(create_entry(year, high_group, 'high', role, q3_papers, q3_citations), index=[0])
    return pd.concat([ris, low_row, high_row], ignore_index=True)

### Researchers vs Associate Professors on full Informatics

In [22]:
# Citations+publications analysis for years 2018..2022, discarding role id 2.
# This overwrites the `res_vs_ass_full` built in the publications-only section.
res_vs_ass_full = pd.DataFrame()

# NOTE(review): keep the loop variable named `i` unless create_entry/compute_di
# derive the role column from their `year` argument; versions that index
# `fascia_id_{i}` read this notebook-level variable.
for i in range(2018, 2023):
  res_vs_ass_full = compute_di(ita_prof, i, res_vs_ass_full, 2)

In [23]:
# NOTE(review): the saved output below is identical to the publications-only
# table and has `paper_class`/`q3` columns, whereas the create_entry defined in
# this section emits `class`, `q3_papers` and `q3_citations`. The output looks
# stale (produced with the earlier definitions); re-run the cells in order.
res_vs_ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2018,low,10996,7813,2716,1697,73.0,0.879365
1,2018,high,4616,1588,2208,873,73.0,0.870101
2,2019,low,10942,7765,2998,1967,78.0,0.924545
3,2019,high,4572,1549,2161,878,78.0,0.833884
4,2020,low,10841,7686,3148,2159,83.0,0.967357
5,2020,high,4577,1556,2134,880,83.0,0.824405
6,2021,low,10640,7555,3327,2440,88.25,0.968179
7,2021,high,4515,1550,2065,879,88.25,0.806501
8,2022,low,10615,7495,3811,2910,94.0,0.924693
9,2022,high,4454,1513,2072,879,94.0,0.800736


### Associate vs Full Professors on full Informatics

In [24]:
# Citations+publications analysis for years 2018..2022, discarding role id 1.
# This overwrites the `ass_full` built in the publications-only section.
ass_full = pd.DataFrame()
# NOTE(review): keep the loop variable named `i` unless create_entry/compute_di
# derive the role column from their `year` argument; versions that index
# `fascia_id_{i}` read this notebook-level variable.
for i in range(2018, 2023):
  ass_full = compute_di(ita_prof, i, ass_full, 1)

In [25]:
# NOTE(review): the saved output below is identical to the publications-only
# table and has `paper_class`/`q3` columns, whereas the create_entry defined in
# this section emits `class`, `q3_papers` and `q3_citations`. The output looks
# stale (produced with the earlier definitions); re-run the cells in order.
ass_full

Unnamed: 0,year,paper_class,group_size_men,group_size_women,pos_class_men,pos_class_women,q3,DI
0,2018,low,9941,6620,1358,349,81.0,0.38592
1,2018,high,4465,1048,2360,488,81.0,0.880984
2,2019,low,9805,6388,1456,416,90.0,0.438545
3,2019,high,4366,1005,2360,508,90.0,0.935124
4,2020,low,9747,6217,1560,467,99.0,0.469334
5,2020,high,4301,982,2352,529,99.0,0.985091
6,2021,low,9702,5964,1796,575,110.0,0.520817
7,2021,high,4236,955,2379,558,110.0,0.961187
8,2022,low,9412,5541,1958,690,122.0,0.598591
9,2022,high,4035,908,2303,540,122.0,0.959715
