In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Analysis of additionally gathered data
Based on the data extracted from the first round I import the three spreadsheets and look at affiliations, justifications and diseases in turn in this notebook, gathering some numbers and insights for analysis

In [2]:
#reading in files
aff = pd.read_csv('affiliations_data.csv', sep = ';')
jus = pd.read_csv('justification_data.csv', sep = ';')
dis = pd.read_csv('disease_data.csv', sep = ';')

In [3]:
#filling in nan values with 0's for easier handling
aff = aff.fillna(0)
dis = dis.fillna(0)

In [4]:
#removing id columns for increased readability
aff.drop(["list of 2012 ids", "list of 2021 ids"], axis = 1, inplace = True)

In [5]:
#adding count
aff['count']=aff['Counter 2012'] + aff['Counter 2021']
aff = aff.sort_values(by=['count'], ascending = False)

## Affiliations


In [6]:
#dictionary with country as key and categories as list items in value

def create_country_cat(df):
    dic = {}
    for index in range(len(df)):
        country = df.iloc[index, 3].strip()
        category = df.iloc[index, 2].strip()
        total_count = df.iloc[index, 5]
        count12 =df.iloc[index, 0]
        count21 = df.iloc[index, 1]
        if country not in dic:         
            dic[country] = [[category, total_count, count12, count21]] #add new empty list if country not in dic
        else: 
            dic[country].append([category, total_count, count12, count21])
    return dic


In [7]:
#[gov, uni, hosp, corp]
#count = 1 to get total count
#count = 2 to get 2012 count
#count = 3 to get 2021 count
def count_country_cat(dic, count):
    count_dic = {}
    for key in dic.keys():
        count_list = [0,0,0,0]
        for category in dic.get(key):
            #if the category name matches add the count to the appropriate list
            if category[0] == 'gov':
                count_list[0]+=category[count]
            
            elif category[0] == 'uni':
                count_list[1]+=category[count]
            
            elif category[0] == 'hosp':
                count_list[2]+=category[count]
            
            elif category[0] == 'corp':
                count_list[3]+=category[count]

            #making them percentages instead
            #total = sum(count_list)
            #if total != 0:
               # count_list[0] = count_list[0]/total*100
               # count_list[1] = count_list[1]/total*100
               # count_list[2] = count_list[2]/total*100
               # count_list[3] = count_list[3]/total*100
            
        count_dic[key] = count_list
    return count_dic

In [8]:
#count = 1 to get total count
#count = 2 to get 2012 count
#count = 3 to get 2021 count
#cut_off is for the threshold
def make_country_df(aff, count, cut_off):

    
    country_category = count_country_cat(create_country_cat(aff), count)
    df = pd.DataFrame.from_dict(country_category)
    row_names = {0:'Government', 1:'University', 2:'Hospital', 3: 'Corporation'}
    df.rename(index= row_names, inplace=True)
    df= df.transpose()
    
    
    #adding count
    df['count']=df.sum(axis = 1)
    df = df.sort_values(by=['count'], ascending = False)
    
    #adding threshold
    threshold = df[df['count']> cut_off]
    
    #adding max value
    #val = threshold.iloc[:, [0, 1, 2, 3]].idxmax(axis=1)
    #threshold['max'] = val
    
    
    sum_corp = threshold.iloc[:, 3].sum(axis = 0)
    
    return sum_corp, threshold



In [9]:
sum_corp12, df2012 = make_country_df(aff, 2, 0)
#38 positive hits in 2012
sum_corp21, df2021 = make_country_df(aff, 3, 0)
#73 positive hits in 2021

## Finding elite universities

In [10]:
universities = aff[aff['Type']=='uni']
universities

Unnamed: 0,Counter 2012,Counter 2021,Type,Country,Name,count
232,3.0,3.0,uni,United States,University of North Carolina at Chapel Hill,6.0
57,2.0,3.0,uni,China,Shanghai Jiao Tong University,5.0
233,4.0,1.0,uni,United States,University of Pennsylvania,5.0
195,2.0,3.0,uni,United States,Johns Hopkins University,5.0
28,0.0,5.0,uni,China,Chinese Academy of Sciences,5.0
...,...,...,...,...,...,...
108,1.0,0.0,uni,Germany,University of Luebeck,1.0
109,0.0,1.0,uni,Germany,University of Tubingen,1.0
110,1.0,0.0,uni,Germany,University of Wuppertal,1.0
111,0.0,1.0,uni,Greece,University of Thessaly,1.0


In [11]:
elite = pd.read_csv('elite_universities.csv', sep = ';')
elite = elite.iloc[0:50, :]

In [12]:
elite_dic = {}
for index in range(len(elite)):
    elite_dic[elite.iloc[index, 1].strip()] = [1, elite.iloc[index, 2]]

In [13]:
elite_list = []
for index in range(len(universities)):
    if universities.iloc[index, 4].strip() not in elite_dic:
        elite_list.append('non-elite')
    else:
        elite_list.append('elite')


In [14]:
universities['elite']= elite_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  universities['elite']= elite_list


In [15]:
universities[universities['elite'] == 'elite']

Unnamed: 0,Counter 2012,Counter 2021,Type,Country,Name,count,elite
57,2.0,3.0,uni,China,Shanghai Jiao Tong University,5.0,elite
233,4.0,1.0,uni,United States,University of Pennsylvania,5.0,elite
195,2.0,3.0,uni,United States,Johns Hopkins University,5.0,elite
226,3.0,0.0,uni,United States,"University of California, Los Angeles (UCLA)",3.0,elite
71,0.0,3.0,uni,Hong Kong SAR,The Chinese University of Hong Kong (CUHK),3.0,elite
102,0.0,3.0,uni,Germany,Technical University of Munich,3.0,elite
174,2.0,1.0,uni,United Kingdom,University of Oxford,3.0,elite
113,0.0,3.0,uni,Hong Kong SAR,The University of Hong Kong,3.0,elite
163,1.0,2.0,uni,United Kingdom,King's College London,3.0,elite
200,0.0,2.0,uni,United States,Massachusetts Institute of Technology (MIT),2.0,elite


In [16]:
#dividing into 2012 and 2021 to see the split of elite vs non-elite universities funding
uni_part = universities[universities['Counter 2012'] > 0]
num_art = uni_part['count'].sum()
print('2012: ')
print('number of articles affiliated with universities: ', num_art)
print(num_art/len(aff[aff['Counter 2012']> 0]))
print(len(uni_part), 'universities in total')
print('elite: ', len(uni_part[uni_part['elite'] == 'elite']), ' non-elite: ', len(uni_part[uni_part['elite'] == 'non-elite']))

uni_part = universities[universities['Counter 2021'] > 0]
num_art = uni_part['count'].sum()
print('2021: ')
print('number of articles affiliated with universities: ', num_art)
print(num_art/len(aff[aff['Counter 2021']> 0]))
print(len(uni_part), 'universities in total')
print('elite: ', len(uni_part[uni_part['elite'] == 'elite']), ' non-elite: ', len(uni_part[uni_part['elite'] == 'non-elite']))


2012: 
number of articles affiliated with universities:  84.0
0.9333333333333333
48 universities in total
elite:  16  non-elite:  32
2021: 
number of articles affiliated with universities:  159.0
0.8932584269662921
94 universities in total
elite:  21  non-elite:  73


In [17]:
16/48

0.3333333333333333

In [18]:
21/94

0.22340425531914893

## Looking at corporations

In [19]:
corporations = aff[aff['Type']=='corp']
corporations

Unnamed: 0,Counter 2012,Counter 2021,Type,Country,Name,count
192,1.0,2.0,corp,United States,IBM,3.0
37,0.0,3.0,corp,China,"Huawei Technologies Co., Ltd.",3.0
210,0.0,2.0,corp,United States,PAII Inc,2.0
53,0.0,2.0,corp,China,PingAn Technology,2.0
134,1.0,1.0,corp,Singapore,Singapore ASTAR SERC,2.0
154,0.0,2.0,corp,Taiwan,aetherAI,2.0
164,0.0,2.0,corp,United Kingdom,Astrazeneca,2.0
175,0.0,2.0,corp,United Kingdom,Wellcome Trust,2.0
180,0.0,2.0,corp,United States,Amazon,2.0
50,0.0,2.0,corp,China,Pazhou Lab,2.0


In [20]:
#dividing into 2012 and 2021 to see the split of corporation funding for all articles funded
uni_part = corporations[corporations['Counter 2012'] > 0]
num_art = uni_part['count'].sum()
print('2012: ', num_art,' funded by corporations, out of ' ,len(aff[aff['Counter 2012']> 0]), 'in total')
print(num_art/len(aff[aff['Counter 2012']> 0]))


uni_part = corporations[corporations['Counter 2021'] > 0]
num_art = uni_part['count'].sum()
print('2021: ', num_art,' funded by corporations, out of ', len(aff[aff['Counter 2021']> 0]) ,'in total')
print(num_art/len(aff[aff['Counter 2021']> 0]))

2012:  20.0  funded by corporations, out of  90 in total
0.2222222222222222
2021:  51.0  funded by corporations, out of  178 in total
0.28651685393258425


## Hopsitals and government affilitations in numbers

In [21]:
hospital = aff[aff['Type']=='hosp']
hospital

Unnamed: 0,Counter 2012,Counter 2021,Type,Country,Name,count
177,2.0,0.0,hosp,United States,Childrens Hospital of Philadelphia,2.0
240,0.0,2.0,hosp,United States,Vanderbilt University Medical Center,2.0
169,1.0,0.0,hosp,United Kingdom,RCUK Centre for Doctoral Training in Healthcar...,1.0
168,0.0,1.0,hosp,United Kingdom,National Hospital for Neurology and Neurosurgery,1.0
162,1.0,0.0,hosp,Uganda,"Mulago Hospital of Kampala, Uganda",1.0
155,0.0,1.0,hosp,Taiwan,Chang Gung Memorial Hospital,1.0
158,0.0,1.0,hosp,Taiwan,National Taiwan University Hospital,1.0
185,1.0,0.0,hosp,United States,"Children Hospital of L.A., Department of Patho...",1.0
203,0.0,1.0,hosp,United States,Newark Beth Israel Medical Center,1.0
223,1.0,0.0,hosp,United States,Stony Brook University Medical Center,1.0


In [22]:
#dividing into 2012 and 2021 to see the split of hospital funding for all articles funded
uni_part = hospital[hospital['Counter 2012'] > 0]
num_art = uni_part['count'].sum()
print('2012: ', num_art,' funded by hospitals, out of ' ,len(aff[aff['Counter 2012']> 0]), 'in total')
print(num_art/len(aff[aff['Counter 2012']> 0]))


uni_part = hospital[hospital['Counter 2021'] > 0]
num_art = uni_part['count'].sum()
print('2021: ', num_art,' funded by hospitals, out of ', len(aff[aff['Counter 2021']> 0]) ,'in total')
print(num_art/len(aff[aff['Counter 2021']> 0]))

2012:  17.0  funded by hospitals, out of  90 in total
0.18888888888888888
2021:  15.0  funded by hospitals, out of  178 in total
0.08426966292134831


In [23]:
government = aff[aff['Type']=='gov']
government

Unnamed: 0,Counter 2012,Counter 2021,Type,Country,Name,count
48,3.0,13.0,gov,China,"NSFC, National Natural Science Foundation of C...",16.0
204,7.0,7.0,gov,United States,NIH,14.0
207,2.0,3.0,gov,United States,NSF,5.0
45,0.0,5.0,gov,China,National Key Research and Development Program ...,5.0
82,1.0,3.0,gov,EU,European Commission,4.0
40,0.0,3.0,gov,China,Key-Area Research and Development Program of G...,3.0
58,0.0,3.0,gov,China,Shanghai Municipal Science and Technology Majo...,3.0
141,0.0,2.0,gov,Spain,Spanish Ministry of Science,2.0
171,0.0,2.0,gov,United Kingdom,UK gov,2.0
94,0.0,2.0,gov,Germany,"German Ministry of Science and Education, Bava...",2.0


In [24]:
#dividing into 2012 and 2021 to see the split of goverment funding for all articles funded
uni_part = government[government['Counter 2012'] > 0]
num_art = uni_part['count'].sum()
print('2012: ', num_art,' funded by governments, out of ' ,len(aff[aff['Counter 2012']> 0]), 'in total')
print(num_art/len(aff[aff['Counter 2012']> 0]))


uni_part = government[government['Counter 2021'] > 0]
num_art = uni_part['count'].sum()
print('2021: ', num_art,' funded by governments, out of ', len(aff[aff['Counter 2021']> 0]) ,'in total')
print(num_art/len(aff[aff['Counter 2021']> 0]))

2012:  45.0  funded by governments, out of  90 in total
0.5
2021:  81.0  funded by governments, out of  178 in total
0.4550561797752809


## Justification

In [41]:
def count_cat(df, year):
    df = df[df['year'] == year]
    dic = {}
    option_list= []
    for element in df.iloc[:, 1]:
        if "," in element:
            multiples = element.split(", ")
            for option in multiples:
                option_list.append(option)
        else:
            option_list.append(element)
    for option in option_list:
        if option not in dic:
            dic[option] = 1
        else:
            dic[option] = dic.get(option) + 1 
    return dic

In [42]:
cat2012 = count_cat(jus, 2012)
cat2021 = count_cat(jus, 2021)

In [43]:
for key in cat2012.keys():
    print(key)
    print(cat2012.get(key))
    print(cat2012.get(key)/38*100)

sci
25
65.78947368421053
dis
12
31.57894736842105
nov
9
23.684210526315788
hc
7
18.421052631578945


In [44]:
for key in cat2021.keys():
    print(key)
    print(cat2021.get(key))
    print(cat2021.get(key)/73*100)

sci
55
75.34246575342466
dis
26
35.61643835616438
nov
17
23.28767123287671
hc
22
30.136986301369863


## Disease type/task/body part
The findings in this section are written into a table in an appendix in my thesis

In [155]:
def count_column(df, column):
    dic = {}
    option_list= []
    for element in df.iloc[:, column]:
        if element != 0:
            option_list.append(element)
    for option in option_list:
        if option not in dic:
            dic[option] = 1
        else:
            dic[option] = dic.get(option) + 1 
    return dic

In [231]:
def option_per_zhou(df, year):
    df = df[df['year'] == year]
    zhou = count_column(df, 3)
    
    for key in zhou.keys():
        counter = zhou.get(key)
        cat = df[df['Zhou category'] == key]
        dis = count_column(cat, 1)
        task = count_column(cat, 4)
        res = dis | task #merging the two dictionaries
        zhou[key] = res
        zhou.get(key)['counter'] = counter
        
    return zhou


In [232]:
distribution2012 = option_per_zhou(dis, 2012)
distribution2021 = option_per_zhou(dis, 2021)

In [233]:
distribution2012

{'all': {'surgery': 1,
  'detect regional differences in images for different diseases': 1,
  'surgery robotic': 1,
  'counter': 3},
 'eyes': {'glaucoma ': 1,
  'location of foveola for different diseases': 1,
  'counter': 2},
 'chest': {'cancer': 2, 'tuberculosis': 1, 'counter': 3},
 'neuro': {'alzheimers': 3,
  'cancer': 1,
  'mild cognitive impairment': 1,
  'autism': 2,
  'multiple sclerosis': 1,
  "parkinson's disease": 2,
  'measuring nerve fibers': 1,
  'population based pattern recognition brain in particular': 1,
  'neonate brain mapping': 1,
  'different applications': 1,
  'counter': 14},
 'microscopy': {'cancer': 2,
  'metabolic response of cells': 1,
  'cell death event detection': 1,
  'cell feature analysis': 1,
  'cell detection': 1,
  'vesicle fusion/undocking': 1,
  'phase contrast microscopy': 1,
  'counter': 8},
 'abdomen': {'cancer': 2, 'verebral body fractures': 1, 'counter': 3},
 'cardiovascular': {'coronary artery disease': 2,
  'capturing morphological changes 

In [234]:
distribution2021

{'eyes': {'gender prediction': 1,
  'managing imbalanced datasets': 1,
  'fixing overfitting models': 1,
  'imbalanced datasets': 1,
  'unsupervised anomoly detection': 1,
  'counter': 5},
 'neuro': {'alzheimers': 4,
  'bipolar': 1,
  'schizophrenia': 1,
  'intercranial hemorrhage': 1,
  'cancer': 4,
  'autism': 1,
  "parkinson's disease": 1,
  'brain disease': 1,
  'neurodegenerative disorders': 1,
  'Subjective cognitive decline': 1,
  'mild cognitive impairment': 1,
  'major depression disorder': 1,
  'brain midline delineation': 1,
  'harmonization of medical data': 1,
  'data labelling': 1,
  'counter': 21},
 'abdomen': {'cancer': 10,
  'liver steatosis': 1,
  "crohn's disease": 1,
  'lumbar degeneration disease': 1,
  'ulcerative colitis': 1,
  'nonalcoholic fatty liver disease': 1,
  'adolescent idopathic scoliosis': 1,
  'gastrointestinal abnormalities': 1,
  'embryo development stage detection': 1,
  'evaluation of spinal bone lesions': 1,
  'anatomy site recognition': 1,
  'm