In [1]:
import pandas as pd
import numpy as np
import re
from functools import reduce
import seaborn as sns
import missingno as msno
%matplotlib inline

import matplotlib.pyplot as plt
from scipy.stats import norm


In [2]:
# Set the figure size - handy for larger output
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [6, 6]
# Set up with a higher resolution screen (useful on Mac)
%config InlineBackend.figure_format = 'retina'

# Import data

In [3]:
# Load the Web of Science export and keep only the columns used downstream.
# 'Source Title' is deliberately not loaded here (it was commented out of the
# selection).
# The path is a raw string so that the backslash in the Windows path can never
# be parsed as an escape sequence (the old literal relied on the invalid
# escape "\M", which newer Python versions warn about).
# NOTE(review): absolute, machine-specific path; other cells use '../...'.
wos_columns = [
    'UT (Unique WOS ID)', 'Funding Name Preferred', 'Open Access Designations', 'Publisher',
    'Times Cited, All Databases', '180 Day Usage Count', 'Since 2013 Usage Count', 'Publication Year',
    'Affiliations', 'Addresses', 'Reprint Addresses', 'Author Full Names',
]
data_prep = pd.read_excel(
    r'D:\MARBURG VIRUS DISEASES/data/WOSCC/woscc_bon_23_02_verif_on_wos.xls'
)[wos_columns]

# Binary indicators: 1 when the source column is filled in, 0 when it is missing.
# int64 is requested explicitly so the dtype does not depend on the platform.
data_prep['funding_yes'] = data_prep['Funding Name Preferred'].notna().astype('int64')
data_prep['open_access_yes'] = data_prep['Open Access Designations'].notna().astype('int64')

data_prep = data_prep.rename(columns={"UT (Unique WOS ID)": "wos_ID"})
# Lower-case every string cell (including wos_ID) so that joins and text
# comparisons are case-insensitive; non-string cells are left untouched.
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 (DataFrame.map).
data_prep = data_prep.applymap(lambda s: s.lower() if isinstance(s, str) else s)

data_prep.head(3)

Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,Reprint Addresses,Author Full Names,funding_yes,open_access_yes
0,wos:a1968a855300005,,,georg thieme verlag kg,12,0,1,1968.0,,,,"bechtelsheimer, h; jacob, h; solcher, h",0,0
1,wos:a1968a855300004,,,georg thieme verlag kg,42,0,3,1968.0,,,,"gedigk, p; bechtelsheimer, h; korb, g",0,0
2,wos:a1968a855300003,,,georg thieme verlag kg,23,0,1,1968.0,,,,"hennessen, w; bonin, o; mauler, r",0,0


In [4]:
# data_prep['number_of_institution']

In [5]:
# (rows, columns) of the base publication table; the merge cells left-join
# every other table onto these rows.
data_prep.shape

(932, 14)

In [6]:
# Per-publication author gender counts and shares.
# Three columns of the CSV are not needed here and are dropped; the ID column
# is renamed to wos_ID and lower-cased to match data_prep.
data_gender_dummies = (
    pd.read_csv('../TABLES/author_gender_list_count_dummies.csv')
    .drop(columns=['Publication Year', 'Funding Orgs', 'funding_yes'])
    .rename(columns={"UT (Unique WOS ID)": "wos_ID"})
    .assign(wos_ID=lambda frame: frame["wos_ID"].str.lower())
)
data_gender_dummies.head()

Unnamed: 0,wos_ID,female,male,unclassified,unknown,number_of_author,%_female,%_male,%_unclass_name,female_to_male_ratio
0,wos:a1968a855300005,0.0,0.0,3.0,0.0,3.0,0.0,0.0,100.0,
1,wos:a1968a855300004,0.0,0.0,3.0,0.0,3.0,0.0,0.0,100.0,
2,wos:a1968a855300003,0.0,0.0,3.0,0.0,3.0,0.0,0.0,100.0,
3,wos:a1968b963600036,1.0,0.0,1.0,0.0,2.0,50.0,0.0,50.0,inf
4,wos:a1968b963600037,1.0,0.0,2.0,0.0,3.0,33.333333,0.0,66.666667,inf


In [7]:
# Shape check: the row count should match data_prep if there is one row per publication.
data_gender_dummies.shape

(932, 10)

In [8]:
# Gender label of the first author of each publication.
# 'ga_gender' is renamed to 'first_author_gender'; the ID column becomes a
# lower-cased wos_ID so it can be joined with the other tables.
first_author_gender = (
    pd.read_csv('../DATA_PREPROCESSING/first_author_gender.csv')
    .drop(columns=['Publication Year', 'Funding Orgs', 'funding_yes'])
    .rename(columns={"UT (Unique WOS ID)": "wos_ID", 'ga_gender': 'first_author_gender'})
    .assign(wos_ID=lambda frame: frame["wos_ID"].str.lower())
)
first_author_gender.head()

Unnamed: 0,wos_ID,first_author_gender
0,wos:000904661800001,male
1,wos:000921279800001,male
2,wos:000909001200001,male
3,wos:000777045900001,male
4,wos:000849602200001,male


In [9]:
# Shape check: expect the ID column plus the gender label, one row per publication.
first_author_gender.shape

(932, 2)

In [10]:
# Gender label of the last author of each publication.
# 'gender' is renamed to 'last_author_gender'; the ID column becomes a
# lower-cased wos_ID so it can be joined with the other tables.
last_author_gender = (
    pd.read_csv('../DATA_PREPROCESSING/last_author_gender.csv')
    .drop(columns=['Publication Year', 'Funding Orgs', 'funding_yes', 'Author Full Names'])
    .rename(columns={"UT (Unique WOS ID)": "wos_ID", 'gender': 'last_author_gender'})
    .assign(wos_ID=lambda frame: frame["wos_ID"].str.lower())
)
last_author_gender.head()

Unnamed: 0,wos_ID,last_author_gender
0,wos:000904661800001,female
1,wos:000921279800001,male
2,wos:000909001200001,male
3,wos:000777045900001,male
4,wos:000849602200001,male


In [11]:
# Shape check: expect the ID column plus the gender label, one row per publication.
last_author_gender.shape

(932, 2)

In [12]:
# Pair the first- and last-author gender of each publication (inner join on wos_ID).
first_last_author_gender = first_author_gender.merge(last_author_gender, on='wos_ID')

# Combined label of the form "<first>_<last>", e.g. "male_female".
first_label = first_last_author_gender['first_author_gender']
last_label = first_last_author_gender['last_author_gender']
first_last_author_gender['first_last_author_gender'] = first_label + '_' + last_label

# One indicator column per combined label, indexed by wos_ID. Zeros are turned
# into NaN so that .count() on a column gives the number of publications in
# that category.
pair_labels = first_last_author_gender.set_index('wos_ID')['first_last_author_gender']
first_last_author_gender_dummy = pd.get_dummies(pair_labels).replace({0: np.nan})
first_last_author_gender_dummy.head(2)

Unnamed: 0_level_0,female_female,female_male,female_unclassified,female_unknown,male_female,male_male,male_unclassified,male_unknown,unclassified_female,unclassified_male,unclassified_unclassified,unknown_female,unknown_male,unknown_unknown
wos_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
wos:000904661800001,,,,,1.0,,,,,,,,,
wos:000921279800001,,,,,,1.0,,,,,,,,


In [13]:
# Non-null count per column = number of publications in each first/last-author
# gender pair (zeros were replaced by NaN when the dummies were built).
first_last_author_gender_dummy.count()

female_female                 73
female_male                  222
female_unclassified            6
female_unknown                 1
male_female                   64
male_male                    391
male_unclassified             16
male_unknown                   1
unclassified_female            1
unclassified_male             51
unclassified_unclassified    103
unknown_female                 1
unknown_male                   1
unknown_unknown                1
dtype: int64

In [14]:
# One-hot encode the last-author gender label; columns are named
# 'last_author_<label>' and wos_ID is restored as a regular column.
last_author_labels = last_author_gender.set_index('wos_ID')['last_author_gender']
last_author_gender_dummy = pd.get_dummies(last_author_labels, prefix='last_author').reset_index()
last_author_gender_dummy.head()

Unnamed: 0,wos_ID,last_author_female,last_author_male,last_author_unclassified,last_author_unknown
0,wos:000904661800001,1,0,0,0
1,wos:000921279800001,0,1,0,0
2,wos:000909001200001,0,1,0,0
3,wos:000777045900001,0,1,0,0
4,wos:000849602200001,0,1,0,0


In [15]:
# One-hot encode the first-author gender label; columns are named
# 'first_author_<label>' and wos_ID is restored as a regular column.
first_author_labels = first_author_gender.set_index('wos_ID')['first_author_gender']
first_author_gender_dummy = pd.get_dummies(first_author_labels, prefix='first_author').reset_index()
first_author_gender_dummy.head()

Unnamed: 0,wos_ID,first_author_female,first_author_male,first_author_unclassified,first_author_unknown
0,wos:000904661800001,0,1,0,0
1,wos:000921279800001,0,1,0,0
2,wos:000909001200001,0,1,0,0
3,wos:000777045900001,0,1,0,0
4,wos:000849602200001,0,1,0,0


In [16]:
# Topic number assigned to each publication.
# Only the 'Topic N°' column is kept; reset_index() moves the pickle's index
# (displayed as wos_ID) into a regular column so it can be used as a join key.
# The path is a raw string: the old literal relied on the invalid escape
# sequences "\M" and "\D", which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path; read_pickle should only be
# used on pickles produced by this project.
data_topics = pd.read_pickle(
    r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/data_prep_lemmatized_topics.pkl'
)[['Topic N°']].reset_index()
data_topics.head(2)

Unnamed: 0,wos_ID,Topic N°
0,wos:000904661800001,3
1,wos:000921279800001,5


In [17]:
# Shape check: expect wos_ID plus the topic number, one row per publication.
data_topics.shape

(932, 2)

In [18]:
# Journal-level indicators per publication: source title, IF, JCI and
# percentageOAGold. The ID column is renamed to wos_ID and lower-cased so it
# matches the join key of the other tables.
journal_columns = ['UT (Unique WOS ID)', 'Source Title', 'IF', 'JCI', 'percentageOAGold']
data_journal_old = (
    pd.read_csv("../DATA_PREPROCESSING_old/journal_scope_and_impact_factors.csv")[journal_columns]
    .rename(columns={'UT (Unique WOS ID)': 'wos_ID'})
)
data_journal_old['wos_ID'] = data_journal_old['wos_ID'].str.lower()
data_journal_old.head()

  data_journal_old = pd.read_csv(f"../DATA_PREPROCESSING_old/journal_scope_and_impact_factors.csv")[['UT (Unique WOS ID)','Source Title','IF', 'JCI','percentageOAGold']];


Unnamed: 0,wos_ID,Source Title,IF,JCI,percentageOAGold
0,wos:000606843700004,clinical microbiology and infection,13.31,1.71,21.41
1,wos:000244234800016,expert review of vaccines,5.683,0.57,26.9
2,wos:000273203100006,new microbiologica,1.383,0.35,0.0
3,wos:000279358200008,clinics in laboratory medicine,2.172,0.7,0.72
4,wos:000296496500011,future virology,3.015,0.23,6.28


In [19]:
# NOTE(review): the recorded output shows more rows here than in data_prep
# (1533 vs 932). Confirm wos_ID is unique in this table - duplicated IDs would
# multiply rows in the left merges.
data_journal_old.shape

(1533, 5)

In [20]:
# open access

In [21]:
# Open-access information per publication: DOI, is_oa flag and oa_status.
# The ID column is renamed to wos_ID and lower-cased to match the other tables.
upw_columns = ['UT (Unique WOS ID)', 'DOI', 'is_oa', 'oa_status']
marburg_data_upw = (
    pd.read_csv("../DATA_PREPROCESSING/marburg_data_upw.csv")[upw_columns]
    .rename(columns={'UT (Unique WOS ID)': 'wos_ID'})
)
marburg_data_upw['wos_ID'] = marburg_data_upw['wos_ID'].str.lower()
marburg_data_upw.head()

Unnamed: 0,wos_ID,DOI,is_oa,oa_status
0,wos:a1968a855300005,10.1055/s-0028-1105102,False,closed
1,wos:a1968a855300004,10.1055/s-0028-1105101,False,closed
2,wos:a1968a855300003,10.1055/s-0028-1105100,False,closed
3,wos:a1968b963600036,,,
4,wos:a1968b963600037,,,


In [22]:
# Shape check: the row count should match data_prep if there is one row per publication.
marburg_data_upw.shape

(932, 4)

In [23]:
# Non-null counts per column: how many records have a DOI and an
# open-access status filled in.
marburg_data_upw.count()

wos_ID       932
DOI          875
is_oa        683
oa_status    683
dtype: int64

In [24]:
# number of countries

In [25]:
# Number of countries per publication.
# The path is a raw string: the old literal relied on the invalid escape
# sequences "\M" and "\D", which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path; read_pickle should only be
# used on pickles produced by this project.
country_data = pd.read_pickle(
    r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/countries_split.pkl'
).drop('Addresses', axis=1)

# Count the non-null cells in each row.
# NOTE(review): presumably every remaining column holds one country of the
# publication - verify against countries_split.pkl.
country_data['number_of_country'] = country_data.notnull().sum(axis=1)

# Keep only the count, expose the index as a column and normalise the join key.
number_country = (
    country_data[['number_of_country']]
    .reset_index()
    .rename(columns={'UT (Unique WOS ID)': 'wos_ID'})
)
number_country['wos_ID'] = number_country['wos_ID'].str.lower()
number_country.head()

Unnamed: 0,wos_ID,number_of_country
0,wos:000904661800001,1
1,wos:000921279800001,2
2,wos:000909001200001,1
3,wos:000777045900001,6
4,wos:000849602200001,4


In [26]:
# Shape check: expect wos_ID plus the country count, one row per publication.
number_country.shape

(932, 2)

In [27]:
# number of institutions
data_prep['number_of_institution'] =  pd.DataFrame(data_prep['Addresses'].str.split(';', expand=True)).count(axis='columns')
number_institution = data_prep[['wos_ID','number_of_institution']]
# number_institution = number_institution.reset_index().rename(columns = {'UT (Unique WOS ID)':'wos_ID'})
number_institution['wos_ID'] = number_institution['wos_ID'].str.lower()
number_institution.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  number_institution['wos_ID'] = number_institution['wos_ID'].str.lower()


Unnamed: 0,wos_ID,number_of_institution
0,wos:a1968a855300005,0
1,wos:a1968a855300004,0
2,wos:a1968a855300003,0
3,wos:a1968b963600036,0
4,wos:a1968b963600037,0


In [28]:
# Shape check: one row per data_prep record (wos_ID plus the institution count).
number_institution.shape

(932, 2)

In [29]:
# Country, region and World Bank income-group indicators for all authors of
# each publication. Columns that data_prep already provides (or that are not
# needed) are dropped so the later merge on wos_ID does not create duplicates.
# The path is a raw string: the old literal relied on the invalid escape
# sequences "\M" and "\D", which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path; read_pickle should only be
# used on pickles produced by this project.
data_countries_all = pd.read_pickle(
    r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/data_concat_countries_and_wb_groups_income.pkl'
).drop(
    ['Addresses', 'funding_yes', 'open_access_yes', 'Funding Orgs', 'Open Access Designations',
     'Times Cited, All Databases', '180 Day Usage Count', 'Since 2013 Usage Count', 'Publication Year'],
    axis=1,
)
data_countries_all.head(2)

Unnamed: 0,wos_ID,Afghanistan,United States,Angola,Argentina,Australia,Austria,Bangladesh,Belgium,Belize,...,wb_low_income_economies_africa,wb_low_income_economies_others,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:000904661800001,,,,,,,,,,...,,,,,,1.0,,1.0,,
1,wos:000921279800001,,2.0,,,,,,,,...,,,,,,,,,2.0,2.0


In [30]:
# Shape check: the row count should match data_prep if there is one row per publication.
data_countries_all.shape

(932, 113)

In [31]:
# Column labels from position 60 onward: the remaining countries followed by
# the region and income-group indicator columns.
data_countries_all.columns[60:]

Index(['Poland', 'Portugal', 'Congo Republic', 'Russia', 'Saudi Arabia',
       'Senegal', 'Serbia', 'Sierra Leone', 'Singapore', 'Slovakia',
       'South Africa', 'South Korea', 'Spain', 'Sweden', 'Switzerland',
       'Tanzania', 'Thailand', 'Turkey', 'United Arab Emirates', 'Uganda',
       'Ukraine', 'Venezuela', 'Vietnam', 'Zambia', 'Zimbabwe', 'Sudan',
       'year_group', 'north_africa', 'eastern_africa', 'Southern_africa',
       'Western_africa', 'Central_africa', 'non_african_countries',
       'wb_east_asia_and_pacific', 'wb_europe_and_central_asia',
       'wb_latin_america_and_the_caribbean', 'wb_middle_east_and_north_africa',
       'wb_middle_east_and_north_africa_africa',
       'wb_middle_east_and_north_africa_others', 'wb_north_america',
       'wb_south_asia', 'wb_sub_saharan_africa', 'wb_low_income_economies',
       'wb_low_income_economies_africa', 'wb_low_income_economies_others',
       'wb_lower_middle_income_economies',
       'wb_lower_middle_income_economie

In [32]:
# Country, region and income-group indicators for the first author only.
# The path is a raw string: the old literal relied on the invalid escape
# sequences "\M" and "\D", which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path.
first_authors_countries = pd.read_csv(
    r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/data_concat_countries_and_wb_groups_income_first_authors.csv'
).drop(['Unnamed: 0', 'Author Full Names', 'Addresses'], axis=1)
# Normalise the join key so it matches the other tables.
first_authors_countries = first_authors_countries.rename(columns={'UT (Unique WOS ID)': 'wos_ID'})
first_authors_countries['wos_ID'] = first_authors_countries['wos_ID'].str.lower()
first_authors_countries.head(2)

Unnamed: 0,wos_ID,first_author,Argentina,Australia,Bangladesh,Belgium,Belize,United States,Canada,Colombia,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:a1973q658600007,"MARTINI, GA",,,,,,,,,...,,,,,,,,,,
1,wos:a1974t169200002,"MONATH, TP",,,,,,,,,...,,,,,,,,,,


In [33]:
# NOTE(review): the recorded output shows fewer rows here than in data_prep
# (876 vs 932); publications missing from this table get NaN in all of its
# columns after the left merge.
first_authors_countries.shape

(876, 71)

In [34]:
# Country, region and income-group indicators for the last author only.
# The path is a raw string: the old literal relied on the invalid escape
# sequences "\M" and "\D", which newer Python versions warn about.
# NOTE(review): absolute, machine-specific path.
last_authors_countries = pd.read_csv(
    r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/data_concat_countries_and_wb_groups_income_last_authors.csv'
).drop(['Unnamed: 0', 'Author Full Names', 'Addresses'], axis=1)
# Normalise the join key so it matches the other tables.
last_authors_countries = last_authors_countries.rename(columns={'UT (Unique WOS ID)': 'wos_ID'})
last_authors_countries['wos_ID'] = last_authors_countries['wos_ID'].str.lower()
last_authors_countries.head(2)

Unnamed: 0,wos_ID,last_author,Australia,Bangladesh,Belgium,United States,Canada,Colombia,Czech Republic,Denmark,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:a1973q658600007,"MARTINI, GA",,,,,,,,,...,,,,,,,,,,
1,wos:a1974t169200002,"MONATH, TP",,,,,,,,,...,,,,,,,,,,


In [35]:
# NOTE(review): the recorded output shows fewer rows here than in data_prep
# (876 vs 932); publications missing from this table get NaN in all of its
# columns after the left merge.
last_authors_countries.shape

(876, 75)

In [36]:
# Column inventory of the base table before merging; it now also contains
# 'number_of_institution', added in the institutions cell.
data_prep.columns

Index(['wos_ID', 'Funding Name Preferred', 'Open Access Designations',
       'Publisher', 'Times Cited, All Databases', '180 Day Usage Count',
       'Since 2013 Usage Count', 'Publication Year', 'Affiliations',
       'Addresses', 'Reprint Addresses', 'Author Full Names', 'funding_yes',
       'open_access_yes', 'number_of_institution'],
      dtype='object')

In [37]:
# no countries
# Left-join every indicator table onto data_prep by wos_ID, in list order, so
# that every data_prep row is kept. first_last_author_gender_dummy carries
# wos_ID as its index, which merge(on=...) accepts as a key as well.
dfs = [
    data_prep,
    data_gender_dummies,
    first_author_gender_dummy,
    last_author_gender_dummy,
    first_last_author_gender_dummy,
    data_topics,
    data_journal_old,
    number_country,
    marburg_data_upw,
]

marburg_virus_data_clean_concat_and_indicator = dfs[0]
for indicator_table in dfs[1:]:
    marburg_virus_data_clean_concat_and_indicator = pd.merge(
        marburg_virus_data_clean_concat_and_indicator, indicator_table, on=['wos_ID'], how='left'
    )

# save data
marburg_virus_data_clean_concat_and_indicator.to_csv(
    '../DATA_PREPROCESSING/marburg_virus_data_clean_concat_and_indicator.csv'
)
marburg_virus_data_clean_concat_and_indicator.head(2)

Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,unknown_unknown,Topic N°,Source Title,IF,JCI,percentageOAGold,number_of_country,DOI,is_oa,oa_status
0,wos:a1968a855300005,,,georg thieme verlag kg,12,0,1,1968.0,,,...,,8,deutsche medizinische wochenschrift,0.653,0.14,9.74,0,10.1055/s-0028-1105102,False,closed
1,wos:a1968a855300004,,,georg thieme verlag kg,42,0,3,1968.0,,,...,,0,deutsche medizinische wochenschrift,0.653,0.14,9.74,0,10.1055/s-0028-1105101,False,closed


In [38]:
# all countries
# Same left-join chain on wos_ID as the no-country table, with the all-author
# country/region/income-group indicators appended last.
dfs = [
    data_prep,
    data_gender_dummies,
    first_author_gender_dummy,
    last_author_gender_dummy,
    first_last_author_gender_dummy,
    data_topics,
    data_journal_old,
    number_country,
    marburg_data_upw,
    data_countries_all,
]

marburg_virus_data_clean_concat_and_indicator_all_countries = dfs[0]
for indicator_table in dfs[1:]:
    marburg_virus_data_clean_concat_and_indicator_all_countries = pd.merge(
        marburg_virus_data_clean_concat_and_indicator_all_countries, indicator_table, on=['wos_ID'], how='left'
    )

# save data
marburg_virus_data_clean_concat_and_indicator_all_countries.to_csv(
    '../DATA_PREPROCESSING/marburg_virus_data_clean_concat_and_indicator_all_countries.csv'
)
marburg_virus_data_clean_concat_and_indicator_all_countries.head(2)

Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies_africa,wb_low_income_economies_others,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:a1968a855300005,,,georg thieme verlag kg,12,0,1,1968.0,,,...,,,,,,,,,,
1,wos:a1968a855300004,,,georg thieme verlag kg,42,0,3,1968.0,,,...,,,,,,,,,,


In [39]:
# first author countries
# Left-join all indicator tables onto data_prep by wos_ID, with the
# first-author country/region/income-group indicators appended last.
dfs = [
    data_prep,
    data_gender_dummies,
    first_author_gender_dummy,
    last_author_gender_dummy,
    first_last_author_gender_dummy,
    data_topics,
    data_journal_old,
    number_country,
    marburg_data_upw,
    first_authors_countries,
]

marburg_virus_data_clean_concat_and_indicator_first_author_countries = dfs[0]
for indicator_table in dfs[1:]:
    marburg_virus_data_clean_concat_and_indicator_first_author_countries = pd.merge(
        marburg_virus_data_clean_concat_and_indicator_first_author_countries, indicator_table,
        on=['wos_ID'], how='left'
    )

# save data
marburg_virus_data_clean_concat_and_indicator_first_author_countries.to_csv(
    '../DATA_PREPROCESSING/marburg_virus_data_clean_concat_and_indicator_first_author_countries.csv'
)
marburg_virus_data_clean_concat_and_indicator_first_author_countries.head(2)

Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:a1968a855300005,,,georg thieme verlag kg,12,0,1,1968.0,,,...,,,,,,,,,,
1,wos:a1968a855300004,,,georg thieme verlag kg,42,0,3,1968.0,,,...,,,,,,,,,,


In [40]:
# last author countries
# Left-join all indicator tables onto data_prep by wos_ID, with the
# last-author country/region/income-group indicators appended last.
dfs = [
    data_prep,
    data_gender_dummies,
    first_author_gender_dummy,
    last_author_gender_dummy,
    first_last_author_gender_dummy,
    data_topics,
    data_journal_old,
    number_country,
    marburg_data_upw,
    last_authors_countries,
]

marburg_virus_data_clean_concat_and_indicator_last_author_countries = dfs[0]
for indicator_table in dfs[1:]:
    marburg_virus_data_clean_concat_and_indicator_last_author_countries = pd.merge(
        marburg_virus_data_clean_concat_and_indicator_last_author_countries, indicator_table,
        on=['wos_ID'], how='left'
    )

# save data
marburg_virus_data_clean_concat_and_indicator_last_author_countries.to_csv(
    '../DATA_PREPROCESSING/marburg_virus_data_clean_concat_and_indicator_last_author_countries.csv'
)
marburg_virus_data_clean_concat_and_indicator_last_author_countries.head(2)

Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
0,wos:a1968a855300005,,,georg thieme verlag kg,12,0,1,1968.0,,,...,,,,,,,,,,
1,wos:a1968a855300004,,,georg thieme verlag kg,42,0,3,1968.0,,,...,,,,,,,,,,


In [41]:
# First 60 column labels of the merged table without country indicators.
marburg_virus_data_clean_concat_and_indicator.columns[:60]

Index(['wos_ID', 'Funding Name Preferred', 'Open Access Designations',
       'Publisher', 'Times Cited, All Databases', '180 Day Usage Count',
       'Since 2013 Usage Count', 'Publication Year', 'Affiliations',
       'Addresses', 'Reprint Addresses', 'Author Full Names', 'funding_yes',
       'open_access_yes', 'number_of_institution', 'female', 'male',
       'unclassified', 'unknown', 'number_of_author', '%_female', '%_male',
       '%_unclass_name', 'female_to_male_ratio', 'first_author_female',
       'first_author_male', 'first_author_unclassified',
       'first_author_unknown', 'last_author_female', 'last_author_male',
       'last_author_unclassified', 'last_author_unknown', 'female_female',
       'female_male', 'female_unclassified', 'female_unknown', 'male_female',
       'male_male', 'male_unclassified', 'male_unknown', 'unclassified_female',
       'unclassified_male', 'unclassified_unclassified', 'unknown_female',
       'unknown_male', 'unknown_unknown', 'Topic N°', '

In [42]:
# Same slice as [:60]: the trailing ':' only leaves the step at its default.
marburg_virus_data_clean_concat_and_indicator.columns[:60:]

Index(['wos_ID', 'Funding Name Preferred', 'Open Access Designations',
       'Publisher', 'Times Cited, All Databases', '180 Day Usage Count',
       'Since 2013 Usage Count', 'Publication Year', 'Affiliations',
       'Addresses', 'Reprint Addresses', 'Author Full Names', 'funding_yes',
       'open_access_yes', 'number_of_institution', 'female', 'male',
       'unclassified', 'unknown', 'number_of_author', '%_female', '%_male',
       '%_unclass_name', 'female_to_male_ratio', 'first_author_female',
       'first_author_male', 'first_author_unclassified',
       'first_author_unknown', 'last_author_female', 'last_author_male',
       'last_author_unclassified', 'last_author_unknown', 'female_female',
       'female_male', 'female_unclassified', 'female_unknown', 'male_female',
       'male_male', 'male_unclassified', 'male_unknown', 'unclassified_female',
       'unclassified_male', 'unclassified_unclassified', 'unknown_female',
       'unknown_male', 'unknown_unknown', 'Topic N°', '

In [43]:
# Column labels from position 60 onward; this is empty whenever the table has
# 60 columns or fewer.
marburg_virus_data_clean_concat_and_indicator.columns[60:]

Index([], dtype='object')

In [44]:
# Keep publications from 2007 onward (Publication Year > 2006, so 2007 itself
# is included). Rows with a missing year fail the comparison and are dropped.
published_since_2007 = marburg_virus_data_clean_concat_and_indicator['Publication Year'].gt(2006)
marburg_virus_data_clean_concat_and_indicator_2007 = marburg_virus_data_clean_concat_and_indicator.loc[published_since_2007]

# save data (CSV and Excel copies of the same table)
# NOTE(review): the '.xls' export triggers a warning in the recorded output;
# consider '.xlsx' if downstream readers allow it.
marburg_virus_data_clean_concat_and_indicator_2007.to_csv(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_2007.csv'
)
marburg_virus_data_clean_concat_and_indicator_2007.to_excel(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_2007.xls'
)
marburg_virus_data_clean_concat_and_indicator_2007.head(2)


  marburg_virus_data_clean_concat_and_indicator_2007.to_excel(f'../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_2007.xls')


Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,unknown_unknown,Topic N°,Source Title,IF,JCI,percentageOAGold,number_of_country,DOI,is_oa,oa_status
203,wos:000243562000002,,,w b saunders co ltd,10,1,11,2007.0,university of london; london school of hygiene...,"univ london london sch hyg & trop med, dept in...",...,,8,journal of infection,38.637,3.04,19.95,1,10.1016/j.jinf.2006.01.022,False,closed
204,wos:000259305000001,,,springer india,2,1,15,2007.0,sher-e-kashmir university of agricultural scie...,"[taku, anil kumar; bhat, mohd altaf; dutta, ta...",...,,8,indian journal of virology,,,,1,,,


In [45]:
# (rows, columns) of the 2007-onward subset without country indicators.
marburg_virus_data_clean_concat_and_indicator_2007.shape

(714, 55)

In [46]:
# all countries
# Keep publications from 2007 onward (Publication Year > 2006, so 2007 itself
# is included). Rows with a missing year fail the comparison and are dropped.
published_since_2007 = marburg_virus_data_clean_concat_and_indicator_all_countries['Publication Year'].gt(2006)
marburg_virus_data_clean_concat_and_indicator_all_countries_2007 = (
    marburg_virus_data_clean_concat_and_indicator_all_countries.loc[published_since_2007]
)

# save data (CSV and Excel copies of the same table)
# NOTE(review): the '.xls' export triggers a warning in the recorded output;
# consider '.xlsx' if downstream readers allow it.
marburg_virus_data_clean_concat_and_indicator_all_countries_2007.to_csv(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_all_countries_2007.csv'
)
marburg_virus_data_clean_concat_and_indicator_all_countries_2007.to_excel(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_all_countries_2007.xls'
)
marburg_virus_data_clean_concat_and_indicator_all_countries_2007.head(2)


  marburg_virus_data_clean_concat_and_indicator_all_countries_2007.to_excel(f'../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_all_countries_2007.xls')


Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies_africa,wb_low_income_economies_others,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
203,wos:000243562000002,,,w b saunders co ltd,10,1,11,2007.0,university of london; london school of hygiene...,"univ london london sch hyg & trop med, dept in...",...,,,,,,,,,1.0,1.0
204,wos:000259305000001,,,springer india,2,1,15,2007.0,sher-e-kashmir university of agricultural scie...,"[taku, anil kumar; bhat, mohd altaf; dutta, ta...",...,,,1.0,,1.0,,,,,


In [47]:
# (rows, columns) of the 2007-onward subset with all-author country indicators.
marburg_virus_data_clean_concat_and_indicator_all_countries_2007.shape

(714, 167)

In [48]:
# first authors countries
# Keep publications from 2007 onward (Publication Year > 2006, so 2007 itself
# is included). Rows with a missing year fail the comparison and are dropped.
published_since_2007 = marburg_virus_data_clean_concat_and_indicator_first_author_countries['Publication Year'].gt(2006)
marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007 = (
    marburg_virus_data_clean_concat_and_indicator_first_author_countries.loc[published_since_2007]
)

# save data (CSV and Excel copies of the same table)
# NOTE(review): the '.xls' export triggers a warning in the recorded output;
# consider '.xlsx' if downstream readers allow it.
marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.to_csv(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.csv'
)
marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.to_excel(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.xls'
)
marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.head(2)


  marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.to_excel(f'../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.xls')


Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
203,wos:000243562000002,,,w b saunders co ltd,10,1,11,2007.0,university of london; london school of hygiene...,"univ london london sch hyg & trop med, dept in...",...,,,,,,,,,,
204,wos:000259305000001,,,springer india,2,1,15,2007.0,sher-e-kashmir university of agricultural scie...,"[taku, anil kumar; bhat, mohd altaf; dutta, ta...",...,,,1.0,,1.0,,,,,


In [49]:
# (rows, columns) of the 2007-onward subset with first-author country indicators.
marburg_virus_data_clean_concat_and_indicator_first_author_countries_2007.shape

(714, 125)

In [50]:
# last authors countries
# Keep publications from 2007 onward (Publication Year > 2006, so 2007 itself
# is included). Rows with a missing year fail the comparison and are dropped.
published_since_2007 = marburg_virus_data_clean_concat_and_indicator_last_author_countries['Publication Year'].gt(2006)
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007 = (
    marburg_virus_data_clean_concat_and_indicator_last_author_countries.loc[published_since_2007]
)

# save data (CSV and Excel copies of the same table)
# NOTE(review): the '.xls' export triggers a warning in the recorded output;
# consider '.xlsx' if downstream readers allow it.
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.to_csv(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.csv'
)
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.to_excel(
    '../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.xls'
)
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.head(2)


  marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.to_excel(f'../TABLES_gender/marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.xls')


Unnamed: 0,wos_ID,Funding Name Preferred,Open Access Designations,Publisher,"Times Cited, All Databases",180 Day Usage Count,Since 2013 Usage Count,Publication Year,Affiliations,Addresses,...,wb_low_income_economies,wb_low_income_economies_africa,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others
203,wos:000243562000002,,,w b saunders co ltd,10,1,11,2007.0,university of london; london school of hygiene...,"univ london london sch hyg & trop med, dept in...",...,,,,,,,,,,
204,wos:000259305000001,,,springer india,2,1,15,2007.0,sher-e-kashmir university of agricultural scie...,"[taku, anil kumar; bhat, mohd altaf; dutta, ta...",...,,,1.0,,1.0,,,,,


In [51]:
# (rows, columns) of the 2007-onward subset with last-author country indicators.
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007.shape

(714, 129)

In [52]:
# Number of 2007-onward publications in each first/last-author gender pair.
# The indicator columns hold NaN where the pair does not apply, so the
# non-null count of a column is the size of that category.
gender_pair_columns = [
    'female_female', 'female_male', 'female_unclassified', 'female_unknown',
    'male_female', 'male_male', 'male_unclassified', 'male_unknown',
    'unclassified_female', 'unclassified_male', 'unclassified_unclassified',
    'unknown_female', 'unknown_male', 'unknown_unknown',
]
marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007[gender_pair_columns].count()

female_female                 72
female_male                  212
female_unclassified            2
female_unknown                 1
male_female                   64
male_male                    339
male_unclassified              2
male_unknown                   1
unclassified_female            1
unclassified_male             10
unclassified_unclassified      8
unknown_female                 1
unknown_male                   1
unknown_unknown                0
dtype: int64

In [53]:
# marburg_virus_data_clean_concat_and_indicator_last_author_countries_2007['number_of_institution']