In [1]:
import pandas as pd
import numpy as np
import re
from functools import reduce
import seaborn as sns
import missingno as msno
%matplotlib inline

import matplotlib.pyplot as plt
from scipy.stats import norm


In [2]:
# Set the figure size - handy for larger output
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [10, 6]
# Set up with a higher resolution screen (useful on Mac)
%config InlineBackend.figure_format = 'retina'

## Import data

In [3]:
# Raw string literal: the Windows path contains "\M" and "\D", which are invalid
# escape sequences in a normal string (SyntaxWarning on Python 3.12+). The path
# value itself is unchanged.
# NOTE(review): this is an absolute, machine-specific location - consider a
# configurable data directory. Unpickling can execute arbitrary code, so only
# load pickle files that come from a trusted source.
DATA_PICKLE_PATH = r'D:\MARBURG VIRUS DISEASES\DATA_PREPROCESSING/data_concat_countries_and_wb_groups_income.pkl'

# The 'Addresses' column is deliberately kept: its non-null count is used as the
# denominator when the per-column percentages are computed.
data = pd.read_pickle(DATA_PICKLE_PATH)

# Preview the first two records
data.head(2)

Unnamed: 0,wos_ID,Addresses,Afghanistan,United States,Angola,Argentina,Australia,Austria,Bangladesh,Belgium,...,wb_lower_middle_income_economies,wb_lower_middle_income_economies_africa,wb_lower_middle_income_economies_others,wb_upper_middle_income_economies,wb_upper_middle_income_economies_africa,wb_upper_middle_income_economies_others,wb_high_income_economies,wb_high_income_economies_others,funding_yes,open_access_yes
0,wos:000904661800001,"[Bi, Jinhao; Xia, Xianzhu] Jilin Agr Univ, Col...",,,,,,,,,...,,,,1.0,,1.0,,,0,1
1,wos:000921279800001,"[Ye, Xin; Holland, Richard; Wood, Mark; Pasetk...",,2.0,,,,,,,...,,,,,,,2.0,2.0,1,1


In [4]:
# Non-null count per column, first ten columns only
data.count(axis="index").iloc[:10]

wos_ID           932
Addresses        876
Afghanistan        1
United States    586
Angola             2
Argentina          1
Australia         18
Austria            4
Bangladesh         5
Belgium           24
dtype: int64

# Geographic contribution to Marburg virus disease publications

## Proportion of publications by country

In [5]:
# Non-null count of every column of `data`, as a one-column frame (column label 0),
# ordered from the largest count to the smallest.
countries_list = data.count().to_frame().sort_values(by=0, ascending=False)

# Express each count as a percentage of the number of records that have an
# 'Addresses' value (rows with a larger count therefore exceed 100 %).
n_with_address = countries_list.loc['Addresses', 0]
countries_list['%'] = countries_list[0] / n_with_address * 100

# Persist the table
countries_list.to_csv('../TABLES/countries_list.csv')

# Show the 60 largest entries
countries_list.iloc[:60]

Unnamed: 0,0,%
wos_ID,932,106.392694
funding_yes,932,106.392694
open_access_yes,932,106.392694
year_group,927,105.821918
Publication Year,927,105.821918
Since 2013 Usage Count,895,102.16895
Addresses,876,100.0
"Times Cited, All Databases",871,99.429224
non_african_countries,860,98.173516
wb_high_income_economies_others,820,93.607306


## Proportion of publications by country and funding


In [6]:
# Non-null counts of every column, split by the `funding_yes` flag. After the
# transpose each row is a source column and each column is a funding group.
counts_by_funding = data.reset_index().groupby(['funding_yes']).count()
countries_fund_list = counts_by_funding.T.rename(columns={0: 'no_fund', 1: 'fund'})

# Combined count over both funding groups
countries_fund_list['total'] = countries_fund_list['no_fund'] + countries_fund_list['fund']

# Share of the row total for every count column present at this point.
# The column labels are captured up front because the loop adds new columns.
count_columns = list(countries_fund_list.columns)
row_total = countries_fund_list['total']
for name in count_columns:
    countries_fund_list['%_' + name] = countries_fund_list[name] / row_total * 100

# Percentage-point gap between the funded and the unfunded share
countries_fund_list['%_diff(%_fund-%_no_fund)'] = countries_fund_list['%_fund'].sub(
    countries_fund_list['%_no_fund']
)

# Persist the table
countries_fund_list.to_csv('../TABLES/countries_fund_list.csv')

countries_fund_list

funding_yes,no_fund,fund,total,%_no_fund,%_fund,%_total,%_diff(%_fund-%_no_fund)
index,355,577,932,38.090129,61.909871,100.0,23.819742
wos_ID,355,577,932,38.090129,61.909871,100.0,23.819742
Addresses,299,577,876,34.132420,65.867580,100.0,31.735160
Afghanistan,1,0,1,100.000000,0.000000,100.0,-100.000000
United States,152,434,586,25.938567,74.061433,100.0,48.122867
...,...,...,...,...,...,...,...
wb_upper_middle_income_economies_africa,19,19,38,50.000000,50.000000,100.0,0.000000
wb_upper_middle_income_economies_others,21,66,87,24.137931,75.862069,100.0,51.724138
wb_high_income_economies,274,546,820,33.414634,66.585366,100.0,33.170732
wb_high_income_economies_others,274,546,820,33.414634,66.585366,100.0,33.170732


In [7]:
# Country names used as row labels when selecting the per-country entries
# of the count tables.
countries_list = [
    "Afghanistan", "United States", "Angola", "Argentina", "Australia",
    "Austria", "Bangladesh", "Belgium", "Belize", "Brazil",
    "Bulgaria", "Burkina Faso", "Canada", "Central African Republic", "Colombia",
    "Costa Rica", "Cote d'Ivoire", "Croatia", "Czech Republic", "DR Congo",
    "Denmark", "Ecuador", "Egypt", "United Kingdom", "Germany",
    "Finland", "France", "Gabon", "Ghana", "Greece",
    "Guinea", "Honduras", "Hungary", "India", "Indonesia",
    "Iran", "Iraq", "Israel", "Italy", "Japan",
    "Jordan", "Kazakhstan", "Kenya", "Lebanon", "Malawi",
    "Malaysia", "Mali", "Morocco", "Nepal", "Netherlands",
    "New Zealand", "Nigeria", "Norway", "Pakistan", "Palestine",
    "Panama", "China", "Peru", "Philippines", "Poland",
    "Portugal", "Congo Republic", "Russia", "Saudi Arabia", "Senegal",
    "Serbia", "Sierra Leone", "Singapore", "Slovakia", "South Africa",
    "South Korea", "Spain", "Sweden", "Switzerland", "Tanzania",
    "Thailand", "Turkey", "United Arab Emirates", "Uganda", "Ukraine",
    "Venezuela", "Vietnam", "Zambia", "Zimbabwe", "Sudan",
]

In [8]:
# World Bank region labels, including the Africa / others split of the
# Middle East and North Africa group.
region_wb = [
    "wb_east_asia_and_pacific",
    "wb_europe_and_central_asia",
    "wb_latin_america_and_the_caribbean",
    "wb_middle_east_and_north_africa",
    "wb_middle_east_and_north_africa_africa",
    "wb_middle_east_and_north_africa_others",
    "wb_north_america",
    "wb_south_asia",
    "wb_sub_saharan_africa",
]

In [9]:
# World Bank income-group labels (top-level groups only, without the
# "_africa" / "_others" sub-splits).
income_wb = [
    "wb_low_income_economies",
    "wb_lower_middle_income_economies",
    "wb_upper_middle_income_economies",
    "wb_high_income_economies",
]

In [10]:
# Summary statistics restricted to the rows whose label is a country name.
# Filtering on the index directly gives the same numeric summary as going
# through reset_index(), since describe() only summarises numeric columns.
is_country_row = countries_fund_list.index.isin(countries_list)
countries_fund_list.loc[is_country_row].describe()

funding_yes,no_fund,fund,total,%_no_fund,%_fund,%_total,%_diff(%_fund-%_no_fund)
count,85.0,85.0,85.0,85.0,85.0,85.0,85.0
mean,5.623529,11.282353,16.905882,41.434877,58.565123,100.0,17.130246
std,18.103523,48.243581,65.799049,38.086915,38.086915,0.0,76.17383
min,0.0,0.0,1.0,0.0,0.0,100.0,-100.0
25%,0.0,1.0,1.0,0.0,33.333333,100.0,-33.333333
50%,1.0,1.0,3.0,41.610738,58.389262,100.0,16.778523
75%,3.0,6.0,8.0,66.666667,100.0,100.0,100.0
max,152.0,434.0,586.0,100.0,100.0,100.0,100.0


In [11]:
# Summary statistics restricted to the World Bank region rows.
# Filtering on the index directly gives the same numeric summary as going
# through reset_index(), since describe() only summarises numeric columns.
is_region_row = countries_fund_list.index.isin(region_wb)
countries_fund_list.loc[is_region_row].describe()

funding_yes,no_fund,fund,total,%_no_fund,%_fund,%_total,%_diff(%_fund-%_no_fund)
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,44.555556,91.888889,136.444444,45.923134,54.076866,100.0,8.153732
std,60.291192,146.93829,202.920619,21.667549,21.667549,0.0,43.335099
min,3.0,1.0,7.0,18.253968,12.5,100.0,-75.0
25%,7.0,6.0,12.0,33.333333,44.444444,100.0,-11.111111
50%,13.0,8.0,19.0,42.857143,57.142857,100.0,14.285714
75%,45.0,103.0,126.0,55.555556,66.666667,100.0,33.333333
max,159.0,452.0,611.0,87.5,81.746032,100.0,63.492063


In [12]:
# Summary statistics restricted to the World Bank income-group rows.
# Filtering on the index directly gives the same numeric summary as going
# through reset_index(), since describe() only summarises numeric columns.
is_income_row = countries_fund_list.index.isin(income_wb)
countries_fund_list.loc[is_income_row].describe()

funding_yes,no_fund,fund,total,%_no_fund,%_fund,%_total,%_diff(%_fund-%_no_fund)
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,91.75,176.75,268.5,37.475935,62.524065,100.0,25.048131
std,121.606949,247.115864,368.55167,9.553771,9.553771,0.0,19.107542
min,26.0,27.0,56.0,32.20339,48.214286,100.0,-3.571429
25%,28.25,47.25,74.0,32.425847,61.992596,100.0,23.985192
50%,33.5,67.0,99.0,32.957317,67.042683,100.0,34.085366
75%,97.0,196.5,293.5,38.007404,67.574153,100.0,35.148305
max,274.0,546.0,820.0,51.785714,67.79661,100.0,35.59322
