In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, integrate
%matplotlib inline

In [90]:
# Empty placeholder frame; the next cell populates `Wellcome` from the CSV file.
Wellcome = pd.DataFrame()

In [91]:
# Load the Wellcome open-access spend data straight into the frame.
# DataFrame.append was removed in pandas 2.0, and appending to an empty frame
# only ever reproduced the read_csv result, so assign it directly.
Wellcome = pd.read_csv('WellcomeData.csv')

In [92]:
# Preview the first five rows of the data as loaded from the CSV.
Wellcome.head(n=5)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [93]:
# The first thing I had to do was re-save the .csv file as UTF-8:
# the British Pound symbol (£) caused the first error message while parsing.
# To fix it, I used "Save with Encoding" in my text editor's menu and chose UTF-8.

In [94]:
# Show the original column labels before they are renamed.
Wellcome.columns

Index(['PMID/PMCID', 'Publisher', 'Journal title', 'Article title',
       'COST (£) charged to Wellcome (inc VAT when charged)'],
      dtype='object')

In [95]:
# Swap the verbose CSV headers for short snake_case names.
# The assignment is positional, so the order must match the existing columns.
Wellcome.columns = [
    'pmid',
    'publisher',
    'journal_title',
    'article_title',
    'cost',
]

In [96]:
# Keep only the digit characters of each cost value, which drops the pound symbol.
# The decimal point is dropped as well, so '£2381.04' becomes '238104' (a pence
# count) and the result is still a string.
def _digits_only(value):
    """Return the digit characters of str(value) joined into a single string."""
    return ''.join(ch for ch in str(value) if ch.isdigit())

Wellcome.cost = Wellcome.cost.apply(_digits_only)

In [97]:
# Check the first five rows after stripping non-digit characters from cost.
Wellcome.head(n=5)

Unnamed: 0,pmid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,238104
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",64256
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,66964
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,68588


In [98]:
# Convert pence to whole pounds (floor division, so the pence are discarded).
# pd.to_numeric parses the digit strings in one vectorised pass; errors='coerce'
# turns values that cannot be parsed (e.g. an empty string left behind when a cost
# contained no digits) into NaN instead of raising ValueError like int('') does.
Wellcome.cost = pd.to_numeric(Wellcome.cost, errors='coerce') // 100

In [99]:
# Check the first five rows after converting cost to whole pounds.
Wellcome.head(n=5)

Unnamed: 0,pmid,publisher,journal_title,article_title,cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,0
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,2381
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",642
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,669
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,685


In [100]:
# Summary statistics for the cost column, to spot implausible values.
Wellcome['cost'].describe()

count      2127.000000
mean      24062.186178
std      146861.479624
min           0.000000
25%        1276.000000
50%        1883.000000
75%        2321.000000
max      999999.000000
Name: cost, dtype: float64

In [101]:
# Boolean mask: True for every row whose cost is NOT 999999 (i.e. rows to keep).
# NOTE(review): 999999 is the maximum reported by describe() above and looks like
# a placeholder rather than a real charge — confirm against the data source.
FindOutlier = Wellcome.cost.ne(999999)

In [102]:
# Peek at the first five entries of the keep/drop mask.
FindOutlier.head(n=5)

0    True
1    True
2    True
3    True
4    True
Name: cost, dtype: bool

In [103]:
# Keep only the rows flagged True in FindOutlier.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in later cells operate on it directly instead of on a slice of the
# unfiltered frame (which can trigger SettingWithCopyWarning).
Wellcome = Wellcome.loc[FindOutlier, :].copy()

In [104]:
# Summary statistics after filtering out the 999999 rows.
Wellcome.describe()

Unnamed: 0,cost
count,2080.0
mean,2009.767788
std,6101.403123
min,0.0
25%,1260.0
50%,1851.5
75%,2302.0
max,201024.0


In [105]:
# Remove every literal "ACS " occurrence from the journal titles.
Wellcome['journal_title'] = Wellcome['journal_title'].str.replace('ACS ', '', regex=False)

In [106]:
# Drop "Journal of " together with a directly preceding "The " when present.
# Both must be handled in this one step: stripping "Journal of " on its own turns
# "The Journal of X" into "The X", after which a separate 'The Journal of '
# replacement can no longer match anything.
Wellcome.journal_title = Wellcome.journal_title.str.replace(r'(?:The )?Journal of ', '', regex=True)

In [107]:
# Remove every literal "The Journal of " occurrence from the journal titles.
# NOTE(review): an earlier cell already strips "Journal of ", so by the time this
# cell runs no title still contains "The Journal of " and this matches nothing.
Wellcome['journal_title'] = Wellcome['journal_title'].str.replace('The Journal of ', '', regex=False)

In [108]:
# Collapse the spelled-out journal name to its abbreviation, PNAS.
Wellcome['journal_title'] = Wellcome['journal_title'].str.replace(
    'Proceedings of the National Academy of Sciences',
    'PNAS',
    regex=False,
)

In [109]:
# Normalise the dotted abbreviation "P.N.A.S." to "PNAS".
# regex=False forces a literal match: when the pattern is interpreted as a regular
# expression (the default before pandas 2.0) every '.' matches ANY character, so the
# result would otherwise depend on the installed pandas version.
Wellcome.journal_title = Wellcome.journal_title.str.replace('P.N.A.S.', 'PNAS', regex=False)

In [110]:
# Remove the standalone abbreviation "J " (as in "J Med Chem").
# The \b word boundary restricts the match to a "J" that starts a word; a plain
# 'J ' substring match also eats the end of words, e.g. "BMJ Open" -> "BMOpen".
Wellcome.journal_title = Wellcome.journal_title.str.replace(r'\bJ ', '', regex=True)

In [112]:
# Expand the abbreviation "Med Chem" to "Medicinal Chemistry" wherever it occurs.
Wellcome['journal_title'] = Wellcome['journal_title'].str.replace(
    'Med Chem',
    'Medicinal Chemistry',
    regex=False,
)

In [113]:
# Lower-case all titles so that grouping is case-insensitive.
Wellcome['journal_title'] = Wellcome['journal_title'].str.lower()

In [114]:
# Per-journal summary statistics.
# The output still lists several spellings of the same journal as separate groups
# (e.g. the "acta crystallographica section d" variants), so the title
# normalisation above is not yet complete.
Wellcome.groupby(by='journal_title').describe()

Unnamed: 0_level_0,cost,cost,cost,cost,cost,cost,cost,cost
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
journal_title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
abnormal psychology,1.0,2534.000000,,2534.0,2534.00,2534.0,2534.00,2534.0
academy of nutrition and dietetics,1.0,2379.000000,,2379.0,2379.00,2379.0,2379.00,2379.0
acquired immune deficiency syndromes,1.0,2034.000000,,2034.0,2034.00,2034.0,2034.00,2034.0
acquired immune deficiency syndroms (jaids),1.0,1836.000000,,1836.0,1836.00,1836.0,1836.00,1836.0
"acta crystallographica section d, biological crystallography",1.0,771.000000,,771.0,771.00,771.0,771.00,771.0
acta crystallographica section d: biological crystallography,1.0,773.000000,,773.0,773.00,773.0,773.00,773.0
acta crystallographica section f: structural biology and crystallization communications,2.0,796.000000,15.556349,785.0,790.50,796.0,801.50,807.0
"acta crystallographica, section d",1.0,757.000000,,757.0,757.00,757.0,757.00,757.0
acta crystallography d,1.0,774.000000,,774.0,774.00,774.0,774.00,774.0
acta d,1.0,750.000000,,750.0,750.00,750.0,750.00,750.0
