Using this dataset of article open-access prices paid by the WELLCOME Trust between 2012 and 2013, determine the five most common journals and the total articles for each.

In [1]:
import pandas as pd
import chardet
import scipy.stats

#Check encoding
rawdata = open('WELLCOME_APCspend2013_forThinkful.csv', "rb").read()
chardet.detect(rawdata)

#Load data
df = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='IBM857')

#Make journal names all uppercase
df['Journal title'] = df['Journal title'].str.upper()

#Print the five most common journals and the total articles for each
top5 = df['Journal title'].value_counts().nlargest(5)
print(top5)


PLOS ONE                           190
JOURNAL OF BIOLOGICAL CHEMISTRY     53
NEUROIMAGE                          29
PLOS GENETICS                       24
PLOS PATHOGENS                      24
Name: Journal title, dtype: int64


Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [3]:
#Rename the cost column's title
df.rename(columns={'COST (ú) charged to Wellcome (inc VAT when charged)':'cost'}, inplace=True)

#Remove special character in the cost column
df['cost'] = df['cost'].str.replace('(ú)', '')
df['cost'] = df['cost'].str.replace('$', '')

#Select data from the five most common journals
df2 = df.loc[df['Journal title'].isin(list(top5.index))]

#Change cost column type from object to float
df2['cost']=df2['cost'].apply(pd.to_numeric)

#Remove unreasonably high values, aka winsorizing.
df2['cost'] = scipy.stats.mstats.winsorize(df2['cost'], limits = 0.05)

#Get the mean, median, and standard deviation of the open-access cost per article for each journal
mean = df2.groupby(['Journal title'])['cost'].mean()
print('Mean: \n{} \n'.format(mean))

median = df2.groupby(['Journal title'])['cost'].median()
print('Median: \n{} \n'.format(median))

mode = df2.groupby(['Journal title'])['cost'].apply(lambda x: x.mode())
print('Mode: \n{}'.format(mode))

Mean: 
Journal title
JOURNAL OF BIOLOGICAL CHEMISTRY    1453.849057
NEUROIMAGE                         2213.850345
PLOS GENETICS                      1713.204167
PLOS ONE                           1018.791947
PLOS PATHOGENS                     1648.813750
Name: cost, dtype: float64 

Median: 
Journal title
JOURNAL OF BIOLOGICAL CHEMISTRY    1314.530
NEUROIMAGE                         2326.430
PLOS GENETICS                      1718.390
PLOS ONE                            898.475
PLOS PATHOGENS                     1600.520
Name: cost, dtype: float64 

Mode: 
Journal title                     
JOURNAL OF BIOLOGICAL CHEMISTRY  0    1276.08
                                 1    1556.61
                                 2    2484.23
NEUROIMAGE                       0    2484.23
PLOS GENETICS                    0    1456.19
                                 1    1807.60
                                 2    2484.23
PLOS ONE                         0     825.68
PLOS PATHOGENS                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
