In [588]:
import pandas as pd
import numpy as np
import re


In [589]:
df=pd.read_csv('WELLCOME_Thinkful.csv')

In [590]:
df.head() #Let's take a quick peek

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


To complete this challenge, determine the five most common journals and the total articles for each. Next, calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [591]:
#First let's change some column names to make it more readable. 
df.rename(index=str, columns={"Journal title": "Journal", "Article title":"Article", 
"COST (£) charged to Wellcome (inc VAT when charged)":"Cost", "PMID/PMCID": "PMID"}, inplace=True)

In [592]:
df.head()

Unnamed: 0,PMID,Publisher,Journal,Article,Cost
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88


In [595]:
#fixing Cost Column
df.Cost.head()

0       £0.00
1    £2381.04
2     £642.56
3     £669.64
4     £685.88
Name: Cost, dtype: object

In [598]:
df= df.apply(lambda x: x.str.replace('$',''))
df= df.apply(lambda x: x.str.replace('£',''))
#removed $ and £ sign.

In [617]:
df['Cost'] = df['Cost'].apply(lambda x: float(x)) #just to be sure everything is float

In [618]:
df.Cost.head()

0       0.00
1    2381.04
2     642.56
3     669.64
4     685.88
Name: Cost, dtype: float64

In [601]:
df.Cost.max() #this is obviously wrong

999999.0

In [602]:
df.loc[(df.Cost==999999.0)].shape[0] 

47

In [603]:
df['Cost'] = df['Cost'].replace(999999.00, 0) #debatable whether to replace with 0, or NAN or just drop. 
#Too many to drop

In [604]:
df.Cost.max() #that's better

201024.0

In [605]:
#moving on to Journal column
df.Journal.unique()

array(['Psychological Medicine', 'Biomacromolecules', 'J Med Chem',
       'J Org Chem', 'Journal of Medicinal Chemistry',
       'Journal of Proteome Research', 'Mol Pharm',
       'ACS Chemical Biology',
       'Journal of Chemical Information and Modeling', 'Biochemistry',
       'Gastroenterology', 'Journal of Biological Chemistry',
       'Journal of Immunology', 'ACS Chemical Neuroscience', 'ACS NANO',
       'American Chemical Society', 'Analytical Chemistry',
       'Bioconjugate Chemistry', 'Journal of Medicinal Chemistry ',
       'Journal of the American Chemical Society', 'ACS Nano', 'CHEST',
       'Journal of Neurophysiology', 'Journal of Physiology',
       'The Journal of Neurophysiology', 'American Journal of Psychiatry',
       'Americal Journal of Psychiatry', 'Behavioral Neuroscience',
       'Emotion', 'Health Psychology', 'Journal of Abnormal Psychology',
       'Journal of Consulting and Clinical Psychology',
       'Journal of Experimental Psychology:  Animal Be

In [623]:
# since it wants to us to find the most common Journal, we first focus on the journal column. 
#After changing the title, I've decided to make it all uppercase and strip whitespace
df['Journal'] = df['Journal'].str.upper()
df1=df.Journal.str.replace("J ", "JOURNAL")
df1=df1.str.replace(",", "")
df1=df1.str.replace(":", "")
df1=df1.str.replace("THE", "")
df1=df1.str.replace("JNL BIOLOGICAL CHEMISTRY", "JOURNAL BIOLOGICAL CHEMISTRY")
df1=df1.str.replace("JOURNAL OF BIOL CHEM", "JOURNAL OF BIOLOGICAL CHEMISTRY")
#for some reason, df1=df.str.strip() gave me a different value, so I switched to lambda
df1= df1.apply(lambda x: str(x).strip())

In [625]:
df1.value_counts().head(5)

PLOS ONE                           190
JOURNAL OF BIOLOGICAL CHEMISTRY     61
NEUROIMAGE                          29
NUCLEIC ACIDS RESEARCH              26
PLOS PATHOGENS                      24
Name: Journal, dtype: int64

Now we see that the top Journals with their article counts.
Now do some calculations.

In [629]:
#create a new df to store wanted variables
df2 = pd.DataFrame(dtype=float)
df2['Journal']=df1
df2['Cost']=df.Cost

In [628]:
df2.head()

Unnamed: 0,Journal,Cost
0,PSYCHOLOGICAL MEDICINE,0.0
1,BIOMACROMOLECULES,2381.04
2,JOURNALMED CHEM,642.56
3,JOURNALMED CHEM,669.64
4,JOURNALORG CHEM,685.88


In [616]:
#df_count = df2.groupby('Journal', as_index = False).count()
#df_count
#was thinking of making another data frame with count, mean, median and std but it did not work. 

In [630]:
df_desc = df2.groupby('Journal')['Cost'].describe() 

In [631]:
print(df_desc.loc['PLOS ONE'])

print(df_desc.loc['JOURNAL OF BIOLOGICAL CHEMISTRY'])

print(df_desc.loc['NEUROIMAGE'])

print(df_desc.loc['NUCLEIC ACIDS RESEARCH'])

print(df_desc.loc['PLOS PATHOGENS'])

count       190.000000
mean       1909.222947
std       13913.163633
min           0.000000
25%         843.735000
50%         894.070000
75%        1040.577500
max      192645.000000
Name: PLOS ONE, dtype: float64
count      61.000000
mean     1386.751148
std       425.826779
min         0.000000
25%      1152.720000
50%      1311.730000
75%      1586.360000
max      2501.070000
Name: JOURNAL OF BIOLOGICAL CHEMISTRY, dtype: float64
count      29.000000
mean     2215.168276
std       266.653947
min      1747.160000
25%      2100.540000
50%      2326.430000
75%      2396.570000
max      2503.340000
Name: NEUROIMAGE, dtype: float64
count      26.000000
mean     1149.000000
std       442.940447
min       710.000000
25%       852.000000
50%       852.000000
75%      1704.000000
max      2184.000000
Name: NUCLEIC ACIDS RESEARCH, dtype: float64
count      24.000000
mean     1441.794583
std       470.205131
min         0.000000
25%      1429.837500
50%      1585.560000
75%      1726.170000
ma