In [1]:
import re
import pandas as pd
import math
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt

def is_a_string(x):
        return str(x).isdigit()

file_location = "WELLCOME_APCspend2013_forThinkful.csv"

#added encoding='latin1' to prevent utf-8 error 
df = pd.read_csv(file_location, header=0, quotechar='"', encoding='latin1')
df.head(10)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,Journal of Medicinal Chemistry,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,Journal of Proteome Research,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,Mol Pharm,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,Chemical proteomic analysis reveals the drugab...,£1294.78


In [2]:
df['PMID/PMCID'].value_counts()

In Process                                          7
-                                                   7
Not yet available                                   7
print in press                                      3
Epub ahead of print pub Feb 2013, print in press    3
pub Aug 2013                                        3
Epub ahead of print pub Jan 2013, print in press    3
Pub July 2013                                       3
Epub ahead of print - june 2013                     3
PMC 3413714                                         2
PMC3173209                                          2
PMC in progress                                     2
PMC 3435256                                         2
Epub ahead of print April 2013 - print in press     2
Pending                                             2
PMCID:\n    PMC3647051\n                            2
PMC3746156                                          2
PMC3597274                                          2
PMC3529206                  

Using .value_counts we can see what some of the issues in our data are. For one, there are far too many letters that pollute the data to the point in which even similar PMC IDs are different. Thus to make it easier it'd be best if we just did away with all the letters within the PMC ID column. There are some variables that we can confirm:
1. PMC IDs only consist of integers
2. PMC IDs have the length of 7

In [39]:
#From this data we can see a variable of issues with the data. 
#Conditions for it to be a PMID/PMCID...Only numbers (no letters), 7 digits long
#------Failed Attempts
##df['PMID/PMCID'].apply(lambda x: re.sub('^[0-9]*', '', x)) #Get rid of all letters
##df['PMID/PMCID'].apply(lambda x: x.strip) #Get rid of all white spaces

df['PMID/PMCID'] = df['PMID/PMCID'].apply(lambda x: ''.join(list(filter(str.isdigit, str(x))))) #retrieve only the integers
df['PMID/PMCID'] = df['PMID/PMCID'].apply(lambda x: x if len(x) == 7 else (x[-7:] if len(x) > 7 else '')) #if the amount of digits isn't 7 then set to NaN
df['PMID/PMCID'].value_counts()

           281
3676342      2
3529206      2
3405234      2
3413714      2
3599488      2
3599138      2
3647051      2
3597274      2
3708033      2
3746156      2
3613719      2
3528370      2
3173209      2
3381227      2
3435256      2
3401426      2
2735079      2
3527725      1
3628103      1
3707280      1
3330789      1
3484687      1
4125554      1
3759847      1
3535400      1
3633239      1
3661540      1
3672941      1
3682181      1
          ... 
3566929      1
3685808      1
3761323      1
3534410      1
3682146      1
3522294      1
3630861      1
3750825      1
3521128      1
3607463      1
3460225      1
3644070      1
3467080      1
3444304      1
3137731      1
3685810      1
3391134      1
3308065      1
3161179      1
3819872      1
3782536      1
3728563      1
3607215      1
3528594      1
3592866      1
3619528      1
3532612      1
3782406      1
3531601      1
0146481      1
Name: PMID/PMCID, Length: 1830, dtype: int64

In [4]:
df['Publisher'].value_counts()

Elsevier                                                      387
Public Library of Science                                     278
Wiley                                                         136
Springer                                                       81
Oxford University Press                                        77
Wiley-Blackwell                                                56
OUP                                                            56
ASBMB                                                          46
Nature Publishing Group                                        45
BioMed Central                                                 40
BMC                                                            26
Nature                                                         24
Frontiers                                                      23
BMJ                                                            23
Royal Society                                                  22
Cambridge 

For publishers, I'd like to find the first letter of each word (seperated by spaces), and return  
If the amount of words is greater then 1, then it is not a abbreviation and then can be made into an abbreviation

Retrieve only the capital letters of each word to make into an abbreviation  (gets rid of the 'of', the, and, for)

In [None]:
def abbreviate(x):
    x.split()

df['Publisher'] = df['Publisher'].apply(lambda x: x if len(x.split()) == 1 else ()

In [5]:
df['Journal title'].value_counts()

PLoS One                                                          92
PLoS ONE                                                          62
Journal of Biological Chemistry                                   48
Nucleic Acids Research                                            21
Proceedings of the National Academy of Sciences                   19
PLoS Neglected Tropical Diseases                                  18
Human Molecular Genetics                                          18
Nature Communications                                             17
Neuroimage                                                        15
PLoS Genetics                                                     15
PLoS Pathogens                                                    15
Brain                                                             14
BMC Public Health                                                 14
NeuroImage                                                        14
PLOS ONE                          

In [6]:
#Same as Publisher
