In [1]:
import re
import pandas as pd
import math
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt

def is_a_string(x):
        return str(x).isdigit()

file_location = "WELLCOME_APCspend2013_forThinkful.csv"

#added encoding='latin1' to prevent utf-8 error 
df = pd.read_csv(file_location, header=0, quotechar='"', encoding='latin1')
df.dropna(inplace=True)
df.head(10)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,Journal of Medicinal Chemistry,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,Journal of Proteome Research,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,Mol Pharm,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,Chemical proteomic analysis reveals the drugab...,£1294.78
10,PMCID: PMC3739413,ACS (Amercian Chemical Society) Publications,Journal of Chemical Information and Modeling,Locating Sweet Spots for Screening Hits and Ev...,£1329.69


In [2]:
df['PMID/PMCID'].value_counts()

In Process                                          7
Not yet available                                   7
-                                                   7
pub Aug 2013                                        3
Epub ahead of print pub Jan 2013, print in press    3
print in press                                      3
Pub July 2013                                       3
Epub ahead of print pub Feb 2013, print in press    3
Epub ahead of print - june 2013                     3
PMC 3413714                                         2
22735079                                            2
Pending                                             2
PMC 3599138                                         2
Epub ahead of print April 2013 - print in press     2
PMC3597274                                          2
41609                                               2
not yet published                                   2
PMC3708033\n\n                                      2
PMC3405234                  

Using .value_counts we can see what some of the issues in our data are. For one, there are far too many letters that pollute the data to the point in which even similar PMC IDs are different. Thus to make it easier it'd be best if we just did away with all the letters within the PMC ID column. There are some variables that we can confirm:
1. PMC IDs only consist of integers
2. PMC IDs have the length of 7

In [3]:
#From this data we can see a variable of issues with the data. 
#Conditions for it to be a PMID/PMCID...Only numbers (no letters), 7 digits long
#------Failed Attempts
##df['PMID/PMCID'].apply(lambda x: re.sub('^[0-9]*', '', x)) #Get rid of all letters
##df['PMID/PMCID'].apply(lambda x: x.strip) #Get rid of all white spaces

df['PMID/PMCID'] = df['PMID/PMCID'].apply(lambda x: ''.join(list(filter(str.isdigit, str(x))))) #retrieve only the integers
df['PMID/PMCID'] = df['PMID/PMCID'].apply(lambda x: x if len(x) == 7 else (x[-7:] if len(x) > 7 else '')) #if the amount of digits isn't 7 then set to NaN
df['PMID/PMCID'].value_counts()

           82
3599488     2
3381227     2
3676342     2
3529206     2
2735079     2
3708033     2
3647051     2
3405234     2
3401426     2
3746156     2
3597274     2
3613719     2
3435256     2
3528370     2
3413714     2
3599138     2
3173209     2
3152927     1
2877799     1
3597032     1
3778837     1
3505371     1
3786664     1
3643589     1
3607254     1
3815011     1
3526451     1
3608034     1
3775257     1
           ..
3457933     1
3692470     1
3636053     1
3529057     1
3528594     1
3444305     1
3490334     1
3625108     1
3534503     1
3639206     1
3274377     1
3662891     1
3201205     1
3675710     1
3492727     1
3597375     1
3544544     1
3794160     1
3790930     1
3488092     1
3534410     1
3662417     1
3754066     1
3484397     1
3765962     1
3734351     1
3313792     1
3355306     1
3600839     1
3629389     1
Name: PMID/PMCID, Length: 1830, dtype: int64

In [4]:
df['Publisher'].value_counts()

Elsevier                                                                             344
Public Library of Science                                                            265
Wiley                                                                                110
Oxford University Press                                                               73
Springer                                                                              72
Wiley-Blackwell                                                                       55
OUP                                                                                   45
ASBMB                                                                                 45
Nature Publishing Group                                                               44
BioMed Central                                                                        40
BMC                                                                                   23
Nature               

For publishers, I'd like to find the first letter of each word (seperated by spaces), and return  
If the amount of words is greater then 1, then it is not a abbreviation and then can be made into an abbreviation

Retrieve only the capital letters of each word to make into an abbreviation  (gets rid of the 'of', the, and, for)

In [5]:
def abbreviate(x):
    x = x.upper() #capitalize everything
    answer = ''
    for i in x.split():
        if i == 'OF' or i == 'AND' or i == 'THE' or i == '&' or i == 'FOR':
            pass
        else:
            answer = answer + i[0]
    return answer
        
#print(abbreviate("National Academy of Sciences"))
df['Publisher'] = df['Publisher'].apply(lambda x: x if len(x.split()) == 1 else abbreviate(x))
df['Publisher'].value_counts()

Elsevier           344
PLS                265
OUP                131
Wiley              110
Springer            72
ASBMB               67
BC                  60
Wiley-Blackwell     55
NPG                 49
RS                  28
CUP                 24
BMC                 23
Nature              23
ASM                 22
CB                  22
ACS                 21
BMJ                 20
DJS                 19
Frontiers           17
ES                  17
NAS                 16
SGM                 16
WSSI                16
PLoS                15
LB                  15
OJ                  14
TF                  14
APA                 14
BG                  13
Sage                10
                  ... 
PC                   1
IIP                  1
OARL                 1
KARGER               1
SE                   1
SGHP3DB              1
Pion                 1
SGC                  1
CPS                  1
B                    1
Wiley/Blackwell      1
A/                   1
SRP        

In [6]:
#Same as Publisher
df['Journal title'] = df['Journal title'].apply(lambda x: x if len(x.split()) == 1 else abbreviate(x))
df['Journal title'].value_counts()

PO                    188
JBC                    70
JN                     30
NAR                    28
PNAS                   28
PP                     24
PG                     23
PNTD                   22
BO                     21
NC                     19
BJ                     18
HMG                    17
FJ                     16
CR                     16
CC                     16
JID                    14
NeuroImage             14
PM                     14
Neuroimage             14
BPH                    13
MD                     13
JMC                    13
JP                     13
CB                     13
BJP                    11
Brain                  11
JGV                    11
MJ                     10
AJHG                   10
AN                     10
                     ... 
BHM                     1
JRSSSC(S                1
Neurogenetics           1
DB                      1
NB                      1
AHG                     1
NJ                      1
Centaurus   

In [7]:
df['COST (£) charged to Wellcome (inc VAT when charged)'].value_counts()

£2040.00      90
£999999.00    40
£1500.00      26
£2100.00      26
£2400.00      25
£1800.00      24
£3000.00      20
£1700.00      20
£1834.77      18
£825.68       18
£852.00       15
£1680.00      15
£2034.00      12
£3120.00      11
£2010.24      11
£1620.00      10
£1920.00      10
£1536.00       8
£2340.00       8
£1704.00       8
£1260.00       8
£2520.00       6
£2184.22       6
£2377.65       6
£1068.00       6
£1044.00       6
£1836.92       6
£2625.60       6
£1939.95       6
£2145.60       5
              ..
£1371.54       1
£2388.08       1
£1125.00       1
£1966.31       1
£1283.00       1
£601.70        1
£1314.53       1
£2409.80       1
£758.53        1
£1639.06       1
£1475.40       1
£2048.93       1
£1267.76       1
£1152.50       1
£1761.28       1
£1632.89       1
£1288.00       1
£1296.00       1
£1851.15       1
£2260.80       1
£1758.89       1
£2192.89       1
£2476.42       1
£1100.00       1
£1438.45       1
£2127.68       1
£729.30        1
£1759.38      