# Challenge - Data cleaning and validation

In [246]:
import re

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [247]:
# Build DataFrame from csv file.
# The file is decoded as ISO-8859-1 (Latin-1); the cost column's header and
# values contain the '£' character.
file = 'WELLCOME_APCspend2013_forThinkful.csv'
wellcome = pd.read_csv(file, encoding='ISO-8859-1')

In [248]:
# Preview the first ten rows instead of rendering the entire frame
wellcome.head(10)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56
3,23438330 PMC3646402,ACS,J Med Chem,Orvinols with mixed kappa/mu opioid receptor a...,£669.64
4,23438216 PMC3601604,ACS,J Org Chem,Regioselective opening of myo-inositol orthoes...,£685.88
5,PMC3579457,ACS,Journal of Medicinal Chemistry,Comparative Structural and Functional Studies ...,£2392.20
6,PMC3709265,ACS,Journal of Proteome Research,Mapping Proteolytic Processing in the Secretom...,£2367.95
7,23057412 PMC3495574,ACS,Mol Pharm,Quantitative silencing of EGFP reporter gene b...,£649.33
8,PMCID: PMC3780468,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,A Novel Allosteric Inhibitor of the Uridine Di...,£1294.59
9,PMCID: PMC3621575,ACS (Amercian Chemical Society) Publications,ACS Chemical Biology,Chemical proteomic analysis reveals the drugab...,£1294.78


In [249]:
# Determine DataFrame shape as (rows, columns)
wellcome.shape

(2127, 5)

In [250]:
# Describe the DataFrame.
# The summary below reports count / unique / top / freq for every column,
# i.e. all columns (including the cost column) are still held as text.
wellcome.describe()

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
count,1928,2127,2126,2127,2127
unique,1880,299,984,2126,1402
top,-,Elsevier,PLoS One,"Exclusive breastfeeding, diarrhoel morbidity a...",£2040.00
freq,7,387,92,2,94


### 1. Determine the five most common journals and the total articles for each.

In [251]:
# After some quick cleaning, what does a Top 20 list for journals look like?
# Normalise each title: upper-case it and collapse every run of whitespace
# (leading, trailing and internal) to a single space. A single
# .replace('  ', ' ') pass leaves runs of three or more spaces only partly
# collapsed, so such titles would survive as separate spellings.
# NOTE: str(x) turns a missing title into the literal text 'NAN'; this is kept
# on purpose because later cells sort the unique titles and need plain strings.
wellcome['Journal title'] = wellcome['Journal title'].apply(lambda x: ' '.join(str(x).upper().split()))
top_20 = wellcome['Journal title'].value_counts().head(20)
top_20

PLOS ONE                                           191
JOURNAL OF BIOLOGICAL CHEMISTRY                     53
NEUROIMAGE                                          29
NUCLEIC ACIDS RESEARCH                              26
PLOS PATHOGENS                                      24
PLOS GENETICS                                       24
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     22
PLOS NEGLECTED TROPICAL DISEASES                    20
NATURE COMMUNICATIONS                               19
HUMAN MOLECULAR GENETICS                            19
BMC PUBLIC HEALTH                                   15
MOVEMENT DISORDERS                                  15
BRAIN                                               14
JOURNAL OF NEUROSCIENCE                             13
BIOCHEMICAL JOURNAL                                 12
DEVELOPMENTAL CELL                                  12
JOURNAL OF GENERAL VIROLOGY                         11
CURRENT BIOLOGY                                     11
MALARIA JO

In [258]:
# Collect the distinct journal names as they stand before any further cleaning
def _tidy_title(raw_title):
    """Upper-case a title, trim both ends and replace double spaces once."""
    return str(raw_title).upper().strip().replace('  ', ' ')

current = wellcome['Journal title'].map(_tidy_title).unique()
choices = sorted(current)
choices

['ACADEMY OF NUTRITION AND DIETETICS',
 'ACS CHEMICAL BIOLOGY',
 'ACS CHEMICAL NEUROSCIENCE',
 'ACS NANO',
 'ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL CRYSTALLOGRAPHY',
 'ACTA CRYSTALLOGRAPHICA SECTION D: BIOLOGICAL CRYSTALLOGRAPHY',
 'ACTA CRYSTALLOGRAPHICA SECTION F: STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS',
 'ACTA CRYSTALLOGRAPHICA, SECTION D',
 'ACTA CRYSTALLOGRAPHY D',
 'ACTA D',
 'ACTA DERMATO VENEREOLOGICA',
 'ACTA DIABETOLOGICA',
 'ACTA F',
 'ACTA NEUROPATHOL',
 'ACTA NEUROPATHOLOGICA',
 'ACTA OPTHALMOLOGICA',
 'ACTA PHYSIOL',
 'ADDICTION',
 'ADVANCES IN EXPERIMENTAL MEDICINE AND BIOLOGY',
 'AGE',
 'AGE AND AGEING',
 'AGEING & SOCIETY',
 'AGING CELL',
 'AIDS',
 'AIDS BEHAV',
 'AIDS CARE',
 'AIDS JOURNAL',
 'AIDS RESEARCH AND THERAPY',
 'AIDS UK',
 'ALCOHOL AND ALCOHOLISM',
 'ALIMENTRARY PHARMACOLOGY & THERAPEUTICS',
 'AM J BIOETH',
 'AM J TROP MED HYG',
 'AMERICAL JOURNAL OF PSYCHIATRY',
 'AMERICAN CHEMICAL SOCIETY',
 'AMERICAN ETHNOLOGIST',
 'AMERICAN JNL EPID

In [273]:
# Clean Top 20 using fuzzy matching.
# For every title currently in the Top 20, list up to 20 fuzzy matches drawn
# from `choices`, then ask interactively which listed spellings (if any)
# should be merged into one preferred name.
for journal in list(top_20.index):
    print("Name: ", journal + '\n')
    fuzzy_matching = [x[0] for x in process.extract(journal, choices, limit=20)]
    list_fuzzy_matching = list(enumerate(fuzzy_matching))
    for item in list_fuzzy_matching:
        print(item)
    # Menu numbers the user is allowed to type (computed once per journal)
    valid_indices = set(range(len(fuzzy_matching)))

    answer = None
    while answer not in ['y', 'N']:
        answer = input("Does the name in question match any of the items above? (Enter 'y' or 'N'): ")
    index_best_name = None
    indices_other_names = []
    # Compare by value: `is` tests object identity, which only happens to work
    # for interned strings and triggers a SyntaxWarning on Python 3.8+.
    if answer == 'y':
        while index_best_name not in valid_indices:
            try:
                index_best_name = int(input("Which number corresponds to the best name? (Enter a number): "))
            except ValueError:
                # Non-numeric entry: ask again instead of aborting the whole loop
                index_best_name = None
        while indices_other_names == []:
            entered = input("Enter number(s) separated by commas: ")
            try:
                indices_other_names = [int(n) for n in entered.split(',')]
            except ValueError:
                # Empty or non-numeric item: ask again
                indices_other_names = []
                continue
            # Reject the whole entry if any number is not on the menu
            if not valid_indices.issuperset(indices_other_names):
                indices_other_names = []
        key = list_fuzzy_matching[index_best_name][1]
        vals = [x[1] for x in list_fuzzy_matching if x[0] in indices_other_names]
        print(key, vals)
        print('\n')
        # Rewrite every selected variant to the preferred name
        wellcome['Journal title'] = wellcome['Journal title'].apply(lambda x, k=key, v=vals: k if x in v else x)

Name:  PLOS ONE

(0, 'PLOS ONE')
(1, 'PLOS')
(2, 'PLOS BIOLOGY')
(3, 'PLOS COMPUTATIONAL BIOLOGY')
(4, 'PLOS GENETICS')
(5, 'PLOS MEDICINE')
(6, 'PLOS MEDICINE JOURNAL')
(7, 'PLOS NEGECTED TROPICAL DISEASES')
(8, 'PLOS NEGLECTED TROPICAL DISEASES')
(9, 'PLOS PATHOGENS')
(10, 'PLOS NTD')
(11, 'ANNALS OF NEUROLOGY')
(12, 'BONE')
(13, 'PHILOS TRANS R SOC LONDON B BIOL SCI')
(14, 'BIOINFORMATICS ONLINE')
(15, 'BIOLOGY OPEN')
(16, 'CHILDS NERV SYST')
(17, 'CLINICAL INFECTIOUS DISEASES ONLINE')
(18, 'EUROPEAN HEART JOURNAL ONLINE')
(19, 'EUROPEAN JOURNAL OF NEUROSCIENCE')
Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  JOURNAL OF BIOLOGICAL CHEMISTRY

(0, 'JOURNAL OF BIOLOGICAL CHEMISTRY')
(1, 'ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL CRYSTALLOGRAPHY')
(2, 'ACTA CRYSTALLOGRAPHICA SECTION D: BIOLOGICAL CRYSTALLOGRAPHY')
(3, 'AIDS JOURNAL')
(4, 'AMERICAN JOURNAL OF TROPICAL MEDICINE & HYGIENE')
(5, 'ANALYTICAL CHEMISTRY')
(6, 'ANNALS OF NEUROLOGY')
(7, 

Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  BRAIN

(0, 'BRAIN')
(1, 'BEHAVIOURAL BRAIN RESEARCH')
(2, 'BRAIN AND COGNITION')
(3, 'BRAIN AND LANGUAGE')
(4, 'BRAIN ONLINE')
(5, 'BRAIN RESEARCH')
(6, 'BRAIN STRUCTURE AND FUNCTION')
(7, 'BRAIN TOPOGRAPHY')
(8, 'EXPERIMENTAL BRAIN RESEARCH')
(9, 'FRONTIERS IN BRAIN IMAGING METHODS')
(10, 'GENES BRAIN BEHAVIOUR')
(11, 'HBM JNL HUMAN BRAIN MAPPING')
(12, 'HUMAN BRAIN MAPPING')
(13, 'MOLECULAR BRAIN')
(14, 'MOLECULAR MEMBRANE BIOLOGY')
(15, 'PAIN')
(16, 'NAN')
(17, 'RNA')
(18, 'AGE AND AGEING')
(19, 'AGEING & SOCIETY')
Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  BIOCHEMICAL JOURNAL

(0, 'BIOCHEMICAL JOURNAL')
(1, 'BIOCHEMICAL JOURNALS')
(2, 'THE BIOCHEMICAL JOURNAL')
(3, 'RNA')
(4, 'BIOCHEM JOURNAL')
(5, 'AIDS JOURNAL')
(6, 'AMERICAL JOURNAL OF PSYCHIATRY')
(7, 'AMERICAN JOURNAL FOR CLINICAL NUTRITION')
(8, 'AMERICAN JOURNAL OF BIOETHICS--NEUROSCIENCE')
(9, 'AMER

In [262]:
# Updated Top 20 list prior to more cleaning.
# Recount the titles so the ranking reflects the current state of the column.
top_20 = wellcome['Journal title'].value_counts().head(20)
top_20

PLOS ONE                                           208
JOURNAL OF BIOLOGICAL CHEMISTRY                     65
NEUROIMAGE                                          31
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     29
NUCLEIC ACIDS RESEARCH                              29
PLOS NEGLECTED TROPICAL DISEASES                    24
PLOS PATHOGENS                                      24
PLOS GENETICS                                       24
HUMAN MOLECULAR GENETICS                            20
NATURE COMMUNICATIONS                               19
BRAIN                                               18
BMC PUBLIC HEALTH                                   15
MOVEMENT DISORDERS                                  15
JOURNAL OF NEUROSCIENCE                             15
BIOCHEMICAL JOURNAL                                 15
DEVELOPMENTAL CELL                                  13
JOURNAL OF GENERAL VIROLOGY                         11
CURRENT BIOLOGY                                     11
MALARIA JO

In [265]:
# Snapshot the distinct journal names currently in the column, alphabetised,
# ahead of the next cleaning pass
updated = wellcome['Journal title'].unique()
updated_choices = sorted(updated)
updated_choices

['ACADEMY OF NUTRITION AND DIETETICS',
 'ACS CHEMICAL BIOLOGY',
 'ACS CHEMICAL NEUROSCIENCE',
 'ACS NANO',
 'ACTA CRYSTALLOGRAPHICA SECTION D, BIOLOGICAL CRYSTALLOGRAPHY',
 'ACTA CRYSTALLOGRAPHICA SECTION D: BIOLOGICAL CRYSTALLOGRAPHY',
 'ACTA CRYSTALLOGRAPHICA SECTION F: STRUCTURAL BIOLOGY AND CRYSTALLIZATION COMMUNICATIONS',
 'ACTA CRYSTALLOGRAPHICA, SECTION D',
 'ACTA CRYSTALLOGRAPHY D',
 'ACTA D',
 'ACTA DERMATO VENEREOLOGICA',
 'ACTA DIABETOLOGICA',
 'ACTA F',
 'ACTA NEUROPATHOL',
 'ACTA NEUROPATHOLOGICA',
 'ACTA OPTHALMOLOGICA',
 'ACTA PHYSIOL',
 'ADDICTION',
 'ADVANCES IN EXPERIMENTAL MEDICINE AND BIOLOGY',
 'AGE',
 'AGE AND AGEING',
 'AGEING & SOCIETY',
 'AGING CELL',
 'AIDS',
 'AIDS BEHAV',
 'AIDS CARE',
 'AIDS JOURNAL',
 'AIDS RESEARCH AND THERAPY',
 'AIDS UK',
 'ALCOHOL AND ALCOHOLISM',
 'ALIMENTRARY PHARMACOLOGY & THERAPEUTICS',
 'AM J BIOETH',
 'AM J TROP MED HYG',
 'AMERICAL JOURNAL OF PSYCHIATRY',
 'AMERICAN CHEMICAL SOCIETY',
 'AMERICAN ETHNOLOGIST',
 'AMERICAN JNL EPID

In [267]:
# More cleaning by starting letter.
# For every title currently in the Top 20, list all names in `updated_choices`
# that begin with the same first character, then ask interactively which
# listed spellings (if any) should be merged into one preferred name.
for journal in list(top_20.index):
    print("Name: ", journal + '\n')

    by_letter = [x for x in updated_choices if x.startswith(journal[0])]
    list_by_letter = list(enumerate(by_letter))
    for item in list_by_letter:
        print(item)
    # Menu numbers the user is allowed to type (computed once per journal)
    valid_indices = set(range(len(by_letter)))

    answer = None
    while answer not in ['y', 'N']:
        answer = input("Does the name in question match any of the items above? (Enter 'y' or 'N'): ")
    index_best_name = None
    indices_other_names = []
    # Compare by value: `is` tests object identity, which only happens to work
    # for interned strings and triggers a SyntaxWarning on Python 3.8+.
    if answer == 'y':
        while index_best_name not in valid_indices:
            try:
                index_best_name = int(input("Which number corresponds to the best name? (Enter a number): "))
            except ValueError:
                # Non-numeric entry: ask again instead of aborting the whole loop
                index_best_name = None
        while indices_other_names == []:
            entered = input("Enter number(s) separated by commas: ")
            try:
                indices_other_names = [int(n) for n in entered.split(',')]
            except ValueError:
                # Empty or non-numeric item: ask again
                indices_other_names = []
                continue
            # Reject the whole entry if any number is not on the menu
            if not valid_indices.issuperset(indices_other_names):
                indices_other_names = []
        key = list_by_letter[index_best_name][1]
        vals = [x[1] for x in list_by_letter if x[0] in indices_other_names]
        print(key, vals)
        print('\n')
        # Rewrite every selected variant to the preferred name
        wellcome['Journal title'] = wellcome['Journal title'].apply(lambda x, k=key, v=vals: k if x in v else x)

Name:  PLOS ONE

(0, 'PAIN')
(1, 'PARASIT VECTORS.')
(2, 'PARASITE IMMUNOLOGY')
(3, 'PARASITES AND VECTORS')
(4, 'PARASITOLOGY')
(5, 'PARKINSONISM AND RELATED DISORDERS')
(6, 'PARSITOLOGY')
(7, 'PEDIATR INFECT DIS J')
(8, 'PEDIATRICS')
(9, 'PERSONALITY AND INDIVIDUAL DIFFERENCES')
(10, 'PFLUGERS ARCHIV')
(11, 'PFLUGERS ARCHIVE')
(12, 'PHILOS TRANS R SOC LONDON B BIOL SCI')
(13, 'PHILOSOPHICAL TRANSACTIONS B')
(14, 'PHILOSOPHICAL TRANSACTIONS OF THE ROYAL SOCIETY OF LONDON. SERIES B, BIOLOGICAL SCIENCES')
(15, 'PHOTOCHEMICAL & PHOTOBIOLOGICAL SCIENCES')
(16, 'PHYSICS IN BIOLOGY AND MEDICINE')
(17, 'PLANT BIOTECHNOLOGY JOURNAL')
(18, 'PLOS')
(19, 'PLOS BIOLOGY')
(20, 'PLOS COMPUTATIONAL BIOLOGY')
(21, 'PLOS GENETICS')
(22, 'PLOS MEDICINE')
(23, 'PLOS MEDICINE JOURNAL')
(24, 'PLOS NEGLECTED TROPICAL DISEASES')
(25, 'PLOS ONE')
(26, 'PLOS PATHOGENS')
(27, 'PMEDICINE-D-12-03130')
(28, 'PNAS')
(29, 'PNAS USA')
(30, 'PNTD')
(31, 'PONE-D12-17947')
(32, 'POPULATION, SPACE AND PLACE')
(33, 'POST

Which number corresponds to the best name? (Enter a number): 41
Enter number(s) separated by commas: 0,1
JOURNAL OF BIOLOGICAL CHEMISTRY ['J BIOL CHEM.', 'J BIOL CHEMISTRY']


Name:  NEUROIMAGE

(0, 'N BIOTECHNOL.')
(1, 'NAN')
(2, 'NANOTECHNOLOGY')
(3, 'NATIONAL ACADEMY OF SCIENCES')
(4, 'NATURE COMMUNICATIONS')
(5, 'NATURE NEUROSCIENCE')
(6, 'NATURE SCIENTIFIC REPORTS')
(7, 'NEPHROLOGY DIALYSIS AND TRANSPLANTATION')
(8, 'NEPHROLOGY DIALYSIS TRANSPLANTATION')
(9, 'NEPHRON PHYSIOLOGY')
(10, 'NEUROBIOLOGY OF AGING')
(11, 'NEUROBIOLOGY OF DISEASE')
(12, 'NEUROBIOLOGY OF LEARNING AND MEMORY')
(13, 'NEUROCHEMISTRY INTERNATIONAL')
(14, 'NEURODEGENERATIVE DISEASES')
(15, 'NEUROEPIDEMIOLOGY')
(16, 'NEUROGENETICS')
(17, 'NEUROIMAGE')
(18, 'NEUROIMAGE: CLINICAL')
(19, 'NEUROINFORMATICS')
(20, 'NEUROLOGY')
(21, 'NEUROMOLECULAR MEDICINE')
(22, 'NEUROMUSCULAR DISORDERS')
(23, 'NEURON')
(24, 'NEUROPATHOLOGY AND APPLIED NEUROBIOLOGY')
(25, 'NEUROPHARMACOLOGY')
(26, 'NEUROPHYSIOLOGIA')
(27, 'NEUROPSYC

Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  PLOS GENETICS

(0, 'PAIN')
(1, 'PARASIT VECTORS.')
(2, 'PARASITE IMMUNOLOGY')
(3, 'PARASITES AND VECTORS')
(4, 'PARASITOLOGY')
(5, 'PARKINSONISM AND RELATED DISORDERS')
(6, 'PARSITOLOGY')
(7, 'PEDIATR INFECT DIS J')
(8, 'PEDIATRICS')
(9, 'PERSONALITY AND INDIVIDUAL DIFFERENCES')
(10, 'PFLUGERS ARCHIV')
(11, 'PFLUGERS ARCHIVE')
(12, 'PHILOS TRANS R SOC LONDON B BIOL SCI')
(13, 'PHILOSOPHICAL TRANSACTIONS B')
(14, 'PHILOSOPHICAL TRANSACTIONS OF THE ROYAL SOCIETY OF LONDON. SERIES B, BIOLOGICAL SCIENCES')
(15, 'PHOTOCHEMICAL & PHOTOBIOLOGICAL SCIENCES')
(16, 'PHYSICS IN BIOLOGY AND MEDICINE')
(17, 'PLANT BIOTECHNOLOGY JOURNAL')
(18, 'PLOS')
(19, 'PLOS BIOLOGY')
(20, 'PLOS COMPUTATIONAL BIOLOGY')
(21, 'PLOS GENETICS')
(22, 'PLOS MEDICINE')
(23, 'PLOS MEDICINE JOURNAL')
(24, 'PLOS NEGLECTED TROPICAL DISEASES')
(25, 'PLOS ONE')
(26, 'PLOS PATHOGENS')
(27, 'PMEDICINE-D-12-03130')
(28, 'PNAS')
(29, 'PNAS USA')

Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  MOVEMENT DISORDERS

(0, 'MAGNETIC RESONANCE IN MEDICINE')
(1, 'MALARIA JOURNAL')
(2, 'MAMMALIAN GENOME')
(3, 'MARTEN CHILD NUTR')
(4, 'MATERNAL AND CHILD NUTRITION')
(5, 'MATH BIOSCI')
(6, 'MATRIX BIOLOGY')
(7, 'MATURITAS')
(8, 'MBIO')
(9, 'MCP (MOLECULAR & CELLULAR PROTEOMICS)')
(10, 'MECHANISMS OF AGEING AND DEVELOPMENT')
(11, 'MEDICAL ANTHROPOLOGY')
(12, 'MEDICAL HISTORY')
(13, 'MEDICAL HUMANITIES')
(14, 'MEDICAL LAW REVIEW')
(15, 'METABOLOMICS')
(16, 'METHODS IN MOLECULAR BIOLOGY')
(17, 'MICROBES AND INFECTION')
(18, 'MICROBES INFECT')
(19, 'MICROBIOLOGY')
(20, 'MOL BIO')
(21, 'MOL BIOL AND EVOLUTION')
(22, 'MOL PHARM')
(23, 'MOLECLUAR & CELLULAR ENDOCRINOLOGY')
(24, 'MOLECULAR & BIOCHEMICAL PARASITOLOGY')
(25, 'MOLECULAR & CELLULAR PROTEOMICS')
(26, 'MOLECULAR AND CELLULAR BIOCHEMISTRY')
(27, 'MOLECULAR AND CELLULAR BIOLOGY')
(28, 'MOLECULAR AND CELLULAR ENDOCRINOLOGY')
(29, 'MOLECULAR AND CELLULA

Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  DEVELOPMENTAL CELL

(0, 'DALTON TRANSACTIONS')
(1, 'DATABASE')
(2, 'DEPRESSION AND ANXIETY')
(3, 'DERMATOLOGIC SURGERY')
(4, 'DEV WORLD BIOETH.')
(5, 'DEV. WORLD BIOETH')
(6, 'DEVELOPING WORLD BIOETHICS')
(7, 'DEVELOPMENT')
(8, 'DEVELOPMENT DYNAMICS')
(9, 'DEVELOPMENT GENES AND EVOLUTION')
(10, 'DEVELOPMENT SCIENCE')
(11, 'DEVELOPMENTAL BIOLOGY')
(12, 'DEVELOPMENTAL CELL')
(13, 'DEVELOPMENTAL COGNITIVE NEUROSCIENCE')
(14, 'DEVELOPMENTAL SCIENCE')
(15, 'DIABETIC MEDICINE')
(16, 'DIABETOLOGIA')
(17, 'DIAGNOSTIC MICROBIOLOGY AND INFECTIOUS DISEASE')
(18, 'DISEASE MODELS AND MECHANISMS')
(19, 'DRUG AND ALCOHOL REVIEW')
Does the name in question match any of the items above? (Enter 'y' or 'N'): N
Name:  JOURNAL OF GENERAL VIROLOGY

(0, 'J BIOL CHEM.')
(1, 'J BIOL CHEMISTRY')
(2, 'J CARDIOVASC MAGN RESON')
(3, 'J CELL SCI.')
(4, 'J CLIN MICROBIOL')
(5, 'J IMMUNOL')
(6, 'J INFECT DIS')
(7, 'J MED CHEM')
(8, '

Does the name in question match any of the items above? (Enter 'y' or 'N'): y
Which number corresponds to the best name? (Enter a number): 68
Enter number(s) separated by commas: 65
CURRENT BIOLOGY ['CURR BIOL.']


Name:  MALARIA JOURNAL

(0, 'MAGNETIC RESONANCE IN MEDICINE')
(1, 'MALARIA JOURNAL')
(2, 'MAMMALIAN GENOME')
(3, 'MARTEN CHILD NUTR')
(4, 'MATERNAL AND CHILD NUTRITION')
(5, 'MATH BIOSCI')
(6, 'MATRIX BIOLOGY')
(7, 'MATURITAS')
(8, 'MBIO')
(9, 'MCP (MOLECULAR & CELLULAR PROTEOMICS)')
(10, 'MECHANISMS OF AGEING AND DEVELOPMENT')
(11, 'MEDICAL ANTHROPOLOGY')
(12, 'MEDICAL HISTORY')
(13, 'MEDICAL HUMANITIES')
(14, 'MEDICAL LAW REVIEW')
(15, 'METABOLOMICS')
(16, 'METHODS IN MOLECULAR BIOLOGY')
(17, 'MICROBES AND INFECTION')
(18, 'MICROBES INFECT')
(19, 'MICROBIOLOGY')
(20, 'MOL BIO')
(21, 'MOL BIOL AND EVOLUTION')
(22, 'MOL PHARM')
(23, 'MOLECLUAR & CELLULAR ENDOCRINOLOGY')
(24, 'MOLECULAR & BIOCHEMICAL PARASITOLOGY')
(25, 'MOLECULAR & CELLULAR PROTEOMICS')
(26, 'MOLECULAR AND C

### 1. Determine the five most common journals and the total articles for each.
### _MY RESULTS AFTER CLEANING_

In [326]:
# Top 5 Journals and the total articles for each.
# top_20 is recomputed from the cleaned column first; value_counts() orders by
# descending count, so the first five rows are the five most common titles.
top_20 = wellcome['Journal title'].value_counts().head(20)
top_20.head(5)

PLOS ONE                                           208
JOURNAL OF BIOLOGICAL CHEMISTRY                     71
PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES     37
NEUROIMAGE                                          31
NUCLEIC ACIDS RESEARCH                              29
Name: Journal title, dtype: int64

### 2. Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [327]:
# Summary statistics (mean, median, standard deviation) of the article cost
# for each of the five most common journals
costs = []
for name in top_20.index[:5]:
    # Rows belonging to this journal, then only their cost column
    df_journal = wellcome[wellcome['Journal title'] == name]
    series_journal_cost = df_journal['COST (£) charged to Wellcome (inc VAT when charged)']

    # Drop a leading '£' (when present) and convert each record to a float
    sjc_floats = series_journal_cost.apply(lambda x: float(x[1:] if x.startswith('£') else x))
    # Keep only costs below £150,000; anything at or above that is an outlier
    sjc_no_outliers = sjc_floats[sjc_floats < 150000.00]
    costs.append(sjc_no_outliers)

    print('Journal Name: ', name)
    print('Mean:   ', np.mean(sjc_no_outliers))
    print('Median: ', np.median(sjc_no_outliers))
    # np.std defaults to ddof=0, i.e. the population standard deviation
    print('Std Dev:', np.std(sjc_no_outliers))
    print()

Journal Name:  PLOS ONE
Mean:    935.5772361809031
Median:  896.99
Std Dev: 194.16468771037597

Journal Name:  JOURNAL OF BIOLOGICAL CHEMISTRY
Mean:    1384.2218840579712
Median:  1314.53
Std Dev: 389.20099406785545

Journal Name:  PROCEEDINGS OF THE NATIONAL ACADEMY OF SCIENCES
Mean:    772.9191666666668
Median:  732.01
Std Dev: 365.0141675208953

Journal Name:  NEUROIMAGE
Mean:    2212.1812903225805
Median:  2326.43
Std Dev: 268.7507817091939

Journal Name:  NUCLEIC ACIDS RESEARCH
Mean:    1162.344827586207
Median:  852.0
Std Dev: 434.4607657269006

