# Challenge: Data cleaning & validation

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('WELLCOME_APCspend2013_forThinkful.csv', encoding='latin-1')
df.head(n=3)

Unnamed: 0,PMID/PMCID,Publisher,Journal title,Article title,COST (£) charged to Wellcome (inc VAT when charged)
0,,CUP,Psychological Medicine,Reduced parahippocampal cortical thickness in ...,£0.00
1,PMC3679557,ACS,Biomacromolecules,Structural characterization of a Model Gram-ne...,£2381.04
2,23043264 PMC3506128,ACS,J Med Chem,"Fumaroylamino-4,5-epoxymorphinans and related ...",£642.56


## Clean up the dataset

In [3]:
# Fill empty cells:
df = df.fillna('n_a')

# Strip whitespace from beggining and end of strings in the entire dataframe:
df = df.apply(lambda x: x.str.strip()if x.dtype == "object" else x)

# Rename the Cost column:
df.rename(columns={
    'COST (£) charged to Wellcome (inc VAT when charged)':'Cost'}, inplace=True)

# Remove the currency symbol:
df['Cost'] = df['Cost'].str.replace('£', '')

# Capitalize Journal Title Columns:
df['Journal title'] = df['Journal title'].str.title()

## Renaming five journal titles:

#### 1. Renaming with a lambda function:

In [4]:
df['Journal title'] = df['Journal title'].apply(
    lambda x: 'Trends In Neurosciences'
    if 'Trends In Neuroscience' in x else x)

df['Journal title'] = df['Journal title'].apply(
    lambda x: 'Tropical Medicine And International Health'
    if 'Trop Med Int Health' in x else x)

#### 2. Renaming with a lambda function including two separate terms:

In [5]:
# Preview a faulty journal name:
df['Journal title'].loc[df['Journal title'].str.contains(
    'Acta Crystal', case=False)].unique()

array(['Acta Crystallographica Section D: Biological Crystallography',
       'Acta Crystallographica, Section D', 'Acta Crystallography D',
       'Acta Crystallographica Section D,  Biological Crystallography',
       'Acta Crystallographica Section F: Structural Biology And Crystallization Communications'], dtype=object)

In [6]:
# Fix the journal name filtering from two separate terms:
df['Journal title'] = df['Journal title'].apply(
    lambda x: 'Acta Crystallographica Section D: Biological Crystallography'
    if 'Acta Crystallograph' and 'Section D' in x else x)

# Double-checking the result:
df['Journal title'].loc[df['Journal title'].str.contains(
    'Acta Crystal', case=False)].unique()

array(['Acta Crystallographica Section D: Biological Crystallography',
       'Acta Crystallography D',
       'Acta Crystallographica Section F: Structural Biology And Crystallization Communications'], dtype=object)

In [7]:
# Iterating to finish:
df['Journal title'] = df['Journal title'].apply(
    lambda x: 'Acta Crystallographica Section D: Biological Crystallography'
    if 'Acta Crystallography D' in x else x)

# End result:
df['Journal title'].loc[df['Journal title'].str.contains(
    'Acta Crystal', case=False)].unique()

array(['Acta Crystallographica Section D: Biological Crystallography',
       'Acta Crystallographica Section F: Structural Biology And Crystallization Communications'], dtype=object)

#### 3. Renaming with Pandas .str.contains:

In [8]:
# Previewing a faulty journal name:
df['Journal title'][df['Journal title'].str.contains('Neurophys', case=False)].unique()

array(['Journal Of Neurophysiology', 'The Journal Of Neurophysiology',
       'Neurophysiologia'], dtype=object)

In [9]:
# Replacing the faulty journal name:
df[df['Journal title'].str.contains(
    'Journal Of Neurophys', case=False)] = 'Journal Of Neurophysiology'

In [10]:
# Previewing a faulty string:
df['Journal title'][df['Journal title'].str.contains('Neurophys', case=False)].unique()

array(['Journal Of Neurophysiology', 'Neurophysiologia'], dtype=object)

## Determine the five most common journals and the total articles for each.

In [11]:
# Create a list with the names of the top five journals:
top_journals = df[['Journal title','Publisher']].groupby(
    ['Journal title']).count().sort_values('Publisher',ascending=False)
top_journals = top_journals.head(n=5)
top_journals = list(top_journals.index.values)

# Filter rows with values for the top 5 journals:
article_count = df[df['Journal title'].isin(top_journals)]

# Count values by Journal title:
article_count['Journal title'].value_counts()

Plos One                           190
Journal Of Biological Chemistry     53
Neuroimage                          29
Nucleic Acids Research              26
Plos Genetics                       24
Name: Journal title, dtype: int64

## Calculate the mean, median, and standard deviation of the open-access cost per article for each journal.

In [12]:
# Convert Cost column to integer. Convert to NAN any values that aren't numbers:
df['Cost'] = pd.to_numeric(df['Cost'], errors='coerce')

# Replace values that are above or below 4x the median and
# create a function that can be useful in future projects:
def replace_outliers(col,factor):
    try:
        median_factor = col.median() * factor
    except TypeError:
        print('Column values must be numeric')
    else:
        new_col = col.apply(
            lambda x: median_factor if (x > median_factor) or x < (median_factor*-1)
            else x)
        return new_col

df['Cost_minus_outliers'] = replace_outliers(df['Cost'],4)

In [13]:
df.describe()

Unnamed: 0,Cost,Cost_minus_outliers
count,2107.0,2107.0
mean,24282.213702,1962.37412
std,147539.712409,1160.131607
min,0.0,0.0
25%,1280.0,1280.0
50%,1894.6,1894.6
75%,2323.03,2323.03
max,999999.0,7578.4


## For a real bonus round, identify the open access prices paid by subject area.

In [14]:
# Create a "Subject column" with the titles of all journals:
df['Subject area'] = df['Journal title'].str.lower()

In [15]:
# List of terms to be deleted from the Subject Area column:
replace_values = {
    'journal':'', 'j ':'', ' j':'', 'of':'', 'acs':'', 'society':'', 'reviews':'',
    'traffic':'', 'uk':'', 'american':'', 'plos one':'', 'plos':'', '\n':'',
    'trends in ':'', '\s\d':'', '\s\D$':'', 'acta':'', '&':''}

df.replace({"Subject area": replace_values}, regex=True, inplace=True)

# Fill empty strings with a standard message:
df['Subject area'].replace('', 'Journal title with no subject area name', inplace=True)

# Strip whitespaces once again:
df = df.apply(lambda x: x.str.strip()if x.dtype == "object" else x)

# List the average open access prices paid by subject area:
df[['Subject area','Cost_minus_outliers']].groupby('Subject area').mean()

Unnamed: 0_level_0,Cost_minus_outliers
Subject area,Unnamed: 1_level_1
Journal title with no subject area name,1226.689665
abnormal psychology,2534.530000
academy nutrition and dietetics,2379.540000
acquired immune deficiency syndromes,2034.750000
acquired immune deficiency syndroms (jaids),1836.920000
addiction,2136.225000
advances in experimental medicine and biology,1928.457500
affective disorders,2123.720000
age,2170.140000
age and ageing,2040.000000
