# Data Cleaning

```
Dustin Michels
November 2017
```

This module:

* Loads metadata, GO data, and taxonomy data CSVs into dataframes
* Cleans up and merges together the dataframes
* Provides some functions for further processing

It is primarily intended to be called from another notebook using the IPython magic `%run`.

In [1]:
import pandas as pd

In [2]:
"""
Define some useful vars
"""

# Snapshot of the namespace before the constants below are defined;
# used at the bottom of this cell to work out which names were added
start_vars = dir()

# Locations of the data and image folders (relative to this notebook)
data_path = '../../data/'
go_data_path = f"{data_path}go_downloads/"
tax_data_path = f"{data_path}taxonomy"
img_path = '../imgs/'

# Column names of the metadata measurements
meta_headers = [
    'depth_(m)',
    'temp_(c)',
    'chlorophyl_(mg_chl/m3)',
    'nitrate_(µmol/l)',
    'oxygen_(µmol/kg)',
    'salinity_(psu)',
    'lat',
    'long',
]

# Run ids of the samples
samples = [
    'ERR599104', 'ERR599090', 'ERR599008',
    'ERR598948', 'ERR598992', 'ERR598999',
    'ERR598995', 'ERR598980', 'ERR599142',
    'ERR599078', 'ERR599031',
]

# Long region description -> short code
region_map = {
    'Southern Ocean (near Antarctica)': 'SO',
    'South Pacific (near the Marquesas)': 'SP',
    'North Pacific': 'NP',
    'North Atlantic (off the coast of Portugal)': 'NA',
    'Arabian Sea': 'AS',
}

# Long zone description -> short code
zone_map = {
    'deep chlorophyll maximum layer': 'DCM',
    'surface water layer': 'SURF',
    'mesopelagic zone': 'MESO',
}

# Names added to the namespace since the snapshot above
# (dir() is already sorted, so sorting the difference keeps its order)
global_vars = sorted(set(dir()).difference(start_vars))

## Define Functions for Reading Data

### Get / Clean Meta Data

In [3]:
def get_meta_df(source=None):
    """Load the project metadata CSV and clean it up.

    `source` may be anything `pd.read_csv` accepts (a path or a
    file-like object). When it is omitted, the file
    'project_metadata_functional.csv' under `data_path` is read.

    Returns a dataframe in which:
      * column names are stripped, lower-cased and use '_' for spaces
      * 'sample_details' is renamed to 'zone'
      * 'lat/long' is split at the first comma into 'lat' and 'long'
        (both are left as strings)
      * 'region', 'zone' and 'run_id' are categorical
      * 'downloaded', 'link_to_info', 'student' and 'lat/long' are dropped
    """
    if source is None:
        source = data_path + "project_metadata_functional.csv"

    # Get and clean project meta data
    meta_df = pd.read_csv(source)

    # Make column names neater
    meta_df.columns = (
        meta_df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False))
    meta_df.rename(columns={'sample_details': 'zone'}, inplace=True)

    # Split lat and long into separate columns.
    # `n` has to be passed by keyword: pandas 2.x no longer accepts it
    # positionally.
    lat_long = meta_df['lat/long'].str.split(',', n=1)
    meta_df['lat'] = lat_long.str[0]
    meta_df['long'] = lat_long.str[1]

    # Clean up 'region' and 'zone' columns
    for col in ('region', 'zone'):
        meta_df[col] = meta_df[col].str.strip()

    # Indicate categorical data
    for col in ('region', 'zone', 'run_id'):
        meta_df[col] = meta_df[col].astype('category')

    # Drop a few columns that are not needed downstream
    meta_df.drop(
        ['downloaded', 'link_to_info', 'student', 'lat/long'],
        axis=1, inplace=True)

    return meta_df

### Get / Clean GO Data

In [4]:
def get_df_helper(idx):
    """Load the GO annotation CSV of one sample as a dataframe.

    `idx` selects the row of the module-level `meta_df` whose
    'filename' and 'run_id' are used. The returned frame has the
    columns 'run_id', 'name', 'namespace' and 'read_percent', sorted
    from the highest read count to the lowest.
    """

    column_names = ['go_id', 'name', 'namespace', 'read_count']

    # The GO csv has no header row, so column names are supplied here
    csv_name = meta_df['filename'][idx]
    go = pd.read_csv(
        f"{go_data_path}{csv_name}", header=None, names=column_names)

    # Tag every row with the run_id of this sample
    go.insert(0, 'run_id', meta_df['run_id'][idx])

    # Highest read counts first, with a fresh 0..n-1 index
    go = go.sort_values('read_count', ascending=False).reset_index(drop=True)

    # Each count as a fraction of the sample's total count
    go['read_percent'] = go['read_count'] / go['read_count'].sum()

    # The id and the raw count are not needed downstream
    return go.drop(['go_id', 'read_count'], axis=1)


def get_GO_df():
    """Build one dataframe holding the GO annotations of all samples.

    Calls `get_df_helper` once for every row of the module-level
    `meta_df` and concatenates the results with a fresh 0..n-1 index.
    'run_id' and 'namespace' are converted to categorical columns.
    """

    # Collect all frames first and concatenate once; concatenating
    # inside the loop would re-copy the accumulated rows every time.
    frames = [get_df_helper(i) for i in range(len(meta_df))]
    df = pd.concat(frames, ignore_index=True)

    # Indicate categorical data
    df['run_id'] = df['run_id'].astype('category')
    df['namespace'] = df['namespace'].astype('category')

    return df

### Get / Clean Taxonomy Data

In [5]:
def get_tax_df(source=None):
    """Read tax_summary.csv into pandas and clean the data into
    tidy (long) format.

    `source` may be anything `pd.read_csv` accepts (a path or a
    file-like object). When it is omitted, 'tax_summary.csv' under
    `tax_data_path` is read.

    Only rows with taxlevel == 2 are kept. Each sample's counts are
    divided by that sample's total, and the sample columns are renamed
    to their run ids. The result has one row per (name, run_id) pair
    with the columns 'name', 'run_id' and 'tax_percent'.

    Raises ValueError if, after dropping the bookkeeping columns, there
    is not exactly one column besides the sample columns.
    """

    mapping_dict = {
        '00_SrnOcean_DCM': 'ERR599104',
        '01_SrnOcean_SURF': 'ERR599090',
        '02_SrnOcean_MESO': 'ERR599008',
        '03_SPac_DCM': 'ERR598948',
        '04_SPac_SURF': 'ERR598992',
        '05_SPac_MESO': 'ERR598999',
        '06_NPac_DCM': 'ERR598995',
        '07_NPac_MESO': 'ERR598980',
        '08_NPac_SURF': 'ERR599142',
        '09_NAtl_SURF': 'ERR599078',
        '10_AraSea_MESO': 'ERR599031',
    }
    # Named so as not to shadow the module-level `samples` list
    sample_cols = list(mapping_dict)

    if source is None:
        source = f"{tax_data_path}/tax_summary.csv"
    tax_df = pd.read_csv(source)

    # Copy the filtered rows so the edits below act on an independent
    # frame instead of a slice of the one that was read
    tax_df = tax_df[tax_df['taxlevel'] == 2].copy()
    tax_df = tax_df.drop(
        ['taxlevel', 'daughterlevels', 'rankID', 'total'], axis=1)

    # Convert counts to fractions of each sample's total
    for col in sample_cols:
        tax_df[col] = tax_df[col] / tax_df[col].sum()

    # Fix names. Columns are relabelled by name rather than by position,
    # so the order of the sample columns in the CSV cannot mislabel them.
    other_cols = [c for c in tax_df.columns if c not in mapping_dict]
    if len(other_cols) != 1:
        raise ValueError(
            "expected exactly one name column besides the sample "
            f"columns, found {other_cols}")
    tax_df = tax_df.rename(columns={other_cols[0]: 'name', **mapping_dict})

    # Restructure from one column per sample to one row per sample
    tax_df = tax_df.melt(
        id_vars='name', var_name='run_id',
        value_name='tax_percent')

    # Rearrange columns
    return tax_df[['name', 'run_id', 'tax_percent']]


## Merge Data with Metadata

In [6]:
def merge_data(meta_df, data_df):
    """Merge data_df (either GO or tax)
    and meta_df together, using run_id"""
    
    full_df = data_df.merge(meta_df, on='run_id')
    full_df.drop('filename', axis=1, inplace=True)
    
    return full_df

## Get DF with Top-N Most Abundant

In [7]:
def get_top_n(full_df, by_col='read_percent', n=25):
    """Get superset of n most abundant rows from full_df
    shared by all samples.

    Eg., 25 most abundant groups in GO_df (by 'read_percent')
    for each run_id

    For every run_id the n rows with the largest `by_col` value are
    found. Every row of full_df whose 'name' occurs among those rows
    (in any run_id) is returned, as an independent copy.
    """

    # Grab the n largest values of by_col within each run_id
    top = full_df.groupby('run_id', observed=True)[by_col].nlargest(n)

    # The innermost index level of `top` holds row *labels* of full_df,
    # so they must be looked up with .loc (label-based), not .iloc
    top_labels = top.index.get_level_values(-1)

    # To get superset, query full_df again with those names
    names = full_df.loc[top_labels, 'name'].unique()
    df = full_df[full_df['name'].isin(names)]

    # Make a deep copy (instead of using a slice of full_df)
    return df.copy(deep=True)

## Further Processing

In [8]:
def rename_shorter(df, regions=None, zones=None):
    """Rename region, zone, and run_id to shorter names,
    based on maps at top of file

    `regions` and `zones` optionally override the module-level
    `region_map` and `zone_map`. The 'region', 'zone' and 'run_id'
    columns of `df` are overwritten in place; 'run_id' becomes
    "<region> (<zone>)". The same dataframe is returned.
    """

    if regions is None:
        regions = region_map
    if zones is None:
        zones = zone_map

    df['region'] = df['region'].map(regions)
    df['zone'] = df['zone'].map(zones)

    # Mapping a categorical column can give a categorical result, which
    # does not support string concatenation; work on object columns.
    short_region = df['region'].astype(object)
    short_zone = df['zone'].astype(object)
    df['run_id'] = short_region + " (" + short_zone + ")"

    return df

In [9]:
def truncate_names(df, col="name", n=35):
    """Truncate strings in the given col at n characters

    A string longer than n + 3 characters is cut to its first n
    characters followed by '...'; shorter strings are left alone, since
    truncating them would not make them any shorter. Non-string cells
    (e.g. missing values) are left unchanged. The column is overwritten
    in place and the same dataframe is returned.
    """

    ellipsis = '...'
    # Only truncate when the result is actually shorter than the input
    limit = n + len(ellipsis)

    def shorten(value):
        if isinstance(value, str) and len(value) > limit:
            return value[:n] + ellipsis
        return value

    df[col] = df[col].apply(shorten)

    return df

## Run the Code!

In [10]:
# Snapshot the namespace so that the names created below can be listed
# afterwards (see df_vars at the end of this cell)
start_vars = dir()

# Load the three data sources. get_GO_df() reads the module-level
# meta_df, so meta_df has to be assigned before it is called.
meta_df = get_meta_df()
go_df = get_GO_df()
tax_df = get_tax_df()

# Attach the sample metadata to each dataset, matching on run_id
full_go = merge_data(meta_df, go_df)
full_tax = merge_data(meta_df, tax_df)

# Names added to the namespace since the snapshot above
df_vars = [x for x in dir() if x not in start_vars]

## Print Info

Helpful when this notebook is run via `%run` from a different notebook.

In [11]:
# import types
# def load_data_info():
#     print("Added these variables to the namespace:")
#     print(global_vars)

#     print("\nAs well as these data frames:")
#     print(df_vars)

#     print("\nAnd these functions:")
#     print([f.__name__ for f in globals().values() if type(f) == types.FunctionType])