# Data Cleaning

Load the project metadata and the GO (Gene Ontology) annotation data, clean both, and merge them into a single DataFrame.

```
Dustin Michels
November 2017
```

## Reading Data

In [1]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
from pandas.plotting import parallel_coordinates

In [2]:
# Directory layout, relative to this notebook. The trailing slashes
# matter: file paths are built from these by plain string concatenation.
data_path = '../../data/'
go_data_path = f"{data_path}go_downloads/"
img_path = '../imgs/'

### Read Metadata CSV

In [3]:
# Load the project metadata table
meta_df = pd.read_csv(data_path + "project_metadata_functional.csv")

# Make column names neater: trim, lower-case, spaces -> underscores
meta_df.columns = (
    meta_df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)
meta_df = meta_df.rename(columns={'sample_details': 'zone'})

# Split "lat,long" into separate numeric columns.
# `n` must be passed by keyword: pandas 2.x rejects it positionally.
# Values are stripped and cast to float so they can be used numerically
# (a malformed coordinate raises here instead of surfacing later).
lat_long = meta_df['lat/long'].str.split(',', n=1)
meta_df['lat'] = lat_long.str[0].str.strip().astype(float)
meta_df['long'] = lat_long.str[1].str.strip().astype(float)

# Clean up the label columns and mark low-cardinality columns as categorical
meta_df['region'] = meta_df['region'].str.strip().astype('category')
meta_df['zone'] = meta_df['zone'].str.strip().astype('category')
meta_df['run_id'] = meta_df['run_id'].astype('category')

# Drop columns that are not needed for the analysis
# ('lat/long' is now redundant with the 'lat' and 'long' columns)
meta_df = meta_df.drop(
    ['downloaded', 'link_to_info', 'student', 'lat/long'],
    axis=1)

# Display first few items
display("Metadata (Head):", meta_df.head())

'Metadata (Head):'

Unnamed: 0,region,run_id,filename,zone,depth_(m),temp_(c),chlorophyl_(mg_chl/m3),nitrate_(µmol/l),oxygen_(µmol/kg),salinity_(psu),lat,long
0,Southern Ocean (near Antarctica),ERR599104,ERR599104_MERGED_FASTQ_GO.csv,deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139
1,Southern Ocean (near Antarctica),ERR599090,ERR599090_MERGED_FASTQ_GO.csv,surface water layer,5,0.67108,0.065273,27.501159,343.4373,34.35388,-62.0385,-49.529
2,Southern Ocean (near Antarctica),ERR599008,ERR599008_MERGED_FASTQ_GO.csv,mesopelagic zone,790,0.45883,0.010087,38.821249,203.8394,34.67996,-61.9689,-49.5017
3,South Pacific (near the Marquesas),ERR598948,ERR598948_MERGED_FASTQ_GO.csv,deep chlorophyll maximum layer,115,24.69625,0.295808,2.333576,179.916875,36.097575,-9.0063,-139.1394
4,South Pacific (near the Marquesas),ERR598992,ERR598992_MERGED_FASTQ_GO.csv,surface water layer,5,26.54413,0.166512,3.986359,186.2407,35.3662,-8.9971,-139.1963


### Read GO Annotation CSVs

In [4]:
def get_df(idx):
    """Load and normalise the GO annotation CSV for one sample.

    Relies on two notebook-level names: ``meta_df`` (its ``filename`` and
    ``run_id`` columns) and ``go_data_path`` (directory prefix of the CSVs).

    Parameters
    ----------
    idx : int
        Zero-based row position of the sample within ``meta_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``run_id``, ``name``, ``namespace`` and ``read_percent``,
        ordered by descending read count with a fresh 0..n-1 index.
        Despite its name, ``read_percent`` is a fraction of the file's
        total read count, not a value scaled to 100.
    """
    column_names = ['go_id', 'name', 'namespace', 'read_count']

    # Look the sample up by position (.iloc): callers pass a loop counter,
    # which only equals the index label when meta_df has a default index.
    filename = meta_df['filename'].iloc[idx]
    run_id = meta_df['run_id'].iloc[idx]

    # Read GO csv (the files have no header row)
    filepath = f"{go_data_path}{filename}"
    df = pd.read_csv(filepath, header=None, names=column_names)

    # Tag every row with the run it came from
    df.insert(0, 'run_id', run_id)

    # Most abundant annotations first
    df = df.sort_values('read_count', ascending=False).reset_index(drop=True)

    # Each row's share of this file's total reads
    df['read_percent'] = df['read_count'] / df['read_count'].sum()

    # The raw id and count are not needed downstream
    return df.drop(['go_id', 'read_count'], axis=1)

# Call helper function for every row of the metadata table, then
# concatenate the resulting dataframes in a single step. (Concatenating
# inside the loop re-copies the growing frame on every iteration.)
df = pd.concat(
    [get_df(i) for i in range(len(meta_df))],
    ignore_index=True)

# Indicate categorical data
df['run_id'] = df['run_id'].astype('category')
df['namespace'] = df['namespace'].astype('category')

# Display
display("GO Annotations (head & tail):", df.head(), df.tail())

'GO Annotations (head & tail):'

Unnamed: 0,run_id,name,namespace,read_percent
0,ERR599104,catalytic activity,molecular_function,0.066795
1,ERR599104,oxidation-reduction process,biological_process,0.058572
2,ERR599104,ATP binding,molecular_function,0.054929
3,ERR599104,metabolic process,biological_process,0.040067
4,ERR599104,membrane,cellular_component,0.031069


Unnamed: 0,run_id,name,namespace,read_percent
24086,ERR599031,vesicle docking involved in exocytosis,biological_process,6.473864e-08
24087,ERR599031,mitochondrial pyruvate transport,biological_process,6.473864e-08
24088,ERR599031,galactosylceramide catabolic process,biological_process,6.473864e-08
24089,ERR599031,glycerol ether metabolic process,biological_process,6.473864e-08
24090,ERR599031,cutinase activity,molecular_function,6.473864e-08


## Merge the Two DataFrames Together

In [5]:
# Attach each sample's metadata to its GO annotation rows.
# The join key is named explicitly; without `on`, pandas joins on every
# column name the two frames happen to share, so an unrelated shared
# column would silently change the result.
full_df = df.merge(meta_df, on='run_id')

# The source filename is no longer needed once the frames are joined
full_df = full_df.drop('filename', axis=1)

display('Full DataFrame (head & tail):', full_df.head(), full_df.tail())

'Full DataFrame (head & tail):'

Unnamed: 0,run_id,name,namespace,read_percent,region,zone,depth_(m),temp_(c),chlorophyl_(mg_chl/m3),nitrate_(µmol/l),oxygen_(µmol/kg),salinity_(psu),lat,long
0,ERR599104,catalytic activity,molecular_function,0.066795,Southern Ocean (near Antarctica),deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139
1,ERR599104,oxidation-reduction process,biological_process,0.058572,Southern Ocean (near Antarctica),deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139
2,ERR599104,ATP binding,molecular_function,0.054929,Southern Ocean (near Antarctica),deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139
3,ERR599104,metabolic process,biological_process,0.040067,Southern Ocean (near Antarctica),deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139
4,ERR599104,membrane,cellular_component,0.031069,Southern Ocean (near Antarctica),deep chlorophyll maximum layer,90,-0.78154,0.540091,33.109231,325.402987,34.319478,-62.2231,-49.2139


Unnamed: 0,run_id,name,namespace,read_percent,region,zone,depth_(m),temp_(c),chlorophyl_(mg_chl/m3),nitrate_(µmol/l),oxygen_(µmol/kg),salinity_(psu),lat,long
24086,ERR599031,vesicle docking involved in exocytosis,biological_process,6.473864e-08,Arabian Sea,mesopelagic zone,600,12.066048,0.005649,31.802451,1.63999,35.693546,20.8457,63.5851
24087,ERR599031,mitochondrial pyruvate transport,biological_process,6.473864e-08,Arabian Sea,mesopelagic zone,600,12.066048,0.005649,31.802451,1.63999,35.693546,20.8457,63.5851
24088,ERR599031,galactosylceramide catabolic process,biological_process,6.473864e-08,Arabian Sea,mesopelagic zone,600,12.066048,0.005649,31.802451,1.63999,35.693546,20.8457,63.5851
24089,ERR599031,glycerol ether metabolic process,biological_process,6.473864e-08,Arabian Sea,mesopelagic zone,600,12.066048,0.005649,31.802451,1.63999,35.693546,20.8457,63.5851
24090,ERR599031,cutinase activity,molecular_function,6.473864e-08,Arabian Sea,mesopelagic zone,600,12.066048,0.005649,31.802451,1.63999,35.693546,20.8457,63.5851


## Make Top-25 dataframe

In [6]:
## Build the union of each sample's 25 most abundant functional
## groups. For this data the union contains 29 functional groups.

# Take the first 25 rows of every run_id group (this is the "top 25"
# only because rows are already ordered by abundance within each run)
grouped_df = full_df.groupby('run_id').head(25)
names = grouped_df['name'].unique()

# Keep every row of full_df whose name is one of the 29, as an
# independent copy rather than a slice of full_df
top_df = full_df.loc[full_df['name'].isin(names)].copy()

In [7]:
# Shorten long function names: anything over 38 characters is cut to
# its first 35 characters plus '...' (38 characters in total)
top_df['name'] = top_df['name'].apply(
    lambda label: label if len(label) <= 38 else label[:35] + '...')

In [8]:
# Number of rows kept in the top-groups frame
top_df.shape[0]

319

In [9]:
# Number of rows in the full merged frame, for comparison
full_df.shape[0]

24091