# Preparing sample data for model 

In this notebook, I prepare the sample data that is used as placeholder input in the "model" notebooks.

In [11]:
#standard 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

#model to use
from sklearn.ensemble import RandomForestClassifier

#model performance
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold, RandomizedSearchCV, train_test_split
from scipy.stats import randint as sp_randint
from time import time
#for easy visualization of model performance
import scikitplot as skplt

#importing model
import pickle 

#custom function for trim mean and std deviation
def trim_mean_std(data, frac=0.05):
    '''
    Trimmed mean and trimmed sample standard deviation of 1-D data.

    Both statistics are computed after cutting the same proportion `frac`
    of observations from each end of the sorted data, so they describe the
    same central subset.

    Args:
    data (array-like): 1-D numeric observations
    frac (float): proportion of observations to cut from EACH tail
    Returns:
    mean (float): mean of the retained observations
    std (float): standard deviation (ddof=1) of the retained observations
    '''
    mean = stats.trim_mean(data, frac)
    #stats.tstd's `limits` are absolute value bounds, not proportions, so the
    #previous limits=(frac * max, (1 - frac) * max) did not trim by proportion,
    #disagreed with trim_mean, and selected an empty/degenerate set of values
    #whenever max <= 0. Trim by proportion first, then take the plain std.
    std = stats.tstd(stats.trimboth(data, frac))
    return mean, std

#Dictionary to convert three-letter amino acid abbreviations to one-letter codes
#NOTE(review): there is no 'PRO' entry; a Series.map with this dict would give
#NaN for any PRO row - confirm that is intended
aa_dict = {
    'ALA': 'A',
    'CYS': 'C',
    'ASP': 'D',
    'GLU': 'E',
    'PHE': 'F',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LYS': 'K',
    'LEU': 'L',
    'MET': 'M',
    'ASN': 'N',
    'GLN': 'Q',
    'ARG': 'R',
    'SER': 'S',
    'THR': 'T',
    'VAL': 'V',
    'TRP': 'W',
    'TYR': 'Y',
}


#Dictionary to bucket amino acids: every one-letter code maps to the string
#that spells out all members of its bucket
condensed_dict = {
    aa: bucket
    for bucket in ('LAIVMH', 'DEQ', 'STGRFN', 'K')
    for aa in bucket
}

In [None]:
#normal z score function
def z_score(data):
    '''
    Standardise each column: subtract the column mean and divide by the
    column (population) standard deviation.

    Args:
    data (array-like): values with observations along axis 0
    Returns:
    scaled_data: data centred and scaled along axis 0
    '''
    column_means = np.mean(data, axis=0)
    column_stds = np.std(data, axis=0)
    return np.divide(np.subtract(data, column_means), column_stds)

#trim z score function
def trim_log_z_score(data, frac=0.05):
    '''
    Log transform the data, then centre and scale every column with the
    trimmed mean and trimmed standard deviation from trim_mean_std.

    Args:
    data (NUMPY ARRAY!!): numpy array of data to be log transformed and scaled;
        values below 0.1 (including zero and negatives) are clipped to 0.1
        before the log is taken
    frac (float): how much to trim mean and standard deviation
    Returns:
    scaled_data (numpy array): log transformed and scaled data
    '''
    log_data = np.log(np.clip(data, 0.1, None))
    #row 0 holds the per-column trimmed means, row 1 the per-column trimmed stds
    trimmed_stats = np.apply_along_axis(trim_mean_std, 0, log_data, frac=frac)
    centre = trimmed_stats[0, :].reshape(1, -1)
    spread = trimmed_stats[1, :].reshape(1, -1)
    return np.divide(np.subtract(log_data, centre), spread)

def ft_eng_and_scale(data):
    '''
    Build the scaled feature table: trimmed log z-scores of the four signal
    columns followed by plain z-scores of the ratio columns, plus the label.

    Args:
    data (pandas DataFrame): must contain the columns '0%', '1%', '5%', '10%' and 'AA'
    Returns:
    X_df (pandas DataFrame): the four signal columns, the six ratio columns and
        'AA', on a fresh 0..n-1 index with rows in the same order as data
    '''
    nums = ['0%', '1%', '5%', '10%']
    ratio_cols = ['1/0', '5/0', '10/0', '5/1', '10/1', '10/5']
    #NOTE(review): create_ratios is not defined in the visible part of this
    #notebook - it must be defined or imported before this function is called.
    #Presumably it returns one column per entry of ratio_cols, in that order - TODO confirm
    ratios_df = create_ratios(data)
    log_z_signal = trim_log_z_score(data[nums].values)
    z_ratio = z_score(ratios_df.values)
    X = np.concatenate((log_z_signal, z_ratio), axis=1)
    X_df = pd.DataFrame(X, columns=(nums+ratio_cols))
    #assign the labels by position: X_df has a fresh RangeIndex, so assigning the
    #Series itself would align on data's index and produce NaN/mismatched labels
    #whenever data's index is not 0..n-1 (e.g. after dropna or row filtering)
    X_df['AA'] = data['AA'].values
    return X_df

In [7]:
#importing data
IGPS = pd.read_csv('data/IGPS_8hr_0513.csv').dropna(axis=0)
PTP1B = pd.read_csv('data/PTP1B_8hr_0513.csv').dropna(axis=0)

#rearranging columns and setting AA column to single letter abbreviation
#(.copy() makes each selection an independent frame, so the column
#assignments below never act on a slice of another frame)
PTP1B = PTP1B[['0%', '1%', '5%', '10%', 'AA']].copy()
PTP1B['AA'] = PTP1B['AA'].str[0]
PTP1B = PTP1B[PTP1B['AA'] != 'X'].copy()

IGPS = IGPS[['0%', '1%', '5%', '10%', 'AA']].copy()
#assign to the single column 'AA': a list key ([['AA']]) with a 1-D Series
#value can raise "Columns must be same length as key" in recent pandas
IGPS['AA'] = IGPS['AA'].map(aa_dict)

#converting labels to categorical variables
IGPS['AA'] = IGPS['AA'].astype('category')
PTP1B['AA'] = PTP1B['AA'].astype('category')

#combining data (IGPS rows first, then PTP1B rows, on a fresh 0..n-1 index)
both_proteins = pd.concat([IGPS, PTP1B], axis=0, ignore_index=True)

#label protein in overall array
proteins = ['IGPS'] * IGPS.shape[0] + ['PTP1B'] * (both_proteins.shape[0] - IGPS.shape[0])
both_proteins['protein'] = pd.Series(proteins)

#target amino acids
y = both_proteins.AA

In [1]:
def create_filter(AA_excluded, y, reverse=False):
    '''
    Build boolean row masks and the list of amino acid labels that remain
    after leaving out AA_excluded.

    Args:
    AA_excluded (container of str): one-letter codes to leave out
    y (iterable of str): one-letter label of each row
    reverse (bool): when True, also return the complementary mask
    Returns:
    my_filter (list of bool): True where the row's label is not in AA_excluded
    labels (list of str): the known one-letter codes minus AA_excluded, in a fixed order
    reverse_filter (list of bool): only returned when reverse is True;
        True where the row's label is in AA_excluded
    '''
    all_labels = list('LAIVDEQSTFGHRKCNMWY')
    labels = [code for code in all_labels if code not in AA_excluded]
    my_filter = [aa not in AA_excluded for aa in y]
    #equality test kept on purpose so exactly the same values of `reverse`
    #switch on the three-item return
    if reverse == True:
        reverse_filter = [aa in AA_excluded for aa in y]
        return my_filter, labels, reverse_filter
    return my_filter, labels

In [9]:
#amino acids (one-letter codes) that are left out of the filtered data
AA_excluded = ['W', 'Y', 'C']
#my_filter is True for rows whose label is kept, reverse_filter is True for the
#excluded rows, labels lists the codes that remain
my_filter, labels, reverse_filter = create_filter(AA_excluded, y, reverse=True)

In [14]:
#the four signal columns, taken as a plain numpy array
nums = ['0%', '1%', '5%', '10%']
both_proteins_arr = both_proteins[nums].values

#keep only the rows selected by my_filter and hold out 10% of them as a
#test split (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    both_proteins_arr[my_filter, :],
    y[my_filter],
    test_size=0.1,
    random_state=0,
)

In [17]:
#wrap the training features in a DataFrame (default integer column names and index)
sample_labeled= pd.DataFrame(X_train)

In [20]:
#attach the labels by position: sample_labeled has a fresh 0..n-1 index while
#y_train keeps the shuffled row labels from the split, so assigning the Series
#directly would align on index and mislabel rows / insert NaN
sample_labeled['AA'] = np.asarray(y_train)

In [49]:
#draw 10% of the IGPS rows without replacement (fixed seed so the draw is repeatable)
sample_unlabeled = IGPS.sample(frac=0.1, replace=False, random_state=0)

In [50]:
#index labels of the rows that were drawn into sample_unlabeled
short_ind = sample_unlabeled.index

In [51]:
total_ind = IGPS.index
#labeled sample = every IGPS row that was NOT drawn into sample_unlabeled.
#A boolean mask keeps the original row order; indexing .loc with a set is
#rejected by pandas 2.x and a set has no defined order anyway
sample_labeled = IGPS.loc[~total_ind.isin(short_ind)]

In [58]:
#write the unlabeled sample without its AA column and without the index
sample_unlabeled.drop(columns='AA').to_csv('data/sample_unlabeled.csv', index=False)

In [57]:
#write the labeled sample (all columns, index not written)
sample_labeled.to_csv('data/sample_labeled.csv', index=False)

In [55]:
#preview the first rows of the unlabeled sample (still includes AA in memory)
sample_unlabeled.head()

Unnamed: 0,0%,1%,5%,10%,AA
10,44878.0,35399.39127,28197.90367,27808.87681,D
53,45768.0,31829.72942,14369.01023,18841.6909,H
45,80528.0,59971.64002,50153.13583,33555.46352,G
26,43642.0,36039.34709,27277.08781,27101.01449,E
67,98323.0,63613.22351,9940.21114,12906.80585,K
