# Resampling (grouped_data.csv)

In [1]:
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# CSV file to process. Later cells compare this exact string against
# '../data/grouped_data.csv' (resample + export X/y train files) and
# '../data/grouped_data1.csv' (export final_data1.csv); any other value is
# exported as final_data2.csv. Uncomment one of the alternatives to switch.
input_path = '../data/grouped_data.csv'
# input_path = '../data/grouped_data1.csv'
# input_path = '../data/grouped_data2.csv'

In [5]:
# The train/test split below is commented out because the whole dataset is now used for training.
# # split dataset into X y train test, based on gene_id
# # input: df, split_size
# # output: train df, test df
# def split(df, split_size=0.2):
#     splitter = GroupShuffleSplit(test_size=split_size, n_splits=1, random_state=42)
#     split = splitter.split(df, groups=df['gene_id'])
#     train_inds, test_inds = next(split)
#     train = df.iloc[train_inds]
#     test = df.iloc[test_inds]
    
#     y_train = train['label']
#     X_train = train.drop(['label', 'sevenmers'], axis = 1)
#     y_test = test['label']
#     X_test = test.drop(['label', 'sevenmers'], axis = 1)
    
#     return X_train, y_train, X_test, y_test

def resample(X_train, y_train, over_ratio=0.5, under_ratio=0.75, random_state=42):
    """Rebalance the classes by random oversampling followed by random undersampling.

    The minority class is first oversampled until the minority:majority ratio
    reaches ``over_ratio`` (default 0.5, i.e. 1:2); the majority class is then
    undersampled until the ratio reaches ``under_ratio`` (default 0.75, i.e. 3:4).

    input:
        X_train: feature DataFrame
        y_train: label Series aligned with ``X_train``
        over_ratio: ``sampling_strategy`` passed to ``RandomOverSampler``
        under_ratio: ``sampling_strategy`` passed to ``RandomUnderSampler``
        random_state: seed given to BOTH samplers so the result is reproducible
    output:
        (X_resampled, y_resampled) as returned by the undersampler

    NOTE(review): imblearn documents float ``sampling_strategy`` values as
    valid for binary targets only - confirm ``label`` is binary.
    """
    oversample = RandomOverSampler(sampling_strategy=over_ratio, random_state=random_state)
    X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

    # Seed the undersampler as well: without a random_state the rows it keeps
    # change from run to run, so the exported training set is not reproducible.
    under = RandomUnderSampler(sampling_strategy=under_ratio, random_state=random_state)
    X_train_under, y_train_under = under.fit_resample(X_train_over, y_train_over)
    return X_train_under, y_train_under

# Load the selected file and mark the positional "order_*" features as categorical.
df = pd.read_csv(input_path)
features_nominal = ['order_1', 'order_2', 'order_3', 'order_6', 'order_7']
df[features_nominal] = df[features_nominal].astype('category')

# Only the training file is resampled; the other files are used as loaded.
if input_path == '../data/grouped_data.csv':
    # X_train, y_train, X_test, y_test = split(df)
    y_train = df['label']
    X_train = df.drop(columns=['label', 'sevenmers'])
    X_train, y_train = resample(X_train, y_train)
    # The identifier columns are dropped only after resampling.
    X_train = X_train.drop(['gene_id', 'transcript_id'], axis=1)
    # X_test = X_test.drop(columns=['gene_id', 'transcript_id'])

# Normalisation (all)

In [6]:
def normalise(df):
    """Min-max scale the numerical columns of ``df``.

    int64/float64 columns are rescaled with ``MinMaxScaler``; object/category
    columns are passed through unchanged. Columns of any other dtype are not
    part of the result. The returned frame lists the object/category columns
    first, then the scaled numerical columns, on a fresh 0..n-1 row index.

    input: df
    output: normalised df
    """
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
    string_columns = df.select_dtypes(include=['object', 'category']).columns

    # drop=True discards the old index instead of materialising it as a column,
    # so a named index or an existing column called 'index' cannot break or
    # corrupt the result.
    passthrough = df[string_columns].reset_index(drop=True)

    # Nothing to scale: return the pass-through columns rather than handing
    # the scaler a frame with zero features.
    if numerical_columns.empty:
        return passthrough

    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(df[numerical_columns])
    df_normalised = pd.DataFrame(x_scaled, columns=numerical_columns)

    # Both parts now share a 0..n-1 index, so the concat aligns row by row.
    return pd.concat([passthrough, df_normalised], axis=1)

In [7]:
# The training file is normalised from its resampled features (X_train);
# every other input file is normalised as loaded.
if input_path != '../data/grouped_data.csv':
    df_norm = normalise(df)
else:
    X_train_norm = normalise(X_train)
    # X_test_norm = normalise(X_test)

In [9]:
def check(df):
    """Print the name of every column in ``df`` that holds at least one null value."""
    for name in df.columns:
        has_missing = df[name].isna().any()
        if not has_missing:
            continue
        print(name)

# Report null-containing columns of whichever normalised frame was built for
# this input file (the X_test_norm check stays disabled with the split).
check(X_train_norm if input_path == '../data/grouped_data.csv' else df_norm)

In [24]:
# normalise for dataset 0: this uses the full frame loaded from input_path
# (df), not the resampled X_train, so no rows are added or removed
df_norm_d0 = normalise(df)

In [27]:
# print the name of any column of the dataset-0 frame that contains nulls
check(df_norm_d0)

# Export to CSV

In [23]:
if input_path == '../data/grouped_data.csv':
    # Training data: features and labels are written to separate files.
    X_train_norm.to_csv('../data/X_train_final.csv', index=False)
    # X_test_norm.to_csv('../data/X_test_final.csv', index=False)
    y_train.to_csv('../data/y_train_final.csv', index=False)
    # y_test.to_csv('../data/y_test_final.csv', index=False)
else:
    # grouped_data1.csv goes to final_data1.csv; every other path falls
    # through to final_data2.csv.
    df_norm.to_csv(
        '../data/final_data1.csv'
        if input_path == '../data/grouped_data1.csv'
        else '../data/final_data2.csv',
        index=False,
    )

In [25]:
# write the normalised dataset-0 frame to disk without the row index
df_norm_d0.to_csv('../data/final_data0.csv', index=False)