# Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# xgboost
from sklearn import metrics
from xgboost import XGBClassifier

# PCA
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo
from factor_analyzer import FactorAnalyzer
from sklearn.decomposition import PCA
from kneed import KneeLocator

# logreg / rfe
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# to display all rows in dataframes
pd.set_option('display.max_rows', None) 

  from pandas import MultiIndex, Int64Index


# Load data

In [2]:
# Read the grouped dataset and the train/test feature and label tables from
# CSV files under ../data (paths are relative to the working directory).
df = pd.read_csv("../data/grouped_data.csv")
X_train = pd.read_csv("../data/X_train_final.csv")
X_test = pd.read_csv("../data/X_test_final.csv")
y_train = pd.read_csv("../data/y_train_final.csv")
y_test = pd.read_csv("../data/y_test_final.csv")

In [3]:
# One-hot encode the nominal columns.
# Train and test are encoded separately, so a category that occurs in only one
# of them would leave the two frames with different columns. The test frame is
# therefore re-aligned to the training columns: dummy columns unseen in
# training are dropped and dummy columns missing from the test set are added
# and filled with 0.
features_nominal = ['order_1', 'order_2', 'order_3', 'order_6', 'order_7']
X_train = pd.get_dummies(X_train, columns=features_nominal)
X_test = pd.get_dummies(X_test, columns=features_nominal).reindex(
    columns=X_train.columns, fill_value=0
)

# Feature Selection

In [4]:
# Running list of feature names nominated by the selection steps below.
# A name is appended once per step that selects it, so duplicates are expected;
# the frequencies are counted later.
impt_feat = []

## XGB feature importance

In [5]:
# Train an XGBoost classifier on the training set (labels flattened to 1-D).
xgb = XGBClassifier(eval_metric=['logloss'], use_label_encoder=False)
xgb.fit(X_train, y_train.values.ravel())

# Map every training column to the importance score of the fitted model.
feats = dict(zip(X_train.columns, xgb.feature_importances_))

# Tabulate the scores and rank the features from most to least important.
importances = pd.DataFrame(feats.items(), columns=['Feature', 'Importance'])
importances = importances.sort_values(by=['Importance'], ascending=False)

# Nominate the 30 highest-ranked features.
impt_feat.extend(importances['Feature'].head(30).tolist())

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


## PCA dimensionality reduction

Remove categorical features

In [6]:
# Frame used for the factor analysis: the grouped data without the 'label'
# column, the 'sevenmers' / 'gene_id' / 'transcript_id' columns and the nominal
# order_* columns.
# NOTE(review): assumes every remaining column is numeric — confirm.
df_pca = df.drop(columns = ['label', 'sevenmers', 'gene_id', 'transcript_id', 'order_1', 'order_2', 'order_3', 'order_6', 'order_7'])

In [7]:
# Fit a 10-factor analysis ('principal' method, varimax rotation) on df_pca.
fa = FactorAnalyzer(n_factors = 10, method = 'principal', rotation='varimax')
fa.fit(df_pca)
# Keep the first element returned by get_eigenvalues(); the second is discarded.
eigenvalues, _ = fa.get_eigenvalues()
# Factor-variance summary of the fitted model.
# NOTE(review): `eigenvalues` and `variances` are not read again in the visible code.
variances = fa.get_factor_variance()

In [8]:
def evaluate_pcs(num_of_pcs, data):
    """Return the varimax-rotated loadings of ``data`` with weak entries blanked.

    A ``FactorAnalyzer`` using the 'principal' method and 'varimax' rotation is
    fitted on ``data`` with ``num_of_pcs`` factors. The loadings are returned as
    a DataFrame indexed by the columns of ``data``, with one column per
    component named "PC1", "PC2", and so on. Any loading that is not <= -0.7 or
    >= 0.7 is replaced by the empty string, so only strong loadings stay visible.
    """
    def keep_strong(x):
        # A loading counts as strong when its value is at or beyond +/-0.7.
        is_strong = x <= -0.7 or x >= 0.7
        return x if is_strong else ""

    analyzer = FactorAnalyzer(n_factors=num_of_pcs, method='principal', rotation='varimax')
    analyzer.fit(data)

    # One row per input column, one (integer-labelled) column per component.
    table = pd.DataFrame(analyzer.loadings_).set_index(data.columns)
    table = table.applymap(keep_strong)

    # Relabel the 0-based component positions as PC1, PC2, ...
    pc_names = {idx: "PC" + str(idx + 1) for idx in table.columns}
    return table.rename(columns=pc_names)

# The function above (evaluate_pcs) generates the rotated loading matrix. Recall that we use
# this matrix to determine whether the PCs generated are easily interpretable and appropriate.
# The argument "num_of_pcs" specifies the number of PCs we wish to generate.

In [9]:
# Loading table for 9 components computed from df_pca; entries that are not
# <= -0.7 or >= 0.7 are blank strings.
PCA_df = evaluate_pcs(9,df_pca)

In [10]:
# For every component, collect the row positions whose loading was blanked
# (stored as an empty string) in PCA_df.
value = []
for pc in PCA_df.columns:
    blanked = PCA_df[pc] == ''
    value.extend(list(np.flatnonzero(blanked)))

# Count how many components blanked each row position.
value = pd.DataFrame(value).rename(columns={0: 'rowno'})
val_counts = pd.DataFrame(value.value_counts()).reset_index()

In [11]:
# Row positions that were blanked in every component, i.e. variables without a
# strong loading on any PC.
# The count column is selected by position because its label depends on the
# pandas version (0 before 2.0, "count" from 2.0 onwards), and the number of
# components is read from PCA_df instead of being hard-coded.
drop_cols = val_counts.loc[val_counts.iloc[:, -1] == PCA_df.shape[1], 'rowno']

In [12]:
# PCA_df is indexed by the column names of df_pca, so indexing that index with
# the row positions yields the column names to remove from the full frame.
keep_df = df.drop(columns = list(PCA_df.index[drop_cols]))

In [13]:
# Nominate every column that survived the drop.
# NOTE(review): keep_df is derived from df, not df_pca, so the columns excluded
# from the factor analysis (e.g. 'label') are appended as well — confirm intended.
impt_feat.extend(keep_df.columns)

## RFE Recursive Feature Elimination

In [14]:
# Logistic-regression estimator (max_iter=1000) used as the base model for the
# recursive feature elimination below.
logreg = LogisticRegression(max_iter=1000)

In [15]:
# Recursively eliminate features with the logistic-regression estimator until
# 30 remain, fitting on the training set (labels flattened to 1-D).
rfe3 = RFE(logreg, n_features_to_select=30).fit(X_train, y_train.values.ravel())

In [16]:
# Names of the training columns retained by RFE (selected through its boolean
# support mask), added to the nominations.
cols_keep = X_train.columns[rfe3.support_].values
impt_feat.extend(cols_keep)

## Feature Importance using Random Forest

In [17]:
# baseline model with default parameters
# (random_state fixed to 1; n_jobs=-1 is passed for parallel fitting)
forest1 = RandomForestClassifier(random_state = 1, n_jobs= -1)
# Fit on the training set with the labels flattened to 1-D.
forest1.fit(X_train,y_train.values.ravel())

RandomForestClassifier(n_jobs=-1, random_state=1)

In [18]:
# Map every training column to the importance score of the fitted forest.
feats = dict(zip(X_train.columns, forest1.feature_importances_))

# Tabulate the scores and rank the features from most to least important.
importances = pd.DataFrame(feats.items(), columns=['Feature', 'Importance'])
importances = importances.sort_values(by=['Importance'], ascending=False)

In [19]:
# Nominate the first 30 features of the importance table, which is sorted in
# descending order of importance.
impt_feat.extend(importances.Feature.iloc[0: 30].tolist())

# Check the Frequencies

In [20]:
# Keep only the features that were nominated at least twice.
features = pd.DataFrame(impt_feat).rename(columns={0: 'feat'})
# The count column is named explicitly because its default label depends on the
# pandas version (0 before 2.0, "count" from 2.0 onwards), so selecting it as
# feature_count[0] fails on newer releases.
feature_count = features.value_counts().reset_index(name='count')
impt_feat = feature_count.loc[feature_count['count'] >= 2, 'feat']

# Remove Collinear Features

In [21]:
# https://stackoverflow.com/questions/29294983/how-to-calculate-correlation-between-all-columns-and-remove-highly-correlated-on
def remove_collinear_features(df_model, target_var, threshold, verbose=False, final_features=None):
    '''
    Objective:
        Remove collinear features from a dataframe. For every pair of features
        whose absolute correlation is at least the threshold, the feature whose
        correlation with the target (dependent) variable is weaker in magnitude
        is dropped. Removing collinear features can help a model to generalize
        and improves the interpretability of the model.

    Inputs:
        df_model: dataframe holding the features and the target column
        target_var: name of the target (dependent) variable column
        threshold: feature pairs with an absolute correlation greater than or
            equal to this value are treated as collinear
        verbose: set to "True" for the log printing
        final_features: optional list that is extended in place with the
            remaining column names; the target column is included in them

    Output:
        dataframe that contains only the non-highly-collinear features (plus
        the target column); the input dataframe is not modified
    '''

    # Feature-to-feature correlations. The target is excluded by keyword:
    # the positional form drop(target_var, 1) was removed in pandas 2.0.
    corr_matrix = df_model.drop(columns=target_var).corr()
    target = df_model[target_var]
    names = corr_matrix.columns
    drop_cols = []

    # Visit every pair above the diagonal once, column by column.
    for col_pos in range(1, len(names)):
        col_name = names[col_pos]
        for row_pos in range(col_pos):
            row_name = names[row_pos]
            val = abs(corr_matrix.iloc[row_pos, col_pos])

            # NaN correlations (e.g. constant columns) never count as collinear.
            if not val >= threshold:
                continue

            if verbose:
                print(col_name, "|", row_name, "|", round(val, 2))
            col_value_corr = df_model[col_name].corr(target)
            row_value_corr = df_model[row_name].corr(target)
            if verbose:
                print("{}: {}".format(col_name, np.round(col_value_corr, 3)))
                print("{}: {}".format(row_name, np.round(row_value_corr, 3)))

            # Compare magnitudes: a strongly negative correlation with the
            # target is as informative as a strongly positive one.
            if abs(col_value_corr) < abs(row_value_corr):
                dropped = col_name
            else:
                dropped = row_name
            drop_cols.append(dropped)

            if verbose:
                print("dropped: " + dropped)
                print("-----------------------------------------------------------------------------")

    # Drop each flagged column once, keeping the original column order.
    df_model = df_model.drop(columns=list(dict.fromkeys(drop_cols)))

    # extend() mutates the caller's list and returns None, so its result must
    # not be assigned back.
    if final_features is not None:
        final_features.extend(df_model.columns.tolist())

    return df_model

In [22]:
# Restrict the training features to the names kept by the frequency filter.
X_train = X_train[impt_feat]
# Put features and labels side by side; this rebinds `df`, replacing the
# grouped dataset loaded earlier.
# NOTE(review): assumes y_train holds a single 'label' column — confirm.
df = pd.concat([X_train, y_train], axis=1)
final_features = []
# The function extends final_features in place with the columns that remain
# after dropping one feature of each pair correlated at 0.9 or above.
remove_collinear_features(df, 'label', 0.9, False, final_features)
# The remaining columns include the target itself, so take it out of the list.
final_features.remove('label')

  corr_matrix = df_model.drop(target_var, 1).corr()


In [23]:
final_features

['mean_current_3_median',
 'sd_current_2_std',
 'mean_current_1_median',
 'mean_current_1_min',
 'mean_current_1_std',
 'mean_current_2_std',
 'count_G',
 'mean_current_3_min',
 'diff_sd_current_1_std',
 'mean_current_3_std',
 'order_6_T',
 'sd_current_2_min',
 'sd_current_3_median',
 'diff_sd_current_2_std',
 'diff_mean_current_2_std',
 'mean_current_3_max',
 'relative_position',
 'mean_current_2_max',
 'diff_mean_current_1_min',
 'sd_current_2_max',
 'sd_current_1_median',
 'dwelling_time_3_max',
 'diff_sd_current_2_median',
 'sd_current_3_max',
 'sd_current_3_std',
 'order_2_G',
 'diff_sd_current_1_median']