In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import time
import sys,os
import warnings
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Silence every warning category for the rest of the session. This also hides
# deprecation notices raised by the libraries used below, so comment it out
# when debugging.
warnings.filterwarnings('ignore')

In [3]:
# Directory that is put on the import path (presumably where the project's
# helper modules such as read_data live). The CAUSAL_IMPACT_MODULES
# environment variable overrides the original machine-specific default so the
# notebook can run on other machines without editing this cell.
modules_dir = os.environ.get(
    'CAUSAL_IMPACT_MODULES',
    'C:\\Users\\Faith Bagire\\PycharmProjects\\pythonProject\\causal_impact\\modules',
)
# Guard against stacking duplicate entries when the cell is re-run.
if modules_dir not in sys.path:
    sys.path.insert(0, modules_dir)

In [4]:
from read_data import read_data

In [5]:
# Feature matrix: every column except the identifier and the target label.
# NOTE(review): `data` is never assigned in the cells above (read_data is
# imported but not called), so this cell raises NameError on a fresh kernel.
x = data.drop(columns=['id', 'diagnosis'])
# Target vector.
y = data['diagnosis']

NameError: name 'data' is not defined

In [None]:
# Summary statistics of the feature matrix.
x.describe()

#### Feature Selection using Correlation

The simplest method for feature selection is correlation between independent variables. Most ML models perform well when they are trained on only the important features, and in our case we have 30 explanatory features. Using correlation coefficients, we can assume that highly correlated variables are redundant to a model, and keep one (arbitrarily chosen) variable from each such group.

In [None]:
# Heatmap of the pairwise feature correlations, drawn on an explicit axes.
fig_corr, ax_corr = plt.subplots(figsize=(8, 7))
_ = sns.heatmap(x.corr(), ax=ax_corr)

In [None]:
# Pairwise correlation matrix of the features (Pearson, the pandas default).
corr=x.corr()

In [None]:
# Boolean keep-mask over the columns of `corr`: a column is dropped when any
# earlier column (smaller position) has a correlation of at least 0.9 with it.
too_similar = np.triu(corr.to_numpy() >= 0.9, k=1)  # only pairs (i, j) with i < j
columns = ~too_similar.any(axis=0)

In [None]:
# Split the column names into kept / dropped according to the boolean mask.
cols_pass = list(x.columns[columns])
cols_dropped_corr = list(x.columns[~columns])

In [None]:
# Report how many columns survived the correlation filter, then list them.
print('Number of selected cols: {}'.format(len(cols_pass)))
cols_pass

In [None]:
# Report how many columns the correlation filter removed, then list them.
print('Number of removed cols: {}'.format(len(cols_dropped_corr)))
cols_dropped_corr

In [None]:
# Nested-dict form of the matrix: {column: {other_column: correlation}}.
corr_dict=corr.to_dict()

In [None]:
# For every feature, list the features whose correlation with it lies in
# [0.9, 1); the strict upper bound leaves out entries equal to 1, such as the
# feature's correlation with itself.
high_col = {
    col: [other for other, r in corr_dict[col].items() if 0.9 <= r < 1]
    for col in corr.columns
}

In [None]:
# Drop keys with an empty list (features not highly correlated with any other variable)
high_col={k: v for k, v in high_col.items() if v}

In [None]:
# Display the mapping feature -> list of features highly correlated with it.
high_col

By removing columns that are highly correlated with an earlier column (>= 0.9, which is usually considered a very high correlation), we are left with 20 features.

#### Tree based feature selection and random forest classification

In [None]:
# Feature matrix restricted to the columns kept by the correlation filter.
x_new=x[cols_pass]

In [None]:
# Split the full feature matrix (all columns of x) into 80 % train and 20 % test;
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
def random_forest_fimportance(x_train, y_train, random_state=None):
    """Fit a random forest and report its feature importances.

    Prints the features ranked by importance (most important first), draws a
    bar chart of the importances and returns the fitted classifier.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features. The column names are used as tick labels of the
        bar chart, so a DataFrame (not a bare array) is required.
    y_train : array-like
        Training target.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestClassifier``. The default ``None``
        keeps the original unseeded behaviour; pass an integer to make the
        importances and the fitted model repeatable between runs.

    Returns
    -------
    RandomForestClassifier
        The classifier fitted on ``x_train`` / ``y_train``.
    """
    clf_rf_5 = RandomForestClassifier(random_state=random_state)
    clf_rf_5.fit(x_train, y_train)
    importances = clf_rf_5.feature_importances_
    # Column positions ordered from most to least important.
    indices = np.argsort(importances)[::-1]
    n_features = x_train.shape[1]

    # Print the feature ranking (positions refer to x_train's column order).
    print("Feature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d (%f)" % (rank, idx, importances[idx]))

    # Plot the importances on a fresh figure so repeated calls never draw on
    # top of an earlier chart.
    fig, ax = plt.subplots(figsize=(14, 7))
    ax.set_title("Feature importances")
    ax.bar(range(n_features), importances[indices])
    ax.set_xticks(range(n_features))
    ax.set_xticklabels(x_train.columns[indices], rotation=90)
    ax.set_xlim([-1, n_features])
    plt.show()

    return clf_rf_5

In [None]:
# Baseline model: forest trained on the split that contains all feature columns.
rf_model1=random_forest_fimportance(x_train,y_train)

In [None]:
# Score the baseline forest on its held-out split; predict once and reuse the
# predictions for both the accuracy and the confusion matrix.
y_pred = rf_model1.predict(x_test)
acc_score = accuracy_score(y_test, y_pred)
print('Accuracy is: ', acc_score)
confusion_mat = confusion_matrix(y_test, y_pred)
_ = sns.heatmap(confusion_mat, annot=True, fmt="d")

In [None]:
# Use the same seed as the baseline split: with an identical seed and row count
# the same rows land in the test set, so only the feature columns differ and
# the two accuracy scores are directly comparable.
x_train1, x_test1, y_train1, y_test1 = train_test_split(x_new, y, test_size=0.2, random_state=42)

In [None]:
# Second model: forest trained on the correlation-filtered feature columns.
rf_model2=random_forest_fimportance(x_train1,y_train1)

In [None]:
# Score the reduced-feature forest on its held-out split; predict once and
# reuse the predictions for both metrics.
y_pred1 = rf_model2.predict(x_test1)
acc_score1 = accuracy_score(y_test1, y_pred1)
print('Accuracy is: {}'.format(acc_score1))
confusion_mat1 = confusion_matrix(y_test1, y_pred1)
_ = sns.heatmap(confusion_mat1, annot=True, fmt="d", cbar=False)

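Comparing the accuracy scores and confusion matrices, the RF trained on 20 features actually reaches a higher accuracy than the model trained on all 30 features. Note that the forests are not seeded, so the size of this difference can vary from run to run.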

### Stepwise forward selection and backward elimination

In [None]:
def backward_regression(X, y,
                           threshold_out,
                           verbose=False):
    """Backward elimination of features driven by OLS p-values.

    Starts from all columns of ``X`` and repeatedly refits an OLS model with
    an added constant, removing the feature with the largest p-value for as
    long as that p-value exceeds ``threshold_out``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_out : float
        A feature is removed while the largest p-value is above this value.
    verbose : bool, optional
        Print each removed feature together with its p-value.

    Returns
    -------
    tuple
        ``(pvalues, model)``: the p-values of the last fit without its first
        entry, and the fitted results object of that last fit.
    """
    remaining = list(X.columns)
    while True:
        design = sm.add_constant(pd.DataFrame(X[remaining]))
        model = sm.OLS(y, design).fit()
        # Skip the first entry of the p-values (the intercept position).
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # NaN when no feature is left
        # Stop as soon as nothing exceeds the threshold (NaN compares False).
        if not worst_pval > threshold_out:
            return pvalues, model
        worst_feature = pvalues.idxmax()
        remaining.remove(worst_feature)
        if verbose:
            print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))

In [None]:
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Forward stepwise selection of features driven by OLS p-values.

    Starting from no features, each round fits one OLS model (with an added
    constant) per remaining candidate and adds the candidate with the
    smallest p-value, as long as that p-value is below ``threshold_in``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Response passed to ``sm.OLS``.
    threshold_in : float
        A candidate is added only if its p-value is below this value.
    verbose : bool, optional
        Print each added feature together with its p-value.

    Returns
    -------
    Fitted OLS results
        The model refitted on exactly the selected features plus the
        intercept. When nothing is selected this is an intercept-only model.
    """
    def _fit(columns):
        # Fit OLS on the given feature columns plus an intercept.
        if columns:
            design = sm.add_constant(pd.DataFrame(X[columns]))
        else:
            # No predictor selected: fall back to an intercept-only design.
            design = pd.DataFrame({'const': 1.0}, index=X.index)
        return sm.OLS(y, design).fit()

    included = []
    while True:
        # Keep X's column order so candidate order (and tie-breaking) is
        # deterministic, unlike a set difference.
        excluded = [col for col in X.columns if col not in included]
        if not excluded:
            break
        # Explicit float dtype: an untyped empty Series is object dtype on
        # current pandas, which breaks idxmin().
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            candidate = _fit(included + [new_column])
            new_pval[new_column] = candidate.pvalues[new_column]
        best_pval = new_pval.min()
        # NaN compares False, so an all-NaN round stops the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

    # Refit on the final selection; the last candidate fit of the loop also
    # contains a feature that was rejected and must not be returned.
    return _fit(included)

In [None]:
# Forward selection on all features with an entry threshold of 0.05.
model_fwd=forward_regression(x,y,0.05,verbose=True)

In [None]:
# Dict {term: p-value} of the forward model; its keys are used below as the set
# of terms that model contains.
cols_pass_fwd=model_fwd.pvalues.to_dict()

In [None]:
# Backward elimination on all features: features are removed while the largest
# p-value exceeds 0.6.
p_values,model_bwd=backward_regression(x,y,0.6,verbose=True)

In [None]:
# backward_regression returns the p-values of the features that SURVIVED the
# elimination, so the dropped columns are the ones missing from that index.
cols_dropped_bwd = x.columns.difference(p_values.index).to_list()

In [None]:
# Columns common to cols_dropped_bwd and cols_dropped_corr.
set(cols_dropped_bwd)&set(cols_dropped_corr)

In [None]:
# Columns kept by the correlation filter that are also terms of the forward model.
set(cols_pass)&set(cols_pass_fwd)

Because the correlation method simply keeps the first variable among two or three that are highly
correlated, I used a simple rule: variables that were removed by both methods (correlation and
backward elimination) are dropped, and variables that were selected by both methods (correlation
and forward selection) are kept. For the remaining variables, on which the two methods do not
agree, I arbitrarily select one variable from each group of highly correlated variables.

In [None]:
# Columns on which the methods do not agree: everything that is neither in the
# "kept by both" set nor in the "dropped by both" set. Parentheses spell out
# that each `&` is evaluated before the `|`.
x.columns.difference(
    (set(cols_pass) & set(cols_pass_fwd))
    | (set(cols_dropped_bwd) & set(cols_dropped_corr))
)

#### Selecting the optimal columns 
*'area_se'* can be represented by *'radius_se'* \

*'texture_worst'* can be represented by *'texture_mean'*

*'compactness_worst', 'concave points_se','fractal_dimension_mean','fractal_dimension_se','smoothness_mean',
'smoothness_worst','symmetry_mean','symmetry_se','texture_se'* are not correlated(>=0.9) with any other variable so they will be selected\

*'concave points_worst','concave points_mean','concavity_mean'*  I will select *'concave points_mean'*\

*'perimeter_worst','radius_mean', 'perimeter_mean', 'area_mean', 'radius_worst', 'area_worst'* are highly correlated, so I select *'radius_mean'* and *'radius_worst'*

In [None]:
# Final, hand-written list of feature columns.
# NOTE(review): 'texture_mean', 'concave points_se' and 'fractal_dimension_se'
# are named as selected in the markdown notes above but are absent from this
# list — confirm whether that is intentional.
final_cols=['compactness_worst','compactness_mean','compactness_se','concavity_se','concavity_worst',
            'fractal_dimension_worst','radius_se','smoothness_se','symmetry_worst','concave points_mean',
            'smoothness_mean','smoothness_worst','symmetry_mean','fractal_dimension_mean','radius_mean',
            'radius_worst','symmetry_se','texture_se']