# Filter Methods

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")



In [2]:
df=pd.read_csv(r'C:\\Users\kmmoh\Downloads\santander-customer-satisfaction\train.csv',nrows=10000)


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\\\Users\\kmmoh\\Downloads\\santander-customer-satisfaction\\train.csv'

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
X_train=df.drop(columns=['TARGET'],axis=1)
y=df['TARGET']

# Feature Selection- Dropping constant features
In this step we will be removing the features which have constant features which are actually not important for solving the problem statement

In [None]:
# using sklearn variancethreshold to find constant features
from sklearn.feature_selection import VarianceThreshold
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(X_train) # fit finds the features with zero variance

In [None]:
X_train.columns[var_thres.get_support()] # non constant columns
constant_columns = [column for column in X_train.columns
                    if column not in X_train.columns[var_thres.get_support()]]

print(len(constant_columns))

In [None]:
X_train.drop(constant_columns, axis=1,inplace=True)

In [None]:
X_train.shape

# Removing quasi-constant features
Using variance threshold from sklearn
Variance threshold from sklearn is a simple baseline approach to feature selection. It removes all features which variance doesn’t meet some threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

Here, I will change the default threshold to remove almost / quasi-constant features.

In [None]:
var_thres=VarianceThreshold(threshold=0.1)
var_thres.fit(X_train) # fit finds the features with zero variance  # 0.1 indicates 99% of observations approximately

  # fit finds the features with low variance
    
X_train.columns[var_thres.get_support()] # non constant columns
constant_columns = [column for column in X_train.columns
                    if column not in X_train.columns[var_thres.get_support()]]

print(len(constant_columns))

# quasi-constant features(0.1 threshold)

In [None]:
X_train['ind_var1_0'].value_counts() # example

In [None]:
X_train.drop(constant_columns, axis=1,inplace=True)

In [None]:
X_train.shape

# 2. Feature Selection- With Correlation¶

In [None]:
import seaborn as sns
#Using Pearson Correlation
#plt.figure(figsize=(12,10))
cor = X_train.corr()
#sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)

In [None]:
# with the following function we can select highly correlated features
# it will remove the first feature that is correlated with anything other feature

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [None]:
corr_features = correlation(X_train, 0.9)
len(set(corr_features))

In [None]:
#corr_features

In [None]:
X_train.drop(corr_features, axis=1,inplace=True)

In [None]:
X_train.shape

# Remove features with greater than a threshold percentage of missing values

In [None]:
# Train missing values (in percent)
train_missing = (X_train.isnull().sum() / len(X_train)).sort_values(ascending = False)
train_missing.head(5)

In [None]:
# There are no missing values 
# Identify missing values above threshold
#train_missing = train_missing.index[train_missing > 0.75]
#print(There are 0 columns with more than 75% missing values len(train_missing ))

In [None]:
for column in X_train.columns:
    if X_train[column].dtypes==object :
        print(column)  

In [None]:
# There are no categorical columns

In [None]:
# here for simplicity I will use only numerical variables
# select numerical columns:

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerical_vars = list(X_train.select_dtypes(include=numerics).columns)
data = X_train[numerical_vars]
data.shape

In [None]:
for column in X_train.columns:
    if X_train[column].dtypes==object :
     print(column).head(10)

In [None]:
X_train.var()

In [None]:
for column in X_train.columns:
    if X_train[column].dtypes!=object :
        print(column, X_train[column].value_counts().unique())  

Since the LightGBM model does not need missing values to be imputed, we can directly fit on the training data. We will use Early Stopping to determine the optimal number of iterations and run the model twice, averaging the feature importances to try and avoid overfitting to a certain set of features.

In [None]:
#!pip install lightgbm

In [None]:
# modeling 
import lightgbm as lgb
# Initialize an empty array to hold feature importances
feature_importances = np.zeros(X_train.shape[1])

# Create the model with several hyperparameters
model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X_train,
    y,
    test_size=0.3,
    random_state=0)
model.fit(X_train, y, early_stopping_rounds=100, eval_set = [(x_test, y_test)], 
              eval_metric = 'auc', verbose = 200)
# Record the feature importances
feature_importances += model.feature_importances_
#feature_importances

feature_imp = pd.DataFrame({'feature': list(X_train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)
feature_imp.head()

In [None]:
# Find the features with zero importance
#zero_features = list(feature_importances[feature_importances['importance'] == 0.0]['feature'])
#print('There are %d features with 0.0 importance' % len(zero_features))
#feature_importances

In [None]:
df['TARGET'].value_counts()