## Feature Selection

### Chi-square Test

In [None]:
import seaborn as sns
import numpy as np

# Load seaborn's "titanic" example dataset as the working frame.
df = sns.load_dataset("titanic")

In [None]:
# Preview the first rows of the freshly loaded Titanic frame.
df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Keep only the four candidate features and the target column.
# `.copy()` makes the result an independent frame, so the column assignments
# in the following cells modify it directly instead of triggering pandas'
# SettingWithCopyWarning on a slice of the original frame.
df = df[['sex', 'embarked', 'alone', 'pclass', 'survived']].copy()
df.head()

In [None]:
# Binary-encode `sex`: "male" becomes 1, every other value becomes 0.
# Run this cell once: after encoding, the column no longer contains "male",
# so a second run would set every row to 0.
df['sex'] = (df['sex'] == "male").astype(int)
df.head()

In [None]:
### Label-encode `embarked`: every distinct value gets an integer code,
### numbered from 0 in order of first appearance.
# NOTE(review): unique() also yields NaN when values are missing, so NaN gets
# its own entry in the mapping - confirm that is the intended treatment.
categories = df['embarked'].unique()
ordinal_label = dict(zip(categories, range(len(categories))))
df['embarked'] = df['embarked'].map(ordinal_label)

In [None]:
# Check the frame after encoding `embarked`.
df.head()

In [None]:
### Encode `alone` as an integer flag: 1 where the value equals True, else 0.
df['alone'] = df['alone'].eq(True).astype(int)

In [None]:
# Check the frame after encoding `alone`.
df.head()

In [None]:
### Hold out 30% of the rows as a test set. A train/test split does not
### prevent overfitting by itself; it keeps unseen data aside so that
### overfitting can be detected.
from sklearn.model_selection import train_test_split

feature_cols = ['sex', 'embarked', 'alone', 'pclass']
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df['survived'],
    test_size=0.3,
    random_state=100,
)

In [None]:
# Preview the training features produced by the split.
X_train.head()

In [None]:
# Count missing values per training feature.
X_train.isnull().sum()

In [None]:
## Perform the chi-square test on the training split
### chi2 returns two arrays, one entry per feature:
### the chi-square statistic and the corresponding p-value
from sklearn.feature_selection import chi2

chi2_stats, chi2_pvalues = chi2(X_train, y_train)
# Keep the pair under the original name used by the following cells.
f_p_values = (chi2_stats, chi2_pvalues)

In [None]:
# Show the raw chi2 result for the training features.
f_p_values

In [None]:
import pandas as pd

# The second element of the chi2 result holds the p-values; label each one
# with the name of the feature it belongs to.
p_values = pd.Series(f_p_values[1], index=X_train.columns)
p_values

In [None]:
# Rank the features by p-value, smallest first: a smaller p-value is stronger
# evidence that the feature and the target are not independent.
# (Sorting by index would only order the feature names alphabetically.)
p_values.sort_values(ascending=True)

### Correlation 

In [None]:
import pandas as pd

# Load the dataset for the correlation section; this rebinds `df`, replacing
# the frame used in the chi-square section.
df = pd.read_csv("mobile_dataset.csv")
df.head()

In [None]:
# Pairwise correlation matrix over the columns of the frame.
df.corr()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlations between every column except the last one.
corr = df.iloc[:, :-1].corr()
top_features = corr.index

# `corr` already covers exactly the `top_features` columns, so it is plotted
# directly rather than recomputed.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, ax=ax)

### Information gain

In [None]:
import pandas as pd

# Load the dataset used by the remaining sections.
data = pd.read_csv("diabetes.csv")
data.head()

In [None]:
# Feature matrix: the first eight columns, selected by position.
X = data.iloc[:, :8]
# Target vector: the `Outcome` column.
Y = data['Outcome']

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each feature and the target.
# The estimator adds small random noise to continuous features, so the seed is
# fixed to keep the scores (and the chart) identical across runs.
importance = mutual_info_classif(X, Y, random_state=0)

# Label the scores with the columns of X itself, so the index always matches
# the features that were actually scored.
feat_importance = pd.Series(importance, index=X.columns)
feat_importance.plot(kind='barh', color='teal')

### Variance Threshold

In [None]:
from sklearn.feature_selection import VarianceThreshold

# With threshold=0, fitting identifies the features whose variance is zero.
v_threshold = VarianceThreshold(threshold=0).fit(X)

# Selection mask for the features of X.
v_threshold.get_support()

## Wrapper Methods

### Forward Feature Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows; the selector and the model below only see the
# training part.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.30,
                                                    random_state=335)

# max_iter is raised above the default so the solver has room to converge;
# this is the same setting the backward-selection cell uses.
lr = LogisticRegression(max_iter=500)

# Forward selection: forward=True grows the feature set step by step, and
# k_features='best' keeps the best-scoring subset found.
ffs = SequentialFeatureSelector(lr, k_features='best', forward=True,
                                n_jobs=-1)

# Fit the selector on the training split only. Fitting on the full X, Y would
# let the held-out rows influence which features are chosen (data leakage).
ffs.fit(x_train, y_train)
features = list(ffs.k_feature_names_)
print(features)

# Train the final model on the selected features.
lr.fit(x_train[features], y_train)
# NOTE(review): these are predictions on the training rows; use
# x_test[features] when a held-out evaluation is wanted.
y_pred = lr.predict(x_train[features])

### Backward Feature Selection

In [None]:
lr = LogisticRegression(max_iter=500)

# Backward selection: forward=False shrinks the feature set step by step, and
# k_features='best' keeps the best-scoring subset found.
bfs = SequentialFeatureSelector(lr, k_features='best', forward=False,
                                n_jobs=-1)

# Fit the selector on the training split only. Fitting on the full X, Y would
# let the held-out rows influence which features are chosen (data leakage).
bfs.fit(x_train, y_train)
features = list(bfs.k_feature_names_)
print(features)

# Train the final model on the selected features.
lr.fit(x_train[features], y_train)
# NOTE(review): these are predictions on the training rows; use
# x_test[features] when a held-out evaluation is wanted.
y_pred = lr.predict(x_train[features])

### Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE

# Recursive feature elimination around the existing `lr` estimator, keeping
# seven features; fitted on the training split.
rfe = RFE(estimator=lr, n_features_to_select=7).fit(x_train, y_train)

# Predictions for the training rows.
y_pred = rfe.predict(x_train)

## Embedded Methods

### LASSO Regularization (L1)

In [None]:
from sklearn.feature_selection import SelectFromModel

# L1-penalised logistic regression with regularization parameter C=1.
logistic = LogisticRegression(C=1, penalty='l1', solver='liblinear',
                              random_state=7)
logistic.fit(X, Y)

# prefit=True: the selector wraps the already fitted estimator.
model = SelectFromModel(logistic, prefit=True)

# Reduce X to the features the selector keeps.
X_new = model.transform(X)


In [None]:
# Show the feature matrix returned by the selector's transform.
X_new

### Random Forest Importance

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create the random forest; the fixed random_state makes the importances
# reproducible from run to run.
model = RandomForestClassifier(n_estimators=300, random_state=0)

# Fit the model to start training.
model.fit(X, Y)

# Importance score of each feature, in the column order of X.
importances = model.feature_importances_

# Pair every feature name with its importance for visualization.
final_df = pd.DataFrame({'Features': X.columns,
                         "Importances": importances})

# Sort in ascending order for better visualization.
final_df = final_df.sort_values('Importances')

# Plot the importances as bars, labelled with the feature names. Without an
# explicit x column the bars would be labelled with row numbers.
final_df.plot.bar(x='Features', y='Importances', color='teal')