# Feature Selection from Mobile data using SequentialFeatureSelector Forward method
Dataset: [mobile_price_train.csv](https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv)

In [7]:
import pandas as pd

In [8]:
# Raw GitHub location of the mobile price training CSV.
url = (
    "https://raw.githubusercontent.com/subashgandyer/datasets/main/"
    "mobile_price_train.csv"
)

In [9]:
# Download the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
# List every column label in the loaded frame (the features plus the price_range target).
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [11]:
# Count how many samples fall into each price_range class.
df["price_range"].value_counts()

1    500
2    500
3    500
0    500
Name: price_range, dtype: int64

### Understand the data
- How many features are there?
- How many samples are there?
- What is the data type of each feature column?
- Which feature(s) do you think are the most important?
- Run some feature selection methods
- Was your intuition right?

### Import the necessary libraries

In [12]:
import sys
import pandas as pd  
import numpy as np
#from pandas_profiling import ProfileReport

### Preview the mobile data

In [13]:
# Preview the first rows of the frame that was loaded earlier in the notebook.
df.head()

# Optional profiling report; left disabled because the ProfileReport import is commented out above.
#ProfileReport(df)

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


### Split the dataset into X and y

In [14]:
# Separate the predictors from the target by column name rather than by
# position, so the split stays correct even if the column order or the
# number of columns in the CSV changes.
X = df.drop(columns="price_range")
y = df["price_range"]

### Sanity check

In [15]:
# Sanity check: the row counts of X and y should match.
(X.shape, y.shape)

((2000, 20), (2000,))

### How many features

In [16]:
# Show the feature column labels kept in X.
X.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi'],
      dtype='object')

### Import the SequentialFeatureSelector from mlxtend library

In [17]:
from mlxtend.feature_selection import SequentialFeatureSelector

### Import the Logistic Regression model

In [18]:
from sklearn.linear_model import LogisticRegression

### Build a Logistic Regression model with lbfgs as solver and iterations = 500

In [19]:
# Logistic regression using the lbfgs solver, capped at 500 iterations.
# random_state is pinned to 10 and n_jobs=-1 requests all available cores.
lr = LogisticRegression(
    solver="lbfgs",
    max_iter=500,
    random_state=10,
    n_jobs=-1,
)

### Build a SequentialFeatureSelector with the Logistic Regression model, forward=True, and k_features='best'

In [20]:
# Forward selection wrapped around the logistic regression model.
# k_features="best" keeps the best-scoring subset found during the search.
ffs = SequentialFeatureSelector(
    estimator=lr,
    k_features="best",
    forward=True,
    n_jobs=-1,
)

### Train the SequentialFeatureSelector model

In [21]:
# Run the forward search over the feature columns in X against the target y.
ffs.fit(X, y)

SequentialFeatureSelector(estimator=LogisticRegression(max_iter=500, n_jobs=-1,
                                                       random_state=10),
                          k_features='best', n_jobs=-1)

### Explore the best feature names from the model

In [22]:
# Names of the features in the best-scoring subset found by the forward search.
ffs.k_feature_names_

('battery_power', 'px_height', 'px_width', 'ram')

### Make a list of feature names

In [23]:
# Turn the selected names into a list so they can be used to pick DataFrame columns.
feature_names = [name for name in ffs.k_feature_names_]

In [24]:
# Display the selected feature names as a plain list.
feature_names

['battery_power', 'px_height', 'px_width', 'ram']

## Optional - Find out if feature selection is useful in our model's accuracy

### Find the model's accuracy on all features X

### Train the model with all features

In [25]:
from sklearn.base import clone

# lr.fit() returns lr itself, so fitting lr directly would make this variable
# an alias of lr; any later lr.fit() on different columns would then silently
# refit this model too. Fitting an unfitted clone (same hyper-parameters)
# keeps the all-features model independent.
full_feature_model = clone(lr).fit(X, y)

### Predict on all X

In [26]:
# Predict on the same rows the model was fitted on, so these are training-set predictions.
y_pred = full_feature_model.predict(X)
y_pred

array([2, 2, 2, ..., 3, 0, 3], dtype=int64)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
# accuracy_score takes (y_true, y_pred) in that order.
# y_pred came from the training rows, so this is training accuracy.
print(accuracy_score(y, y_pred))

0.6955


### Accuracy ???

### Let's build a model with the best features and find accuracy

### Train the model with the chosen best features

In [29]:
# Keep only the columns chosen by the forward selector.
X_new = X.loc[:, feature_names]

### Predict

In [30]:
from sklearn.base import clone

# Fit a fresh clone instead of lr itself: lr.fit() returns the same object,
# so any other variable already bound to lr (such as a model fitted earlier)
# would be silently refit on these columns.
best_feature_model = clone(lr).fit(X_new, y)
# Predictions are made on the same rows the model was fitted on.
y_pred_best = best_feature_model.predict(X_new)
y_pred_best

array([1, 2, 2, ..., 3, 0, 3], dtype=int64)

In [31]:
# accuracy_score takes (y_true, y_pred) in that order.
# y_pred_best came from the training rows, so this is training accuracy.
print(accuracy_score(y, y_pred_best))

0.9615


### Accuracy ?

### Try KNN as the Learning algorithm

In [32]:
from sklearn.neighbors import KNeighborsClassifier

### Choose KNN as your learning algorithm

In [33]:
# k-nearest-neighbours classifier configured to use 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)

### Build a SequentialFeatureSelector model with KNN as learning algorithm and Forward Selection as the Strategy

In [34]:
# Forward selection again, this time with KNN as the wrapped estimator.
# k_features="best" keeps the best-scoring subset found during the search.
ffs_knn = SequentialFeatureSelector(
    estimator=knn,
    k_features="best",
    forward=True,
    n_jobs=-1,
)

### Train the model

In [35]:
# Run the KNN-based forward search over the feature columns in X against y.
ffs_knn.fit(X, y)

SequentialFeatureSelector(estimator=KNeighborsClassifier(), k_features='best',
                          n_jobs=-1)

### Get the feature names

In [36]:
# Names of the features in the best-scoring subset found with KNN as the estimator.
ffs_knn.k_feature_names_

('battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram')

In [37]:
# Turn the KNN-selected names into a list so they can be used to pick DataFrame columns.
features_knn = [name for name in ffs_knn.k_feature_names_]

In [38]:
# Fit KNN on the selected columns only and score it on the same rows, so the
# result is training accuracy. The column subset is built once and reused.
X_knn = X[features_knn]
knn_best_model = knn.fit(X_knn, y)

y_knn = knn_best_model.predict(X_knn)

# accuracy_score takes (y_true, y_pred) in that order.
accuracy_score(y, y_knn)

0.954

### Try some other learning algorithms you know of
- SVM
- Random Forest
- Anything of your choice

### Summarize the list of features chosen by different algorithms
- Algorithm | Best Features

Example: 

- Logistic Regression | ['battery_power', 'px_height', 'px_width', 'ram']
- KNN | ['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram']
- SVM | ????
- Algorithms of your choice | ?????