# Feature Selection from Mobile data using SequentialFeatureSelector Backward method
Dataset: <https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv>

In [1]:
import pandas as pd

In [2]:
# Location of the mobile price training CSV (served from raw.githubusercontent.com).
url = "https://raw.githubusercontent.com/subashgandyer/datasets/main/mobile_price_train.csv"

In [3]:
# Load the CSV straight from the URL and preview the first rows.
df = pd.read_csv(url)
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
# List every column name; the target `price_range` is expected to be among them.
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [9]:
# Class balance check: number of rows per price_range label.
df["price_range"].value_counts()

price_range
1    500
2    500
3    500
0    500
Name: count, dtype: int64

### Understand the data
- How many features are there?
- How many samples are there?
- What are the data types of each feature column?
- What do you think could be the most important feature(s)?
- Run some feature selection methods
- Is your intuition right?

### Import the necessary libraries

In [11]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.base import clone

### Read the mobile data

In [15]:
# Reuse the frame already loaded from the remote CSV. `data` is another name
# for the same object as `df`, not a copy. (A local copy stored at
# data/mobile_price_train.csv could be read with pd.read_csv instead.)
data = df

### Split the dataset into X and y

In [18]:
# Separate predictors from the target by column name rather than by position,
# so the split does not silently depend on there being exactly 20 feature
# columns ahead of the label.
TARGET_COLUMN = "price_range"
X = data.drop(columns=TARGET_COLUMN)  # every column except the label
y = data[TARGET_COLUMN]               # the label to predict

### Sanity check

In [21]:
# Both shapes should report the same number of rows.
X.shape, y.shape

((2000, 20), (2000,))

### How many features

In [24]:
# Number of candidate features = number of columns in X.
len(X.columns)

20

### Import the SequentialFeatureSelector from mlxtend library

In [28]:
from mlxtend.feature_selection import SequentialFeatureSelector

### Import the Logistic Regression model

In [31]:
from sklearn.linear_model import LogisticRegression

### Build a Logistic Regression model with lbfgs as solver and iterations = 500

In [34]:
# Logistic regression estimator: lbfgs solver with at most 500 iterations,
# balanced class weights, a fixed seed, and n_jobs=-1.
lr = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

### Build a SequentialFeatureSelector with the Logistic Regression model, forward=False (backward selection) and k_features='best'

In [37]:
# Backward selection (forward=False) wrapped around the logistic regression,
# with k_features='best' and n_jobs=-1.
ffs = SequentialFeatureSelector(
    estimator=lr,
    forward=False,
    k_features='best',
    n_jobs=-1,
)

### Train the SequentialFeatureSelector model

In [40]:
# Run the selector over the full feature matrix and the target.
ffs.fit(X, y)

### Explore the best feature names from the model

In [43]:
# Column names of the feature subset the selector settled on.
ffs.k_feature_names_

('ram', 'talk_time')

### Make a list of feature names

In [46]:
# Unpack the selected names into a list so it can index DataFrame columns.
features = [*ffs.k_feature_names_]
features

['ram', 'talk_time']

## Optional - Find out if feature selection is useful in our model's accuracy

### Find the model's accuracy on all features X

### Train the model with all features

In [52]:
# Fit a fresh copy of the estimator on every feature. `lr.fit` returns `lr`
# itself, so binding its result directly would make this name an alias that
# any later `lr.fit(...)` call silently retrains; cloning keeps this model
# independent of `lr`.
full_feature_model = clone(lr).fit(X, y)

### Predict on all X

In [55]:
# Predictions are made on the same rows the model was fitted on, so anything
# derived from them is a training-set figure, not a held-out estimate.
y_pred = full_feature_model.predict(X)
y_pred

array([2, 2, 2, ..., 3, 0, 3], dtype=int64)

In [57]:
# Side-by-side table of predicted and true labels, aligned on the row index.
Compares = pd.DataFrame()
Compares = Compares.assign(
    Predictions=pd.Series(y_pred),
    Actuals=pd.Series(y),
)

In [59]:
# Flag each row with the string 'True' or 'False' depending on whether the
# prediction equals the actual label (strings, not booleans).
Compares['classmatch?'] = np.where(
    Compares['Predictions'].eq(Compares['Actuals']),
    'True',
    'False',
)
Compares

Unnamed: 0,Predictions,Actuals,classmatch?
0,2,1,False
1,2,2,True
2,2,2,True
3,2,2,True
4,1,1,True
...,...,...,...
1995,0,0,True
1996,1,2,False
1997,3,3,True
1998,0,0,True


In [61]:
# Per-column count of the rows flagged as mismatches.
Compares.loc[Compares['classmatch?'].eq('False')].count()

Predictions    603
Actuals        603
classmatch?    603
dtype: int64

In [63]:
# Mismatched rows broken down by the class that was (wrongly) predicted.
Compares.loc[Compares['classmatch?'].eq('False')].groupby('Predictions').count()

Unnamed: 0_level_0,Actuals,classmatch?
Predictions,Unnamed: 1_level_1,Unnamed: 2_level_1
0,58,58
1,178,178
2,219,219
3,148,148


### Accuracy ???

In [65]:
# Number of correct predictions. Summing a boolean mask instead of indexing
# value_counts()['True'] avoids a KeyError when no prediction is correct
# (the 'True' label would then be absent from the counts).
correct_predictions = (Compares['classmatch?'] == 'True').sum()

# Total number of predictions
total_predictions = Compares.shape[0]

# Accuracy = correct / total
accuracy = correct_predictions / total_predictions
print(f"Accuracy Rate: {accuracy:.2%}")


Accuracy Rate: 69.85%


### Let's build a model with the best features and find its accuracy

### Train the model with the chosen best features

In [24]:
# Train a separate copy of the estimator on the selected columns only.
# Calling `lr.fit` here would refit `lr` in place and thereby overwrite every
# other name bound to that same estimator object; cloning leaves `lr` (and
# anything aliasing it) untouched.
best_feature_model = clone(lr).fit(X[features], y)

### Predict

In [25]:
# Predict on the training rows again, restricted to the selected columns.
# Note this rebinds `y_pred`, replacing the earlier all-feature predictions.
y_pred = best_feature_model.predict(X[features])
y_pred

array([2, 2, 2, ..., 3, 0, 3])

In [26]:
# Show the true labels for a visual comparison with the predictions.
y

0       1
1       2
2       2
3       2
4       1
       ..
1995    0
1996    2
1997    3
1998    0
1999    3
Name: price_range, Length: 2000, dtype: int64

In [27]:
# Start again from an empty frame so no column from the earlier comparison
# table carries over.
Compares = pd.DataFrame()

In [28]:
# Predictions from the reduced-feature model next to the true labels;
# the shape is shown as a quick check on the row and column counts.
Compares['Predictions'] = pd.Series(y_pred)
Compares['Actuals'] = pd.Series(y)
Compares.shape

(2000, 2)

In [29]:
# Preview the first rows of the comparison table.
Compares.head()

Unnamed: 0,Predictions,Actuals
0,2,1
1,2,2
2,2,2
3,3,2
4,1,1


In [30]:
# Row-wise check of prediction against actual label, stored as the strings
# 'True' / 'False' (not booleans).
Compares['classmatch?'] = np.where(
    Compares['Predictions'].eq(Compares['Actuals']),
    'True',
    'False',
)
Compares

Unnamed: 0,Predictions,Actuals,classmatch?
0,2,1,False
1,2,2,True
2,2,2,True
3,3,2,False
4,1,1,True
...,...,...,...
1995,0,0,True
1996,2,2,True
1997,3,3,True
1998,0,0,True


In [31]:
# Mismatched rows of the reduced-feature model, grouped by predicted class.
Compares.loc[Compares['classmatch?'].eq('False')].groupby('Predictions').count()

Unnamed: 0_level_0,Actuals,classmatch?
Predictions,Unnamed: 1_level_1,Unnamed: 2_level_1
0,62,62
1,125,125
2,144,144
3,71,71


### Accuracy ?

### Try KNN as the Learning algorithm

In [32]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

### Choose KNN as your learning algorithm

In [33]:
# k-nearest-neighbours classifier configured with n_neighbors=3.
knn = KNeighborsClassifier(n_neighbors=3)

### Build a SequentialFeatureSelector model with KNN as learning algorithm and Backward Selection as the Strategy

In [34]:
# Same backward-selection setup as before, this time scoring subsets with KNN.
ffs_knn = SequentialFeatureSelector(
    estimator=knn,
    forward=False,
    k_features='best',
    n_jobs=-1,
)

### Train the model

In [35]:
# Run the KNN-based selector over the full feature matrix and the target.
ffs_knn.fit(X, y)

SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                          forward=False, k_features='best', n_jobs=-1)

### Get the feature names

In [36]:
# Column names of the feature subset chosen with KNN as the estimator.
ffs_knn.k_feature_names_

('battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram')

In [37]:
# Rebind `features` to the KNN-selected names (replaces the earlier list).
features = [*ffs_knn.k_feature_names_]
features

['battery_power',
 'blue',
 'clock_speed',
 'dual_sim',
 'fc',
 'four_g',
 'm_dep',
 'mobile_wt',
 'n_cores',
 'pc',
 'px_height',
 'px_width',
 'ram']

### Try some other learning algorithms you know of
- SVM
- Random Forest
- Anything of your choice

### Summarize the list of features chosen by different algorithms
- Algorithm | Best Features

Example: 

- Logistic Regression | ['battery_power', 'px_height', 'px_width', 'ram']
- KNN | ????
- SVM | ????
- Algorithms of your choice | ?????