In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training table from the working directory.
data = pd.read_csv('train.csv')
# (rows, columns) of the loaded frame.
data.shape

(2000, 21)

In [3]:
# Preview the first rows to check column names and value ranges.
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [4]:
# Split features from the target by column *name* rather than position, so the
# split stays correct even if the CSV's column order changes.
TARGET_COLUMN = 'price_range'
X = data.drop(columns=TARGET_COLUMN)  # every column except the target
y = data[TARGET_COLUMN]               # class label to predict

In [5]:
# Sanity check: the feature matrix should have one column fewer than `data`.
X.shape

(2000, 20)

In [6]:
# Per-column mean and standard deviation of the features.
mu = X.mean(axis=0)
sig = X.std(axis=0)
# A constant column has zero spread, and dividing by it would turn the whole
# column into NaN (0/0). Use a unit divisor there so it becomes all zeros.
sig = sig.where(sig > 0, 1.0)

# Z-score every feature so that all columns share a comparable scale.
X = (X - mu) / sig

In [7]:
from sklearn.feature_selection import SelectKBest

In [8]:
# Univariate feature selector. k=10 spells out scikit-learn's default; the
# scoring function is left at the library default.
best_features = SelectKBest(k=10)

In [9]:
# Score every column of X against the target y with the selector created above.
fit = best_features.fit(X, y)
# One score per feature, in the same order as X's columns.
fit.scores_

array([3.15981575e+01, 4.76767709e-01, 4.93707801e-01, 4.28239286e-01,
       7.72181960e-01, 1.05952453e+00, 2.92299608e+00, 1.50068244e+00,
       3.59431819e+00, 2.62541515e+00, 8.25446358e-01, 1.94848418e+01,
       2.26208825e+01, 3.52011082e+03, 2.22598374e+00, 1.67099983e+00,
       1.62881131e+00, 4.57319750e-01, 1.29330223e+00, 2.84940470e-01])

In [10]:
# Wrap the feature names and their scores in one-column frames so they can be
# joined side by side in the next cell.
dfcolumns = pd.DataFrame(data=X.columns)
dfscores = pd.DataFrame(data=fit.scores_)

In [12]:
# Pair each feature name with its score: name the two single columns first,
# then place them side by side.
feature_names = dfcolumns.iloc[:, 0].rename('Features')
feature_values = dfscores.iloc[:, 0].rename('Scores')
featureScores = pd.concat([feature_names, feature_values], axis=1)

In [13]:
# Show the name/score table in original column order.
featureScores

Unnamed: 0,Features,Scores
0,battery_power,31.598158
1,blue,0.476768
2,clock_speed,0.493708
3,dual_sim,0.428239
4,fc,0.772182
5,four_g,1.059525
6,int_memory,2.922996
7,m_dep,1.500682
8,mobile_wt,3.594318
9,n_cores,2.625415


In [15]:
# Rank features from highest to lowest score; the original row labels are kept,
# so the index still shows each feature's position in X.
featureScores = featureScores.sort_values(by='Scores', ascending=False)
featureScores

Unnamed: 0,Features,Scores
13,ram,3520.110824
0,battery_power,31.598158
12,px_width,22.620882
11,px_height,19.484842
8,mobile_wt,3.594318
6,int_memory,2.922996
9,n_cores,2.625415
14,sc_h,2.225984
15,sc_w,1.671
16,talk_time,1.628811


In [20]:
# Names of the ten highest-scoring features (featureScores is already sorted
# in descending score order).
top_10 = featureScores['Features'].head(10).tolist()
top_10

['ram',
 'battery_power',
 'px_width',
 'px_height',
 'mobile_wt',
 'int_memory',
 'n_cores',
 'sc_h',
 'sc_w',
 'talk_time']

# Model Comparison

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [22]:
# Baseline classifier using LogisticRegression with its default settings.
model = LogisticRegression()
# NOTE(review): this fits on every row of X. The cross_val_score calls below
# presumably refit per fold, so this full-data fit looks redundant -- confirm
# nothing else relies on the fitted `model` before removing it.
model.fit(X, y)

In [23]:
# 10-fold cross-validated accuracy using every feature.
# NOTE(review): X was standardized earlier with the mean/std of all rows, so
# each held-out fold contributed to the scaling it is evaluated under. Fitting
# the scaler inside each fold would give a cleaner estimate.
scores = cross_val_score(estimator=model, X=X, y=y, cv=10)
scores.mean()

0.9620000000000001

In [25]:
# Standardized values of the ten selected feature columns.
X.loc[:, top_10]

Unnamed: 0,ram,battery_power,px_width,px_height,mobile_wt,int_memory,n_cores,sc_h,sc_w,talk_time
0,0.391605,-0.902372,-1.146497,-1.408596,1.348911,-1.380298,-1.101696,-0.784787,0.283032,1.462128
1,0.467200,-0.495015,1.704039,0.585631,-0.120029,1.154735,-0.664602,1.113987,-0.635158,-0.734084
2,0.441387,-1.537302,1.074699,1.392336,0.134210,0.493422,0.209587,-0.310094,-0.864705,-0.368048
3,0.594421,-1.418964,1.236662,1.286428,-0.261274,-1.214970,0.646681,0.876640,0.512579,-0.002013
4,-0.657502,1.325574,-0.091429,1.268401,0.021215,0.658751,-1.101696,-1.022134,-0.864705,0.730057
...,...,...,...,...,...,...,...,...,...,...
1995,-1.342463,-1.011607,1.477291,1.299948,-0.967495,-1.655845,0.646681,0.164600,-0.405610,1.462128
1996,-0.085010,1.653280,1.650822,0.608165,1.320663,0.383203,-0.227507,-0.310094,0.971674,0.913075
1997,0.859924,1.530391,0.880345,0.502257,-0.910998,0.217875,1.520869,-0.784787,-1.094253,-1.100119
1998,-1.157164,0.622372,-1.345480,-0.696533,0.134210,0.768969,0.209587,1.351334,0.971674,1.462128


In [26]:
# 10-fold cross-validated accuracy using only the ten selected features.
# NOTE(review): top_10 was ranked from scores fit on the full X and y, which
# includes the rows that serve as validation folds here. This mean is therefore
# likely optimistic; selecting features inside each fold would remove the bias.
X_selected = X.loc[:, top_10]
scores = cross_val_score(estimator=model, X=X_selected, y=y, cv=10)
scores.mean()

0.9710000000000001