In [1]:
import pandas as pd

In [3]:
emp_data = pd.read_csv('Data/HR_comma_sep.csv.txt')

In [10]:
#emp_data.info()

In [11]:
emp_data.sales.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [12]:
emp_data.rename(columns={'sales':'dept'},inplace=True)

In [13]:
emp_data.head(2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium


In [14]:
feature_data = emp_data.drop(['left'],axis=1)

In [15]:
target_data = emp_data.left

In [16]:
feature_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,0,sales,low
1,0.8,0.86,5,262,6,0,0,sales,medium
2,0.11,0.88,7,272,4,0,0,sales,medium
3,0.72,0.87,5,223,5,0,0,sales,low
4,0.37,0.52,2,159,3,0,0,sales,low


### Feature Selection Technique

In [17]:
from sklearn.feature_selection import VarianceThreshold

In [23]:
vt = VarianceThreshold(0.1)

In [25]:
vt.fit_transform(feature_data.drop(['dept','salary'],axis=1))

array([[  2., 157.,   3.,   0.],
       [  5., 262.,   6.,   0.],
       [  7., 272.,   4.,   0.],
       ...,
       [  2., 143.,   3.,   0.],
       [  6., 280.,   4.,   0.],
       [  2., 158.,   3.,   0.]])

In [26]:
vt.variances_

array([6.18130792e-02, 2.92969110e-02, 1.51918262e+00, 2.49414688e+03,
       2.13185567e+00, 1.23697692e-01, 2.08157531e-02])

In [30]:
vt.inverse_transform(vt.fit_transform(feature_data.drop(['dept','salary'],axis=1))).shape

(14999, 7)

In [31]:
from sklearn.feature_selection import SelectKBest

In [32]:
select = SelectKBest(k=4)

In [37]:
select.fit_transform(feature_data.drop(['dept','salary'],axis=1),target_data)

array([[3.80e-01, 1.57e+02, 3.00e+00, 0.00e+00],
       [8.00e-01, 2.62e+02, 6.00e+00, 0.00e+00],
       [1.10e-01, 2.72e+02, 4.00e+00, 0.00e+00],
       ...,
       [3.70e-01, 1.43e+02, 3.00e+00, 0.00e+00],
       [1.10e-01, 2.80e+02, 4.00e+00, 0.00e+00],
       [3.70e-01, 1.58e+02, 3.00e+00, 0.00e+00]])

In [38]:
feature_data.average_montly_hours.min()

96

In [39]:
feature_data.average_montly_hours.max()

310

In [40]:
cols_no_tf = list(set(feature_data.columns) - set(['average_montly_hours', 'salary', 'dept']))

In [41]:
cols_no_tf

['satisfaction_level',
 'number_project',
 'last_evaluation',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years']

In [42]:
no_tf_cols_dm = list(map(lambda x:(x,None), cols_no_tf))

In [43]:
no_tf_cols_dm

[('satisfaction_level', None),
 ('number_project', None),
 ('last_evaluation', None),
 ('time_spend_company', None),
 ('Work_accident', None),
 ('promotion_last_5years', None)]

In [48]:
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

In [49]:
mapper = DataFrameMapper([
    *no_tf_cols_dm,
    (['average_montly_hours'],MinMaxScaler()),
    ('salary',LabelEncoder()),
    ('dept',LabelEncoder())
])

In [50]:
mapper

DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('number_project', None), ('last_evaluation', None), ('time_spend_company', None), ('Work_accident', None), ('promotion_last_5years', None), (['average_montly_hours'], MinMaxScaler(copy=True, feature_range=(0, 1))), ('salary', LabelEncoder()), ('dept', LabelEncoder())],
        input_df=False, sparse=False)

In [63]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [68]:
pipe = Pipeline([
    ('mapper',mapper),
    ('select',select),
    ('logisticRegression',RandomForestClassifier())
])

In [69]:
pipe.steps

[('mapper', DataFrameMapper(default=False, df_out=False,
          features=[('satisfaction_level', None), ('number_project', None), ('last_evaluation', None), ('time_spend_company', None), ('Work_accident', None), ('promotion_last_5years', None), (['average_montly_hours'], MinMaxScaler(copy=True, feature_range=(0, 1))), ('salary', LabelEncoder()), ('dept', LabelEncoder())],
          input_df=False, sparse=False)),
 ('select',
  SelectKBest(k=4, score_func=<function f_classif at 0x00000249CCE5F9D8>)),
 ('logisticRegression',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False))]

In [70]:
pipe.fit(feature_data,target_data)



Pipeline(memory=None,
     steps=[('mapper', DataFrameMapper(default=False, df_out=False,
        features=[('satisfaction_level', None), ('number_project', None), ('last_evaluation', None), ('time_spend_company', None), ('Work_accident', None), ('promotion_last_5years', None), (['average_montly_hours'], MinMaxScaler(copy=Tru...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [71]:
pipe.score(feature_data,target_data)

0.9939329288619241

In [72]:
from sklearn.decomposition import PCA

In [73]:
pca = PCA(n_components=2)

In [74]:
from sklearn.datasets import load_iris

In [75]:
iris = load_iris()

In [77]:
iris.data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [78]:
pca.fit_transform(iris.data)

array([[-2.68420713,  0.32660731],
       [-2.71539062, -0.16955685],
       [-2.88981954, -0.13734561],
       [-2.7464372 , -0.31112432],
       [-2.72859298,  0.33392456],
       [-2.27989736,  0.74778271],
       [-2.82089068, -0.08210451],
       [-2.62648199,  0.17040535],
       [-2.88795857, -0.57079803],
       [-2.67384469, -0.1066917 ],
       [-2.50652679,  0.65193501],
       [-2.61314272,  0.02152063],
       [-2.78743398, -0.22774019],
       [-3.22520045, -0.50327991],
       [-2.64354322,  1.1861949 ],
       [-2.38386932,  1.34475434],
       [-2.6225262 ,  0.81808967],
       [-2.64832273,  0.31913667],
       [-2.19907796,  0.87924409],
       [-2.58734619,  0.52047364],
       [-2.3105317 ,  0.39786782],
       [-2.54323491,  0.44003175],
       [-3.21585769,  0.14161557],
       [-2.30312854,  0.10552268],
       [-2.35617109, -0.03120959],
       [-2.50791723, -0.13905634],
       [-2.469056  ,  0.13788731],
       [-2.56239095,  0.37468456],
       [-2.63982127,