In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')


In [7]:
# Read the training and test sets from comma-separated files in the working
# directory (',' is read_csv's default separator).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,2,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,3,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,4,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,5,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


Variable transformation is not only about creating new variables, but also about making the available information more sensible. This stage involves creating new variables from existing ones or performing numerical transformations on variables, such as taking a log. During univariate analysis, we saw a number of categories with a very small percentage of observations. Let's recall the data types of the available variables.

In [29]:
# List the dtype of every training column; 'object' columns are the
# categorical candidates handled below.
train.dtypes

ID                 int64
Age                int64
Workclass         object
Education         object
Marital.Status    object
Occupation        object
Relationship      object
Race              object
Sex               object
Hours.Per.Week     int64
Native.Country    object
Income.Group      object
dtype: object

In [30]:
# Share of training rows per Workclass level (denominator is the full row
# count, so rows with a missing Workclass still count towards the total).
# NOTE(review): the saved output already lists 'others', so this cell appears
# to have been executed after the combining step - confirm on a fresh kernel.
train['Workclass'].value_counts().div(len(train))

Private             0.693548
others              0.106855
Self-emp-not-inc    0.070565
Local-gov           0.068548
Name: Workclass, dtype: float64

In [31]:
# Share of test rows per Workclass level, relative to the full test row count.
test['Workclass'].value_counts().div(len(test))

Private             0.672741
others              0.122449
Self-emp-not-inc    0.080904
Local-gov           0.061224
Name: Workclass, dtype: float64

In [32]:
# Workclass levels that will be folded into a single 'others' bucket.
categories_to_combine = [
    'State-gov',
    'Federal-gov',
    'Self-emp-inc',
]

In [33]:
# Fold every listed Workclass level into the 'others' bucket, in both frames.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `frame['Workclass']`: mutating a column
# selection in place is chained assignment, which does not reach the parent
# frame when the selection is a copy (e.g. with pandas Copy-on-Write enabled).
workclass_mapping = dict.fromkeys(categories_to_combine, 'others')
train['Workclass'] = train['Workclass'].replace(workclass_mapping)
test['Workclass'] = test['Workclass'].replace(workclass_mapping)
    

In [34]:
# Re-check the Workclass shares in train after the combining step.
train['Workclass'].value_counts().div(len(train))

Private             0.693548
others              0.106855
Self-emp-not-inc    0.070565
Local-gov           0.068548
Name: Workclass, dtype: float64

In [36]:
# Re-check the Workclass shares in test after the combining step.
test['Workclass'].value_counts().div(len(test))

Private             0.672741
others              0.122449
Self-emp-not-inc    0.080904
Local-gov           0.061224
Name: Workclass, dtype: float64

Here we can see that the categories have been successfully combined. Note that combining is not the best possible technique for solving the problem of high cardinality, i.e. a high number of unique values.

# Combining the rest:

In [19]:
# Names of all training columns stored with object dtype, as a plain list.
categorical = [
    column_name
    for column_name, column_dtype in train.dtypes.items()
    if column_dtype == 'object'
]

In [20]:
# Display the collected object-dtype column names.
categorical

['Workclass',
 'Education',
 'Marital.Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native.Country',
 'Income.Group']

In [21]:
# 'Workclass' was already combined above, so leave it out of the remaining work.
# Filtering by name (rather than slicing off the first element) keeps this cell
# idempotent: re-running it does not silently drop another column.
categorical = [column for column in categorical if column != 'Workclass']

In [22]:
# Display the list again to confirm which columns remain.
categorical

['Education',
 'Marital.Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native.Country',
 'Income.Group']

In [23]:
# Number of distinct values per categorical column in train; a missing value,
# if present, counts as one distinct value.
train[categorical].nunique(dropna=False)

Education         16
Marital.Status     7
Occupation        14
Relationship       6
Race               5
Sex                2
Native.Country    26
Income.Group       3
dtype: int64

In [28]:
# Levels covering less than this share of the training rows are merged.
RARE_LEVEL_THRESHOLD = 0.05

for column in categorical:
    # Share of training rows per level. value_counts() skips missing values,
    # but the denominator is still the full row count.
    freq = train[column].value_counts() / train.shape[0]
    cat_to_combine = freq.index[freq.values < RARE_LEVEL_THRESHOLD]
    if len(cat_to_combine) == 0:
        continue  # nothing rare in this column

    rare_to_others = dict.fromkeys(cat_to_combine, 'others')

    # Assign the result back rather than using replace(..., inplace=True) on a
    # column selection: that is chained assignment and does not update the
    # frame when the selection is a copy (e.g. with pandas Copy-on-Write).
    train[column] = train[column].replace(rare_to_others)

    # `categorical` is built from train and includes 'Income.Group', which the
    # test frame does not carry; guard the lookup so a rare level in such a
    # column cannot raise KeyError.
    if column in test.columns:
        test[column] = test[column].replace(rare_to_others)

In [29]:
# Distinct values per categorical column in train after merging rare levels
# (a missing value, if present, counts as one distinct value).
train[categorical].nunique(dropna=False)

Education          6
Marital.Status     4
Occupation        10
Relationship       6
Race               3
Sex                2
Native.Country     3
Income.Group       3
dtype: int64

In [31]:
# Same distinct-value count for test; the last entry of `categorical` is
# skipped when selecting columns from the test frame.
test[categorical[:-1]].nunique(dropna=False)

Education          6
Marital.Status     4
Occupation        12
Relationship       7
Race               4
Sex                3
Native.Country    11
dtype: int64

In [32]:
# Display the categorical column list once more for reference.
categorical

['Education',
 'Marital.Status',
 'Occupation',
 'Relationship',
 'Race',
 'Sex',
 'Native.Country',
 'Income.Group']

In [37]:
# Preview the first five rows of the training frame again.
train.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,others,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,2,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,3,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,4,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,5,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [47]:
# Rebuild `categorical` as a pandas Index (not a list) holding every
# object-dtype column of train.
is_object_column = (train.dtypes == 'object').values
categorical = train.dtypes.index[is_object_column]

In [48]:
# Display the Index of object-dtype column names.
categorical

Index(['Workclass', 'Education', 'Marital.Status', 'Occupation',
       'Relationship', 'Race', 'Sex', 'Native.Country', 'Income.Group'],
      dtype='object')

In [50]:
# Preview the first five rows of only the object-dtype columns of train.
train[categorical].head()

Unnamed: 0,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Native.Country,Income.Group
0,others,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [56]:
# List the distinct Education values present in train.
train['Education'].unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [58]:
def uni(x):
    """Return how many distinct values `x` holds.

    A missing value, if present, is counted as one distinct value, matching
    the length of `x.unique()`.
    """
    return x.nunique(dropna=False)
# Distinct-value count for each categorical column of train (apply works
# column by column by default).
train[categorical].apply(uni)

Workclass          5
Education         16
Marital.Status     7
Occupation        14
Relationship       6
Race               5
Sex                2
Native.Country    26
Income.Group       3
dtype: int64

In [60]:
# Index of every object-dtype column in the test frame.
is_object_column_test = (test.dtypes == 'object').values
categorical_test = test.dtypes.index[is_object_column_test]

In [63]:
# Distinct-value count for each categorical column of test (apply works
# column by column by default).
test[categorical_test].apply(uni)

Workclass          5
Education         16
Marital.Status     7
Occupation        16
Relationship       7
Race               6
Sex                3
Native.Country    31
dtype: int64

In [65]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,ID,Age,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Hours.Per.Week,Native.Country,Income.Group
0,1,39,others,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,2,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,3,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,4,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,5,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [68]:
# Preview the first five rows of the categorical columns of train.
train[categorical].head()

Unnamed: 0,Workclass,Education,Marital.Status,Occupation,Relationship,Race,Sex,Native.Country,Income.Group
0,others,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
