In [46]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.preprocessing import normalize

In [47]:
# Read the training and test splits from their CSV files in one paired assignment.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/data/project2/train.csv', '/data/project2/test.csv')
)

## Total Number of Electronics

In [48]:
def getTotalElectronics(data):
    """Add a 'TotalElectronics' column to `data` in place.

    The new column is the row-wise sum of the telephone, cellular phone and
    personal computer count columns. Nothing is returned; the frame passed in
    is modified directly.
    """
    telephones = data['Number of Landline/wireless telephones']
    cellphones = data['Number of Cellular phone']
    computers = data['Number of Personal Computer']
    data['TotalElectronics'] = telephones.add(cellphones).add(computers)

In [49]:
# Add the TotalElectronics feature to both splits (each frame is modified in place).
for frame in (train, test):
    getTotalElectronics(frame)

## Total Number of Motorized Vehicles

In [50]:
def getNumberOfVehicles(data):
    """Add a 'NumberOfVehicles' column to `data` in place.

    The new column is the row-wise sum of the motorized banca and
    motorcycle/tricycle count columns. Nothing is returned; the frame passed
    in is modified directly.
    """
    bancas = data['Number of Motorized Banca']
    motorcycles = data['Number of Motorcycle/Tricycle']
    data['NumberOfVehicles'] = bancas.add(motorcycles)

In [51]:
# Add the NumberOfVehicles feature to both splits (each frame is modified in place).
for frame in (train, test):
    getNumberOfVehicles(frame)

## Housing / Water Expenditure x Toilet Facility

In [52]:
# Show the sorted distinct values of the toilet-facility column in the training split.
np.unique(train['Toilet Facilities'].tolist())

array(['Closed pit', 'None', 'Open pit', 'Others',
       'Water-sealed, other depository, shared with other household',
       'Water-sealed, other depository, used exclusively by household',
       'Water-sealed, sewer septic tank, shared with other household',
       'Water-sealed, sewer septic tank, used exclusively by household'], 
      dtype='<U62')

In [66]:
def waterExpenditureByToilet(data):
    """Add toilet-facility dummies and their interaction with housing/water expenditure.

    For every toilet-facility category except the 'None' baseline, two columns
    are written onto `data` in place:

    * a 0/1 indicator column named after the category, and
    * an interaction column equal to 'Housing and water Expenditure'
      multiplied by that indicator.

    The previous implementation rebound its local `data` to the result of
    `pd.concat`, so all new columns landed on a temporary copy and the
    caller's frame was left untouched. Columns are now assigned directly on
    the frame that was passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Toilet Facilities' and
        'Housing and water Expenditure'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for convenience.
    """
    # (indicator column, interaction feature) pairs, in the order the columns are added.
    toilet_interactions = [
        ('Closed pit', 'ClosedPitxWaterExp'),
        ('Open pit', 'OpenPitxWaterExp'),
        ('Others', 'OthersxWaterExp'),
        ('Water-sealed, other depository, shared with other household', 'OtherSharedxWaterExp'),
        ('Water-sealed, other depository, used exclusively by household', 'OtherExclusivexWaterExp'),
        ('Water-sealed, sewer septic tank, shared with other household', 'SewerSharedxWaterExp'),
        ('Water-sealed, sewer septic tank, used exclusively by household', 'SewerExclusivexWaterExp'),
    ]
    toilet_categories = [category for category, _ in toilet_interactions]

    # reindex pins the indicator columns to the fixed category list: the 'None'
    # baseline (and anything unexpected) is dropped, and a category that does
    # not occur in this frame becomes an all-zero column instead of a KeyError.
    indicators = (
        pd.get_dummies(data['Toilet Facilities'])
        .reindex(columns=toilet_categories, fill_value=0)
        .astype('uint8')
    )

    # Plain column assignment overwrites on a second call, so the function is
    # safe to re-run without producing duplicate column labels.
    for category in toilet_categories:
        data[category] = indicators[category]

    expenditure = data['Housing and water Expenditure']
    for category, feature in toilet_interactions:
        data[feature] = expenditure * data[category]

    return data
    

In [70]:
# These calls are left disabled; the same interaction features are built
# inline for `train` and `test` in the cells that follow.
#waterExpenditureByToilet(train)
#waterExpenditureByToilet(test)

In [72]:
## creating interaction for training set
# (indicator column, interaction feature) pairs; 'None' is the omitted baseline category.
toilet_interactions = [
    ('Closed pit', 'ClosedPitxWaterExp'),
    ('Open pit', 'OpenPitxWaterExp'),
    ('Others', 'OthersxWaterExp'),
    ('Water-sealed, other depository, shared with other household', 'OtherSharedxWaterExp'),
    ('Water-sealed, other depository, used exclusively by household', 'OtherExclusivexWaterExp'),
    ('Water-sealed, sewer septic tank, shared with other household', 'SewerSharedxWaterExp'),
    ('Water-sealed, sewer septic tank, used exclusively by household', 'SewerExclusivexWaterExp'),
]
toilet_categories = [category for category, _ in toilet_interactions]

# reindex pins the dummy columns to the fixed category list: 'None' is dropped
# without a `del`, and a category absent from this split becomes an all-zero
# column instead of raising KeyError further down.
s = (
    pd.get_dummies(train['Toilet Facilities'])
    .reindex(columns=toilet_categories, fill_value=0)
    .astype('uint8')
)

# Assign column by column rather than concatenating: re-running this cell then
# overwrites the existing columns instead of appending duplicate labels.
for category in toilet_categories:
    train[category] = s[category]

# Interaction = housing/water expenditure where the household has that facility, else 0.
for category, feature in toilet_interactions:
    train[feature] = train['Housing and water Expenditure'] * train[category]

In [74]:
## creating interaction for testing set
# (indicator column, interaction feature) pairs; 'None' is the omitted baseline category.
toilet_interactions = [
    ('Closed pit', 'ClosedPitxWaterExp'),
    ('Open pit', 'OpenPitxWaterExp'),
    ('Others', 'OthersxWaterExp'),
    ('Water-sealed, other depository, shared with other household', 'OtherSharedxWaterExp'),
    ('Water-sealed, other depository, used exclusively by household', 'OtherExclusivexWaterExp'),
    ('Water-sealed, sewer septic tank, shared with other household', 'SewerSharedxWaterExp'),
    ('Water-sealed, sewer septic tank, used exclusively by household', 'SewerExclusivexWaterExp'),
]
toilet_categories = [category for category, _ in toilet_interactions]

# reindex pins the dummy columns to the fixed category list, so the test split
# gets the same columns whether or not every category (or 'None') occurs in it;
# a missing category becomes an all-zero column instead of raising KeyError.
s = (
    pd.get_dummies(test['Toilet Facilities'])
    .reindex(columns=toilet_categories, fill_value=0)
    .astype('uint8')
)

# Assign column by column rather than concatenating: re-running this cell then
# overwrites the existing columns instead of appending duplicate labels.
for category in toilet_categories:
    test[category] = s[category]

# Interaction = housing/water expenditure where the household has that facility, else 0.
for category, feature in toilet_interactions:
    test[feature] = test['Housing and water Expenditure'] * test[category]

## Clustering Jobs

In [21]:
# Display the column index of the training frame.
train.columns

Index(['Index', 'Region', 'Total Food Expenditure', 'Main Source of Income',
       'Agricultural Household indicator', 'Bread and Cereals Expenditure',
       'Total Rice Expenditure', 'Meat Expenditure',
       'Total Fish and  marine products Expenditure', 'Fruit Expenditure',
       'Vegetables Expenditure', 'Restaurant and hotels Expenditure',
       'Alcoholic Beverages Expenditure', 'Tobacco Expenditure',
       'Clothing, Footwear and Other Wear Expenditure',
       'Housing and water Expenditure', 'Imputed House Rental Value',
       'Medical Care Expenditure', 'Transportation Expenditure',
       'Communication Expenditure', 'Education Expenditure',
       'Miscellaneous Goods and Services Expenditure',
       'Special Occasions Expenditure', 'Crop Farming and Gardening expenses',
       'Total Income from Entrepreneurial Acitivites', 'Household Head Sex',
       'Household Head Age', 'Household Head Marital Status',
       'Household Head Highest Grade Completed',
       'Ho