In [1]:
import pandas as pd

# Playground

## Data

In [2]:
train = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/trainLabels.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe


In [4]:
train_labels.head()

Unnamed: 0,id,status_group
0,69572,functional
1,8776,functional
2,34310,functional
3,67743,non functional
4,19728,functional


## Combined train data

In [5]:
comb = pd.merge(train, train_labels, on='id')

In [6]:
comb = comb.drop('id', axis=1)

In [7]:
comb.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


## Feature Engineering

In [8]:
comb.columns

Index(['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer',
       'longitude', 'latitude', 'wpt_name', 'num_private', 'basin',
       'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward',
       'population', 'public_meeting', 'recorded_by', 'scheme_management',
       'scheme_name', 'permit', 'construction_year', 'extraction_type',
       'extraction_type_group', 'extraction_type_class', 'management',
       'management_group', 'payment', 'payment_type', 'water_quality',
       'quality_group', 'quantity', 'quantity_group', 'source', 'source_type',
       'source_class', 'waterpoint_type', 'waterpoint_type_group',
       'status_group'],
      dtype='object')

## Date to diff between now

In [9]:
def date_to_diff(x):
    x = int(x.split('-')[0])
    return 2020 - x

In [10]:
comb['years'] = comb['date_recorded'].apply(date_to_diff)

In [11]:
comb = comb.drop('date_recorded', axis=1)

## Tests

In [12]:
sum(comb['num_private'] == 0)

58643

In [13]:
comb['subvillage']

0            Mnyusi B
1             Nyamara
2             Majengo
3          Mahakamani
4          Kyanyamisa
             ...     
59395        Kiduruni
59396        Igumbilo
59397       Madungulu
59398          Mwinyi
59399    Kikatanyemba
Name: subvillage, Length: 59400, dtype: object

In [14]:
comb['region']

0             Iringa
1               Mara
2            Manyara
3             Mtwara
4             Kagera
            ...     
59395    Kilimanjaro
59396         Iringa
59397          Mbeya
59398         Dodoma
59399       Morogoro
Name: region, Length: 59400, dtype: object

## Drop useless

In [15]:
dropp = [
    'region_code', 'district_code', 'lga', 'ward', 'gps_height', 'num_private',
    'recorded_by', 'scheme_name', 'extraction_type_group', 'scheme_management',
    'payment', 'extraction_type_class', 'quantity', 'waterpoint_type',
    'wpt_name', 'source_type'
]

In [17]:
comb = comb.drop(dropp, axis=1)

In [19]:
comb.head()

Unnamed: 0,amount_tsh,funder,installer,longitude,latitude,basin,subvillage,region,population,public_meeting,...,management_group,payment_type,water_quality,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group,years
0,6000.0,Roman,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,109,True,...,user-group,annually,soft,good,enough,spring,groundwater,communal standpipe,functional,9
1,0.0,Grumeti,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,280,,...,user-group,never pay,soft,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,7
2,25.0,Lottery Club,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,250,True,...,user-group,per bucket,soft,good,enough,dam,surface,communal standpipe,functional,7
3,0.0,Unicef,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,58,True,...,user-group,never pay,soft,good,dry,machine dbh,groundwater,communal standpipe,non functional,7
4,0.0,Action In A,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,0,True,...,other,never pay,soft,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,9


# Main

In [2]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [3]:
pd.set_option('display.max_columns', 500)

# DL

In [11]:
import numpy as np

## Train

In [97]:
data_m = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/trainLabels.csv')

In [98]:
data_m = pd.merge(data_m, train_labels, on='id')
data_m = data_m.drop('id', axis=1)

In [99]:
data_m.head()

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,Lake Nyasa,Mnyusi B,Iringa,11,5,Ludewa,Mundindi,109,True,GeoData Consultants Ltd,VWC,Roman,False,1999,gravity,gravity,gravity,vwc,user-group,pay annually,annually,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,Lake Victoria,Nyamara,Mara,20,2,Serengeti,Natta,280,,GeoData Consultants Ltd,Other,,True,2010,gravity,gravity,gravity,wug,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,Pangani,Majengo,Manyara,21,4,Simanjiro,Ngorika,250,True,GeoData Consultants Ltd,VWC,Nyumba ya mungu pipe scheme,True,2009,gravity,gravity,gravity,vwc,user-group,pay per bucket,per bucket,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,Ruvuma / Southern Coast,Mahakamani,Mtwara,90,63,Nanyumbu,Nanyumbu,58,True,GeoData Consultants Ltd,VWC,,True,1986,submersible,submersible,submersible,vwc,user-group,never pay,never pay,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,Lake Victoria,Kyanyamisa,Kagera,18,1,Karagwe,Nyakasimbi,0,True,GeoData Consultants Ltd,,,True,0,gravity,gravity,gravity,other,other,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


In [117]:
dropp = [
    'region_code', 'district_code', 'lga', 'ward', 'gps_height', 'num_private',
    'recorded_by', 'scheme_name', 'extraction_type_group', 'scheme_management',
    'payment', 'extraction_type_class', 'quantity', 'waterpoint_type',
    'wpt_name', 'source_type'
]
cat_names = [
    'funder', 'installer', 'basin', 'subvillage', 'region', 'public_meeting',
    'management', 'permit', 'extraction_type', 'source', 'management_group',
    'payment_type', 'water_quality', 'source_class', 'status_group','quality_group', 'quantity_group', 'waterpoint_type_group','publicpermit']
cont_names = ['amount_tsh', 'population', 'years_cons', 'years']

In [101]:
def date_to_diff(x):
    x = int(x.split('-')[0])
    if x == 0:
        return 0
    else:
        return 2020 - x


def date_to_diff2(x):
    if x == 0:
        return 0
    else:
        return 2020 - x


def preproc_train(comb, dropp):
    comb['years_cons'] = comb['construction_year'].apply(date_to_diff2)
    comb = comb.drop('construction_year', axis=1)
    comb = comb.drop(dropp, axis=1)
    comb['years'] = comb['date_recorded'].apply(date_to_diff)
    comb = comb.drop('date_recorded', axis=1)
    comb.replace('#na#', np.nan)
    return comb

In [102]:
valid_idx = list(range(300, 400))

In [103]:
data_m = preproc_train(data_m, dropp)
data_m.head()

Unnamed: 0,amount_tsh,funder,installer,longitude,latitude,basin,subvillage,region,population,public_meeting,permit,extraction_type,management,management_group,payment_type,water_quality,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group,years_cons,years
0,6000.0,Roman,Roman,34.938093,-9.856322,Lake Nyasa,Mnyusi B,Iringa,109,True,False,gravity,vwc,user-group,annually,soft,good,enough,spring,groundwater,communal standpipe,functional,21,9
1,0.0,Grumeti,GRUMETI,34.698766,-2.147466,Lake Victoria,Nyamara,Mara,280,,True,gravity,wug,user-group,never pay,soft,good,insufficient,rainwater harvesting,surface,communal standpipe,functional,10,7
2,25.0,Lottery Club,World vision,37.460664,-3.821329,Pangani,Majengo,Manyara,250,True,True,gravity,vwc,user-group,per bucket,soft,good,enough,dam,surface,communal standpipe,functional,11,7
3,0.0,Unicef,UNICEF,38.486161,-11.155298,Ruvuma / Southern Coast,Mahakamani,Mtwara,58,True,True,submersible,vwc,user-group,never pay,soft,good,dry,machine dbh,groundwater,communal standpipe,non functional,34,7
4,0.0,Action In A,Artisan,31.130847,-1.825359,Lake Victoria,Kyanyamisa,Kagera,0,True,True,gravity,other,other,never pay,soft,good,seasonal,rainwater harvesting,surface,communal standpipe,functional,0,9


In [104]:
data_m = data_m.replace(np.nan, '')

In [105]:
data_m['public_meeting'] = data_m['public_meeting'].astype(bool)
data_m['permit'] = data_m['permit'].astype(bool)

In [106]:
data_m['publicpermit'] = data_m['public_meeting'] & data_m['permit']

In [111]:
for a in data_m['payment_type'].unique():
    print(a)

annually
never pay
per bucket
unknown
on failure
other
monthly


In [118]:
for a in cat_names:
    data_m[a] =data_m[a].astype('category').cat.codes 

In [119]:
data_m.head()


Unnamed: 0,amount_tsh,funder,installer,longitude,latitude,basin,subvillage,region,population,public_meeting,permit,extraction_type,management,management_group,payment_type,water_quality,quality_group,quantity_group,source,source_class,waterpoint_type_group,status_group,years_cons,years,publicpermit
0,6000.0,1370,1519,34.938093,-9.856322,1,11808,3,109,1,0,3,7,4,0,6,2,1,8,0,1,0,21,9,0
1,0.0,470,546,34.698766,-2.147466,4,15839,9,280,0,1,3,11,4,2,6,2,2,5,1,1,0,10,7,0
2,25.0,826,2049,37.460664,-3.821329,5,9075,8,250,1,1,3,7,4,5,6,2,1,0,1,1,0,11,7,1
3,0.0,1742,1853,38.486161,-11.155298,7,8983,12,58,1,1,14,7,4,2,6,2,0,3,0,1,2,34,7,1
4,0.0,21,120,31.130847,-1.825359,4,7699,4,0,1,1,3,1,1,2,6,2,3,5,1,1,0,0,9,1


In [179]:
data_m.columns

Index(['amount_tsh', 'funder', 'installer', 'longitude', 'latitude', 'basin',
       'subvillage', 'region', 'population', 'public_meeting', 'permit',
       'extraction_type', 'management', 'management_group', 'payment_type',
       'water_quality', 'quality_group', 'quantity_group', 'source',
       'source_class', 'waterpoint_type_group', 'status_group', 'years_cons',
       'years', 'publicpermit'],
      dtype='object')

In [80]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [120]:
X_train, X_test, Y_train, Y_test = train_test_split(data_m.drop('status_group', axis=1),
                                                    data_m['status_group'],
                                                    test_size=0.2)

## DL

In [112]:
import tensorflow as tf
from tensorflow.keras import metrics
from tensorflow.keras import regularizers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.python.keras.callbacks import TensorBoard
from keras.utils import plot_model
import datetime, os

Using TensorFlow backend.


In [163]:
X_train.shape,Y_train.shape

((47520, 24), (47520,))

In [166]:
t_model = Sequential()
t_model.add(Dense(100, activation="relu",input_shape=(24,)))
t_model.add(Dropout(.2))
t_model.add(Dense(50,activation='relu'))
t_model.add(Dense(1))
t_model.compile(
    loss="mean_squared_error",
    optimizer=Adam(lr=0.001),
    metrics=[metrics.mae])

In [167]:
epochs = 50
batch = 128


history = t_model.fit(
    x = X_train.values,y =  Y_train.values,
    batch_size=batch,
    epochs=epochs,
    shuffle=True,
    verbose=1,
    validation_data=(X_test.values,Y_test.values),
)


Train on 47520 samples, validate on 11880 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50


## RandomForest

In [21]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier

In [93]:
clf = RandomForestClassifier (verbose=10)

In [94]:
clf.fit(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 100
building tree 2 of 100
building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s


building tree 4 of 100
building tree 5 of 100
building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.5s remaining:    0.0s


building tree 7 of 100
building tree 8 of 100
building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.6s remaining:    0.0s


building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    8.3s finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=10, warm_start=False)

In [95]:
clf.score(X_test,Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.2s finished


0.8005892255892256

In [170]:
data_2 = pd.read_csv('data/test.csv')

In [171]:
data_2.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,basin,subvillage,region,region_code,district_code,lga,ward,population,public_meeting,recorded_by,scheme_management,scheme_name,permit,construction_year,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group
0,50785,0.0,2013-02-04,Dmdd,1996,DMDD,35.290799,-4.059696,Dinamu Secondary School,0,Internal,Magoma,Manyara,21,3,Mbulu,Bashay,321,True,GeoData Consultants Ltd,Parastatal,,True,2012,other,other,other,parastatal,parastatal,never pay,never pay,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,other,other
1,51630,0.0,2013-02-04,Government Of Tanzania,1569,DWE,36.656709,-3.309214,Kimnyak,0,Pangani,Kimnyak,Arusha,2,2,Arusha Rural,Kimnyaki,300,True,GeoData Consultants Ltd,VWC,TPRI pipe line,True,2000,gravity,gravity,gravity,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,spring,spring,groundwater,communal standpipe,communal standpipe
2,17168,0.0,2013-02-01,,1567,,34.767863,-5.004344,Puma Secondary,0,Internal,Msatu,Singida,13,2,Singida Rural,Puma,500,True,GeoData Consultants Ltd,VWC,P,,2010,other,other,other,vwc,user-group,never pay,never pay,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,other,other
3,45559,0.0,2013-01-22,Finn Water,267,FINN WATER,38.058046,-9.418672,Kwa Mzee Pange,0,Ruvuma / Southern Coast,Kipindimbi,Lindi,80,43,Liwale,Mkutano,250,,GeoData Consultants Ltd,VWC,,True,1987,other,other,other,vwc,user-group,unknown,unknown,soft,good,dry,dry,shallow well,shallow well,groundwater,other,other
4,49871,500.0,2013-03-27,Bruder,1260,BRUDER,35.006123,-10.950412,Kwa Mzee Turuka,0,Ruvuma / Southern Coast,Losonga,Ruvuma,10,3,Mbinga,Mbinga Urban,60,,GeoData Consultants Ltd,Water Board,BRUDER,True,2000,gravity,gravity,gravity,water board,user-group,pay monthly,monthly,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe


In [172]:
def date_to_diff(x):
    x = int(x.split('-')[0])
    if x == 0:
        return 0
    else:
        return 2020 - x


def date_to_diff2(x):
    if x == 0:
        return 0
    else:
        return 2020 - x


def preproc_train(comb, dropp):
    comb['years_cons'] = comb['construction_year'].apply(date_to_diff2)
    comb = comb.drop('construction_year', axis=1)
    comb = comb.drop(dropp, axis=1)
    comb['years'] = comb['date_recorded'].apply(date_to_diff)
    comb = comb.drop('date_recorded', axis=1)
    comb.replace('#na#', np.nan)
    return comb

In [173]:
data_2 = preproc_train(data_2, dropp)
data_2 = data_2.replace(np.nan, '')
data_2['public_meeting'] = data_2['public_meeting'].astype(bool)
data_2['permit'] = data_2['permit'].astype(bool)
data_2['publicpermit'] = data_2['public_meeting'] & data_2['permit']
data_2.head()

Unnamed: 0,id,amount_tsh,funder,installer,longitude,latitude,basin,subvillage,region,population,public_meeting,permit,extraction_type,management,management_group,payment_type,water_quality,quality_group,quantity_group,source,source_class,waterpoint_type_group,years_cons,years,publicpermit
0,50785,0.0,Dmdd,DMDD,35.290799,-4.059696,Internal,Magoma,Manyara,321,True,True,other,parastatal,parastatal,never pay,soft,good,seasonal,rainwater harvesting,surface,other,8,7,True
1,51630,0.0,Government Of Tanzania,DWE,36.656709,-3.309214,Pangani,Kimnyak,Arusha,300,True,True,gravity,vwc,user-group,never pay,soft,good,insufficient,spring,groundwater,communal standpipe,20,7,True
2,17168,0.0,,,34.767863,-5.004344,Internal,Msatu,Singida,500,True,False,other,vwc,user-group,never pay,soft,good,insufficient,rainwater harvesting,surface,other,10,7,False
3,45559,0.0,Finn Water,FINN WATER,38.058046,-9.418672,Ruvuma / Southern Coast,Kipindimbi,Lindi,250,False,True,other,vwc,user-group,unknown,soft,good,dry,shallow well,groundwater,other,33,7,False
4,49871,500.0,Bruder,BRUDER,35.006123,-10.950412,Ruvuma / Southern Coast,Losonga,Ruvuma,60,False,True,gravity,water board,user-group,monthly,soft,good,enough,spring,groundwater,communal standpipe,20,7,False


In [174]:
for a in cat_names:
    try:
        data_2[a] =data_2[a].astype('category').cat.codes 
    except:
        pass

In [183]:
data_2.head()

Unnamed: 0,id,amount_tsh,funder,installer,longitude,latitude,basin,subvillage,region,population,public_meeting,permit,extraction_type,management,management_group,payment_type,water_quality,quality_group,quantity_group,source,source_class,waterpoint_type_group,years_cons,years,publicpermit,status_group
0,50785,0.0,176,215,35.290799,-4.059696,0,3917,8,321,1,1,9,3,2,2,6,2,3,5,1,5,8,7,1,0.832171
1,51630,0.0,251,222,36.656709,-3.309214,5,2718,0,300,1,1,3,7,4,2,6,2,2,8,0,1,20,7,1,0.832171
2,17168,0.0,0,0,34.767863,-5.004344,0,5399,18,500,1,0,9,7,4,2,6,2,2,5,1,5,10,7,0,0.832171
3,45559,0.0,223,263,38.058046,-9.418672,7,2796,7,250,0,1,9,7,4,6,6,2,0,7,0,5,33,7,0,0.832171
4,49871,500.0,74,76,35.006123,-10.950412,7,3545,16,60,0,1,3,9,4,1,6,2,1,8,0,1,20,7,0,0.832171


In [182]:
data_2.columns

Index(['id', 'amount_tsh', 'funder', 'installer', 'longitude', 'latitude',
       'basin', 'subvillage', 'region', 'population', 'public_meeting',
       'permit', 'extraction_type', 'management', 'management_group',
       'payment_type', 'water_quality', 'quality_group', 'quantity_group',
       'source', 'source_class', 'waterpoint_type_group', 'years_cons',
       'years', 'publicpermit', 'status_group'],
      dtype='object')

In [None]:
Index(['amount_tsh', 'funder', 'installer', 'longitude', 'latitude', 'basin',
       'subvillage', 'region', 'population', 'public_meeting', 'permit',
       'extraction_type', 'management', 'management_group', 'payment_type',
       'water_quality', 'quality_group', 'quantity_group', 'source',
       'source_class', 'waterpoint_type_group', 'status_group', 'years_cons',
       'years', 'publicpermit'],
      dtype='object')

In [184]:
data_2['status_group']= model.predict(data_2.drop(['id'],axis=1))

NameError: name 'model' is not defined

In [89]:
temp_result = data_2[['id','status_group']]

In [90]:
temp_result.head()

Unnamed: 0,id,status_group
0,50785,2
1,51630,0
2,17168,0
3,45559,2
4,49871,0


In [92]:
d_replace = {0:'functional', 2:'non functional',1:'functional needs repair'}

In [93]:
def ret_cod(x):
    return d_replace[x]

In [95]:
temp_result['status_group'] = temp_result['status_group'].apply(ret_cod)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [98]:
temp_result.to_csv('Submission1.csv',index=False)