In [362]:
import pandas as pd
import numpy as np
np.random.seed(42)

In [363]:
train_values = pd.read_csv('../data/raw/training_values.csv')
train_labels = pd.read_csv('../data/raw/training_labels.csv')
train_df = train_values.merge(train_labels, on='id')

test_values = pd.read_csv('../data/raw/test_values.csv')

Many features contain illogical values (e.g. '0' as the name of the funder).  I am replacing these values with NaN, as they were most likely entered as placeholders for 'unknown'.

In [364]:
# Work on copies so the raw frames loaded above are left untouched and this
# cell can be re-run without stacking mutations on the source data.
clean_train = train_df.copy()
clean_test = test_values.copy()

# (column, placeholder) pairs: the placeholder is the value that stands in for
# "unknown" in that column and is replaced with NaN.
placeholder_by_column = [
    ('funder', '0'),
    ('installer', '0'),
    ('longitude', 0),
    ('latitude', -0.00000002),
    ('population', 0),
    ('construction_year', 0),
]

for df in [clean_train, clean_test]:
    for col, placeholder in placeholder_by_column:
        # Assign the result back rather than calling replace(..., inplace=True)
        # on the column selection: in-place mutation through a selection is
        # not guaranteed to reach the parent frame in every pandas version.
        df[col] = df[col].replace(placeholder, np.nan)

Some of the test features contain different conventions for capitalization (e.g. District Council vs. District council).  Thus, I am making all text values lowercase to ensure entries are consistent.

In [365]:
# Lower-case every object-dtype column in both frames so that differently
# capitalised spellings collapse to one value. 'permit' and 'public_meeting'
# are skipped explicitly.
untouched_columns = ('permit', 'public_meeting')
for frame in (clean_train, clean_test):
    object_columns = [
        name for name in frame.columns
        if frame[name].dtype == 'O' and name not in untouched_columns
    ]
    for name in object_columns:
        frame[name] = frame[name].str.lower()

Converting the date recorded to datetime, so that algorithms can split on before/after a certain date.

In [366]:
# Parse the 'date_recorded' column of both frames into pandas datetimes.
for frame in (clean_train, clean_test):
    frame['date_recorded'] = pd.to_datetime(frame['date_recorded'])

In [367]:
# Treat a missing 'public_meeting' flag as True in both frames.
# The filled column is assigned back instead of using fillna(..., inplace=True)
# on the column selection, which is not guaranteed to update the parent frame
# in every pandas version.
for frame in (clean_train, clean_test):
    frame['public_meeting'] = frame['public_meeting'].fillna(value=True)

In [368]:
# Treat a missing 'permit' flag as True in both frames.
# The filled column is assigned back instead of using fillna(..., inplace=True)
# on the column selection, which is not guaranteed to update the parent frame
# in every pandas version.
for frame in (clean_train, clean_test):
    frame['permit'] = frame['permit'].fillna(value=True)

In [369]:
# Fill missing coordinates with the column mean of the frame they belong to.
# The filled column is assigned back instead of using fillna(..., inplace=True)
# on the column selection, which is not guaranteed to update the parent frame
# in every pandas version.
# NOTE(review): the test frame is imputed with its own mean rather than the
# training mean -- confirm this is intended.
for frame in (clean_train, clean_test):
    for coord in ('longitude', 'latitude'):
        frame[coord] = frame[coord].fillna(value=frame[coord].mean())

In [370]:
# train_test_split is otherwise first imported several cells further down;
# importing it here keeps this cell runnable on a fresh kernel
# (Restart & Run All) instead of failing with a NameError.
from sklearn.model_selection import train_test_split

# Predictors fed to the construction_year regressor.
constr_feature_cols = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']
# Categorical columns that are one-hot encoded into the working frames.
constr_dummy_cols = ['basin', 'extraction_type', 'management']
# Columns removed from both working frames; the training frame additionally
# loses its 'status_group' column.
constr_drop_cols = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 'scheme_management', 'scheme_name', 'extraction_type', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'source', 'source_type', 'waterpoint_type', 'recorded_by', 'extraction_type_group', 'extraction_type_class', 'quantity_group', 'source_class', 'waterpoint_type_group', 'region_code', 'district_code', 'id', 'population']

dummies_train = pd.get_dummies(clean_train[constr_dummy_cols])
clean_constr_train = pd.concat([clean_train, dummies_train], axis=1).drop(constr_drop_cols + ['status_group'], axis=1)
dummies_test = pd.get_dummies(clean_test[constr_dummy_cols])
clean_constr_test = pd.concat([clean_test, dummies_test], axis=1).drop(constr_drop_cols, axis=1)

# Only rows with a recorded construction year can serve as training targets.
known_train = clean_constr_train[clean_constr_train['construction_year'].notnull()]
known_test = clean_constr_test[clean_constr_test['construction_year'].notnull()]

# Pool the known rows of both frames (train rows first, then test rows).
constr_y_total = pd.concat([known_train['construction_year'], known_test['construction_year']], axis=0, ignore_index=True).values
constr_X_total = pd.concat([known_train[constr_feature_cols], known_test[constr_feature_cols]], axis=0, ignore_index=True).values

# Hold out part of the pooled rows to score the regressor.
constr_X_train, constr_X_test, constr_y_train, constr_y_test = train_test_split(constr_X_total, constr_y_total)

In [371]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Feature columns, in the same order as the matrix the regressor is fitted on.
impute_features = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']

model_constr = RandomForestRegressor()
model_constr.fit(constr_X_train, constr_y_train)
mse = mean_squared_error(constr_y_test, model_constr.predict(constr_X_test))
# Root-mean-squared error on the held-out rows. The call form of print works
# on both Python 2 and Python 3.
print(np.sqrt(mse))

# Fill the missing construction years of the training frame with predictions.
# .loc replaces the deprecated (and since removed) .ix indexer.
missing_train = clean_train['construction_year'].isnull()
prediction_df_train = clean_train[missing_train]
train_preds = model_constr.predict(prediction_df_train[impute_features].values)
clean_train.loc[missing_train, 'construction_year'] = train_preds

# Same imputation for the test frame.
missing_test = clean_test['construction_year'].isnull()
prediction_df_test = clean_test[missing_test]
test_preds = model_constr.predict(prediction_df_test[impute_features].values)
clean_test.loc[missing_test, 'construction_year'] = test_preds

8.44304013278


Making new column that aggregates all other text columns.  This will be passed into a naive bayes classifier to be used as a feature later on.  The goal of using this technique is to capture some of the signal held by these columns without making dummies for all of them.  Not all columns are included, as some seem to add no new information.  The recorded_by column has the same value for every entry, thus is adding no information.

In [372]:
# Columns whose values are collected, row by row, into the 'text_cols' list.
text_source_cols = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 'scheme_management', 'scheme_name', 'extraction_type', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'source', 'source_type', 'waterpoint_type']
# Further columns that are dropped without contributing to 'text_cols'.
discarded_cols = ['recorded_by', 'extraction_type_group', 'extraction_type_class', 'quantity_group', 'source_class', 'waterpoint_type_group', 'region_code', 'district_code']

for frame in (clean_train, clean_test):
    # One list of raw values per row, then remove the source columns in place.
    frame['text_cols'] = frame[text_source_cols].values.tolist()
    frame.drop(text_source_cols + discarded_cols, axis=1, inplace=True)

In [373]:
# Collapse each row's list of values into one space-separated string; every
# value is passed through str() first.
for frame in (clean_train, clean_test):
    frame['text_cols'] = frame['text_cols'].map(lambda values: ' '.join(map(str, values)))

All columns other than 'text_cols' are now numerical.  The following is predicting the class of the well based on the text columns using naive bayes.

In [374]:
from sklearn.model_selection import train_test_split

# Documents (joined text) and labels for the naive Bayes classifier.
naive_X, naive_y = clean_train['text_cols'].values, clean_train['status_group'].values

# Stratified split so both parts keep the label proportions of naive_y.
naive_split = train_test_split(naive_X, naive_y, stratify=naive_y)
naive_X_train, naive_X_test, naive_y_train, naive_y_test = naive_split

In [375]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def tokenize_snow(doc):
    """Return the tokens of ``doc`` stemmed with the English Snowball stemmer.

    ``doc`` is lower-cased and split with ``word_tokenize``; the result is a
    list holding one stemmed string per token.
    """
    stem = SnowballStemmer('english').stem
    tokens = word_tokenize(doc.lower())
    return list(map(stem, tokens))

def tokenize_port(doc):
    """Return the tokens of ``doc`` stemmed with the Porter stemmer.

    ``doc`` is lower-cased and split with ``word_tokenize``; the result is a
    list holding one stemmed string per token.
    """
    stem = PorterStemmer().stem
    tokens = word_tokenize(doc.lower())
    return list(map(stem, tokens))

def tokenize_wordnet(doc):
    """Return the tokens of ``doc`` lemmatized with the WordNet lemmatizer.

    ``doc`` is lower-cased and split with ``word_tokenize``; the result is a
    list holding one lemmatized string per token.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    tokens = word_tokenize(doc.lower())
    return list(map(lemmatize, tokens))

In [376]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def make_model(tokenize_func):
    """Fit a TF-IDF + multinomial naive Bayes text classifier.

    The vectorizer uses ``tokenize_func`` as its tokenizer together with the
    built-in English stop-word list. It is fitted on the notebook-level
    ``naive_X_train`` documents, and the classifier is fitted on the resulting
    matrix with ``naive_y_train`` as labels.

    Returns:
        A ``(model, vectorizer)`` tuple holding the fitted classifier and the
        fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenize_func, stop_words='english')
    train_matrix = vectorizer.fit_transform(naive_X_train)
    model = MultinomialNB().fit(train_matrix, naive_y_train)
    return model, vectorizer

In [377]:
# Fit the Snowball-stemmed variant and report its accuracy on the held-out
# split. The call form of print works on both Python 2 and Python 3.
model_snow, vect_snow = make_model(tokenize_snow)
print(model_snow.score(vect_snow.transform(naive_X_test), naive_y_test))

0.695555555556


In [378]:
# model_port, vect_port = make_model(tokenize_port)
# print model_port.score(vect_port.transform(naive_X_test), naive_y_test)

In [379]:
# model_wordnet, vect_wordnet = make_model(tokenize_wordnet)
# print model_wordnet.score(vect_wordnet.transform(naive_X_test), naive_y_test)

In [380]:
# Vectorize every training document and every test document with the fitted
# vectorizer, then store the predicted class as a new 'naive_bayes' column.
# NOTE(review): naive_X also contains the rows the classifier was fitted on,
# so part of the training-frame predictions are in-sample.
train_text_matrix = vect_snow.transform(naive_X)
test_text_matrix = vect_snow.transform(clean_test['text_cols'].values)
clean_train['naive_bayes'] = model_snow.predict(train_text_matrix)
clean_test['naive_bayes'] = model_snow.predict(test_text_matrix)

In [381]:
# One-hot encode the predicted class; the first level is dropped and acts as
# the baseline.
naive_dummies_train = pd.get_dummies(clean_train['naive_bayes'], drop_first=True)
# Encode the test predictions in full and then align them to the training
# dummy columns. Encoding the two frames independently with drop_first=True
# yields different columns whenever the sets of predicted classes differ;
# aligning keeps the schemas identical (a class absent from the test
# predictions becomes an all-zero column).
naive_dummies_test = (
    pd.get_dummies(clean_test['naive_bayes'])
    .reindex(columns=naive_dummies_train.columns, fill_value=0)
)

# Replace the raw text and the predicted-label column with the dummy columns.
clean_train = pd.concat([clean_train.drop(['text_cols', 'naive_bayes'], axis=1), naive_dummies_train], axis=1)
clean_test = pd.concat([clean_test.drop(['text_cols', 'naive_bayes'], axis=1), naive_dummies_test], axis=1)

In [382]:
# clean_train['longitude'].fillna(value=clean_train['longitude'].mean(), inplace=True)
# clean_test['longitude'].fillna(value=clean_test['longitude'].mean(), inplace=True)
# clean_train['latitude'].fillna(value=clean_train['latitude'].mean(), inplace=True)
# clean_test['latitude'].fillna(value=clean_test['latitude'].mean(), inplace=True)

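I believe that population and construction year could be important features, so I am hesitant to drop them due to NaNs.  They are also quite variable, so I don't think it's a good idea to fill them with the mean or mode.  Thus, I am constructing K-nearest neighbors regressions for both of the columns and using them to fill the NaNs.  (Note: the K-nearest neighbors cells below are currently commented out; construction year is instead filled by the random forest regression earlier in the notebook, and the population column is dropped.)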

In [383]:
# constr_y_train = clean_train[~clean_train['construction_year'].isnull()]['construction_year']
# constr_y_test = clean_test[~clean_test['construction_year'].isnull()]['construction_year']
# constr_y_total = pd.concat([constr_y_train, constr_y_test], axis=0, ignore_index=True).values

# constr_X_train = clean_train[~clean_train['construction_year'].isnull()][['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']]
# constr_X_test = clean_test[~clean_test['construction_year'].isnull()][['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']]
# constr_X_total = pd.concat([constr_X_train, constr_X_test], axis=0, ignore_index=True).values

# constr_X_train, constr_X_test, constr_y_train, constr_y_test = train_test_split(constr_X_total, constr_y_total)

In [384]:
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.metrics import mean_squared_error

# model_constr = KNeighborsRegressor()
# model_constr.fit(constr_X_train, constr_y_train)
# mse = mean_squared_error(constr_y_test, model_constr.predict(constr_X_test))
# print np.sqrt(mse)

# prediction_df_train = clean_train[clean_train['construction_year'].isnull()]
# train_preds = model_constr.predict(prediction_df_train[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']].values)
# clean_train.ix[clean_train['construction_year'].isnull(), 'construction_year'] = train_preds

# prediction_df_test = clean_test[clean_test['construction_year'].isnull()]
# test_preds = model_constr.predict(prediction_df_test[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'public_meeting', 'permit']].values)
# clean_test.ix[clean_test['construction_year'].isnull(), 'construction_year'] = test_preds

# clean_train.info()
# clean_test.info()

In [385]:
# Remove the 'population' column from both frames in place.
for frame in (clean_train, clean_test):
    frame.drop(['population'], axis=1, inplace=True)

In [386]:
# Write both processed frames to disk with to_csv's default options.
processed_outputs = (
    (clean_train, '../data/processed/clean_train_df.csv'),
    (clean_test, '../data/processed/clean_test_df.csv'),
)
for frame, destination in processed_outputs:
    frame.to_csv(destination)