In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Read the raw training features and labels, then join them on the shared 'id' key.
train_values = pd.read_csv('../data/raw/training_values.csv')
train_labels = pd.read_csv('../data/raw/training_labels.csv')
train_df = pd.merge(train_values, train_labels, on='id')

# The test set ships with features only.
test_values = pd.read_csv('../data/raw/test_values.csv')

Many features have illogical values (e.g. '0' as the name of the funder). I am replacing these illogical values with NaN, as they were likely entered as placeholders for 'unknown'.

In [3]:
# NOTE: these are aliases, not copies -- every change made to clean_train /
# clean_test is also visible through train_df / test_values.
clean_train = train_df
clean_test = test_values

# Column -> value that is treated as a stand-in for "unknown" in that column.
PLACEHOLDERS = {
    'funder': '0',
    'installer': '0',
    'longitude': 0,
    'latitude': -0.00000002,
    'population': 0,
    'construction_year': 0,
}

for df in [clean_train, clean_test]:
    for col, placeholder in PLACEHOLDERS.items():
        # Assign the result back instead of calling
        # df[col].replace(..., inplace=True): the in-place form operates on a
        # temporary column selection, which emits a chained-assignment warning
        # on recent pandas and leaves df untouched under Copy-on-Write.
        df[col] = df[col].replace(placeholder, np.nan)

Some of the test features contain different conventions for capitalization (e.g. District Council vs. District council).  Thus, I am making all text values lowercase to ensure entries are consistent.

In [4]:
# Object-typed columns that are deliberately left out of the lowercasing pass.
skip_lowercase = ['permit', 'public_meeting']

for frame in (clean_train, clean_test):
    # Pick every remaining object-dtype column, then normalise its case.
    object_columns = [name for name in frame.columns
                      if name not in skip_lowercase and frame[name].dtype == 'O']
    for name in object_columns:
        frame[name] = frame[name].str.lower()

Converting the date_recorded column to datetime, so that algorithms can split on before/after a certain date.

In [5]:
# Parse the date_recorded column of both frames into datetime values.
for frame in (clean_train, clean_test):
    frame['date_recorded'] = pd.to_datetime(frame['date_recorded'])

Making a new column that aggregates the other text columns. This column will be passed into a naive Bayes classifier to be used as a feature later on. The goal of this technique is to capture some of the signal held by these columns without making dummies for all of them. Not all columns are included, as some seem to add no new information; for example, the recorded_by column has the same value for every entry.

In [6]:
# Columns whose values are collected, row by row, into the new 'text_cols' list.
TEXT_FEATURES = [
    'funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga',
    'ward', 'scheme_management', 'scheme_name', 'extraction_type', 'management',
    'management_group', 'payment', 'payment_type', 'water_quality',
    'quality_group', 'quantity', 'source', 'source_type', 'waterpoint_type',
]

# Columns that are dropped without being folded into 'text_cols'.
UNUSED_FEATURES = [
    'recorded_by', 'extraction_type_group', 'extraction_type_class',
    'quantity_group', 'source_class', 'waterpoint_type_group', 'region_code',
    'district_code',
]

# Apply the same transformation to both frames from one definition of the
# column lists, so train and test cannot drift apart.
for df in [clean_train, clean_test]:
    df['text_cols'] = df[TEXT_FEATURES].values.tolist()
    # Dropped in place so that any other name bound to the same frame object
    # keeps seeing the same columns as before this refactor.
    df.drop(TEXT_FEATURES + UNUSED_FEATURES, axis=1, inplace=True)

In [7]:
# Collapse each row's list of values into one space-separated string;
# every value is passed through str() first, so non-string entries are kept.
for frame in (clean_train, clean_test):
    frame['text_cols'] = frame['text_cols'].map(lambda parts: ' '.join(map(str, parts)))

All columns other than 'text_cols' are now numerical. The following cells predict the class of each well from the text column using naive Bayes.

In [8]:
from sklearn.model_selection import train_test_split

# Features are the aggregated text; the target is the well's status group.
naive_y = clean_train['status_group'].values
naive_X = clean_train['text_cols'].values

# Fix the seed so the split -- and every accuracy computed from it -- is the
# same on each Restart & Run All instead of changing from run to run.
naive_X_train, naive_X_test, naive_y_train, naive_y_test = train_test_split(
    naive_X, naive_y, random_state=42)

In [13]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def tokenize_snow(doc):
    """Lowercase ``doc``, split it with ``word_tokenize`` and return the
    list of Snowball ('english') stems, one per token."""
    stemmer = SnowballStemmer('english')
    tokens = word_tokenize(doc.lower())
    return list(map(stemmer.stem, tokens))

def tokenize_port(doc):
    """Lowercase ``doc``, split it with ``word_tokenize`` and return the
    list of Porter stems, one per token."""
    stemmer = PorterStemmer()
    tokens = word_tokenize(doc.lower())
    return list(map(stemmer.stem, tokens))

def tokenize_wordnet(doc):
    """Lowercase ``doc``, split it with ``word_tokenize`` and return the
    list of WordNet lemmas, one per token."""
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(doc.lower())
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def make_model(tokenize_func):
    """Fit a TF-IDF + multinomial naive Bayes model on the training split.

    Parameters
    ----------
    tokenize_func : callable
        Passed to ``TfidfVectorizer`` as its ``tokenizer``.

    Returns
    -------
    (model, vectorizer)
        The ``MultinomialNB`` fitted on the TF-IDF matrix of the module-level
        ``naive_X_train`` / ``naive_y_train``, and the fitted vectorizer that
        produced that matrix.
    """
    vect = TfidfVectorizer(tokenizer=tokenize_func, stop_words='english')
    train_matrix = vect.fit_transform(naive_X_train)
    # fit() returns the estimator itself, so it can be chained.
    classifier = MultinomialNB().fit(train_matrix, naive_y_train)
    return classifier, vect

def compare_models(list_models, list_vectorizers):
    """Print the hold-out accuracy of each (model, vectorizer) pair.

    Parameters
    ----------
    list_models : sequence
        Fitted classifiers exposing ``predict``.
    list_vectorizers : sequence
        Fitted vectorizers exposing ``transform``; paired with
        ``list_models`` by position.

    Each pair is scored with ``accuracy_score`` on the module-level
    ``naive_X_test`` / ``naive_y_test`` split, and models are numbered
    from 1 in the printed output. Nothing is returned.
    """
    pairs = zip(list_models, list_vectorizers)
    for i, (model, vect) in enumerate(pairs, start=1):
        preds = model.predict(vect.transform(naive_X_test))
        # print(...) with a single argument behaves the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print('Model {}\'s Accuracy: {}'.format(i, accuracy_score(naive_y_test, preds)))

# Fit one model per tokenizer (Snowball, Porter, WordNet -- in that order).
fitted = list(map(make_model, [tokenize_snow, tokenize_port, tokenize_wordnet]))
(model_snow, vect_snow), (model_port, vect_port), (model_wordnet, vect_wordnet) = fitted

# Report the accuracy of each model with its matching vectorizer.
vectorizers = [vect_snow, vect_port, vect_wordnet]
models = [model_snow, model_port, model_wordnet]
compare_models(models, vectorizers)