In [None]:
import pandas as pd

In [None]:
%%capture

import sys

# If you're on Colab:
if 'google.colab' in sys.modules:
    DATA_PATH = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Kaggle-Challenge/master/data/'
    !pip install category_encoders==2.*
    !pip install pandas-profiling==2.*

# If you're working locally:
else:
    DATA_PATH = '../data/'

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Join the training features with their labels (pandas merges on the
# columns the two files have in common)
train = pd.read_csv(DATA_PATH + 'waterpumps/train_features.csv').merge(
    pd.read_csv(DATA_PATH + 'waterpumps/train_labels.csv'))

# Load the test features and the submission template
test = pd.read_csv(DATA_PATH + 'waterpumps/test_features.csv')
sample_submission = pd.read_csv(DATA_PATH + 'waterpumps/sample_submission.csv')

# Hold out 20% of the labeled rows for validation, stratified on the target
train, val = train_test_split(
    train,
    test_size=0.20,
    train_size=0.8,
    random_state=42,
    stratify=train['status_group'],
)


def wrangle(X):
    """Clean a train, validation, or test frame with one shared recipe.

    Works on a copy, so the frame passed in is left untouched.

    Steps, in order:
      * replace the -2e-08 latitude placeholder with 0,
      * convert zeros to NaN in longitude, latitude, construction_year,
        gps_height and population, adding a boolean ``<col>_MISSING``
        flag column for each,
      * drop the duplicate columns quantity_group and payment_type,
      * drop recorded_by (never varies), id (always varies), and also
        wpt_name, quality_group and region_code,
      * cast district_code to object dtype.

    Returns the wrangled DataFrame.
    """
    X = X.copy()

    # Fold the near-zero latitude placeholder into 0 so the loop below
    # converts it to NaN together with the genuine zeros.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)

    # Zeros in these columns are treated as missing values.
    for name in ('longitude', 'latitude', 'construction_year',
                 'gps_height', 'population'):
        without_zeros = X[name].replace(0, np.nan)
        X[name] = without_zeros
        X[name + '_MISSING'] = without_zeros.isnull()

    # Remove the duplicate columns and the unused columns in one pass.
    to_drop = [
        'quantity_group', 'payment_type',
        'recorded_by', 'id', 'wpt_name', 'quality_group', 'region_code',
    ]
    X = X.drop(columns=to_drop)

    # Store the district code as object dtype rather than as an integer.
    X['district_code'] = X['district_code'].astype(object)
    return X



# Run all three splits through the same wrangle() recipe
train, val, test = (wrangle(frame) for frame in (train, val, test))

In [None]:
# Count training rows for every (subvillage, region) pair
pd.crosstab(train['subvillage'], train['region'])

In [None]:


# Pass every validation label through label_map.
# NOTE(review): `label_map` is not defined in any visible cell — confirm it
# exists before this cell runs on a fresh kernel.
val['status_group'] = val.status_group.apply(label_map)

In [None]:
# 'recorded_by','id','wpt_name','date_recorded','funder', 'installer', 'basin', 'subvillage', 'region', 'lga', 'ward', 'public_meeting',
#                          'scheme_management', 'scheme_name', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management',
#                          'management_group', 'payment', 'water_quality', 'quality_group', 'quantity', 'source', 'source_type', 'source_class', 'waterpoint_type',
#                          'waterpoint_type_group', 'longitude_MISSING', 'latitude_MISSING', 'construction_year_MISSING', 'gps_height_MISSING', 'population_MISSING']

In [None]:
# Show the dtype of every column in the training frame.
# NOTE(review): the saved output lists region_code and an int64
# district_code, but wrangle() drops the former and casts the latter to
# object — the output looks stale; re-run to refresh.
train.dtypes

amount_tsh                   float64
date_recorded                 object
funder                        object
gps_height                   float64
installer                     object
longitude                    float64
latitude                     float64
num_private                    int64
basin                         object
subvillage                    object
region                        object
region_code                    int64
district_code                  int64
lga                           object
ward                          object
population                   float64
public_meeting                object
scheme_management             object
scheme_name                   object
permit                        object
construction_year            float64
extraction_type               object
extraction_type_group         object
extraction_type_class         object
management                    object
management_group              object
payment                       object
w

In [None]:
# wrangle() drops `quality_group`, so indexing it here raises KeyError when
# the notebook is run top to bottom. Inspect the water-quality column that
# wrangle() keeps instead.
train['water_quality'].value_counts()

good        40598
salty        4179
unknown      1503
milky         658
colored       403
fluoride      179
Name: quality_group, dtype: int64

In [None]:
# Pass every training label through label_map.
# NOTE(review): `label_map` is not defined in any visible cell — confirm it
# exists before this cell runs on a fresh kernel.
train['status_group'] = train.status_group.apply(label_map)

In [None]:
# Name of the label column
target = 'status_group'

# Feature candidates: every training column other than the label
train_features = train.drop(columns=[target])

# All numeric columns are kept as features
numeric_features = list(train_features.select_dtypes(include='number').columns)

# Number of distinct values in each non-numeric column
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Keep only the non-numeric columns with at most 50 distinct values
categorical_features = [name for name, n_unique in cardinality.items()
                        if n_unique <= 50]

# Final feature list: numeric columns first, then the low-cardinality ones
features = [*numeric_features, *categorical_features]
cardinality

date_recorded                  349
funder                        1716
installer                     1929
basin                            9
subvillage                   17231
region                          21
district_code                   20
lga                            124
ward                          2082
public_meeting                   2
scheme_management               12
scheme_name                   2563
permit                           2
extraction_type                 18
extraction_type_group           13
extraction_type_class            7
management                      12
management_group                 5
payment                          7
water_quality                    8
quantity                         5
source                          10
source_type                      7
source_class                     3
waterpoint_type                  7
waterpoint_type_group            6
longitude_MISSING                2
latitude_MISSING                 2
construction_year_MI

In [None]:
# Build the feature matrix / target vector pair for train and validation;
# only the feature matrix is taken from test.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test = test[features]

In [None]:
# TODO

# Select K-Best results in a decrease in accuracy over the vanilla RandomForest.
%%time
import category_encoders as ce
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest

# Are selection methods like selectkbest necessary with tree based models? \\Doesnt the model do a good job itself of determining what features are relevant or not?
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    SelectKBest(k='all'),
    RandomForestClassifier(random_state=55, n_jobs=-1)
)

pipeline.fit(X_train, y_train)
print ('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8053030303030303
CPU times: user 14.2 s, sys: 256 ms, total: 14.4 s
Wall time: 8.02 s


In [None]:
# TODO
# Use every column except the target as model input (not just `features`)
X_train, y_train = train.drop(columns=[target]), train[target]
X_val, y_val = val.drop(columns=[target]), val[target]
X_test = test

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
# Ordinal-encode, mean-impute and standardize before the random forest
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(random_state=66),
)

# Fit on the training split and report accuracy on the validation split
pipeline.fit(X_train, y_train)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8088383838383838


In [None]:
submission

In [None]:
submission.to_csv('50Forest.csv', index=False)

In [None]:
y_pred = pipeline.predict(X_test)

In [None]:
submission = sample_submission.copy()
submission['status_group']=y_pred
submission.to_csv('jeremy_submission_04.csv', index=False)

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import IncrementalPCA
# Same preprocessing as before, with a 500-tree random forest
pipeline = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    RandomForestClassifier(n_estimators=500, random_state=55),
)

# Fit on the training split and report accuracy on the validation split
pipeline.fit(X_train, y_train)
print('Validation Accuracy', pipeline.score(X_val, y_val))

Validation Accuracy 0.8107744107744108


In [None]:
# Predict on the test set, put the predictions into a copy of the
# submission template, and save it
y_pred = pipeline.predict(X_test)
submission = sample_submission.assign(status_group=y_pred)
submission.to_csv('COORDS_jeremy_submission_01.csv', index=False)