<a href="https://colab.research.google.com/github/ewuerfel66/DS-Unit-2-Sprint-2-Regression/blob/master/TanzanianWaterPumps_EricWuerfel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tanzania Water Pumps

## Imports

In [0]:
# !pip install -U pandas-profiling
!pip install category-encoders

Collecting category-encoders
[?25l  Downloading https://files.pythonhosted.org/packages/6e/a1/f7a22f144f33be78afeb06bfa78478e8284a64263a3c09b1ef54e673841e/category_encoders-2.0.0-py2.py3-none-any.whl (87kB)
[K     |████████████████████████████████| 92kB 4.6MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.0.0


In [0]:
# libraries
import pandas as pd
import pandas_profiling
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import category_encoders as ce

In [0]:
# Data
LOCAL = '../data/tanzania/'
WEB = 'https://raw.githubusercontent.com/LambdaSchool/DS-Unit-2-Linear-Models/master/data/tanzania/'

train_features = pd.read_csv(WEB + 'train_features.csv')
train_labels = pd.read_csv(WEB + 'train_labels.csv')
test_features = pd.read_csv(WEB + 'test_features.csv')
sample_submission = pd.read_csv(WEB + 'sample_submission.csv')

assert train_features.shape == (59400, 40)
assert train_labels.shape == (59400, 2)
assert test_features.shape == (14358, 40)
assert sample_submission.shape == (14358, 2)

## Data Exploration & Cleaning

In [0]:
def wrangle(X):
    """Wrangles train, validate, and test sets in the same way"""
    X = X.copy()
    
    # About 3% of the time, latitude has small values near zero,
    # outside Tanzania, so we'll treat these values like zero.
    X['latitude'] = X['latitude'].replace(-2e-08, 0)
    
    # When columns have zeros and shouldn't, they are like null values.
    # So we will replace them with the column mean.
    cols_with_zeros = ['construction_year', 'longitude', 'latitude']
    for col in cols_with_zeros:
        X[col] = X[col].replace(0, np.nan)
        X[col] = X[col].fillna(X[col].mean())
        
    # Convert date_recorded to datetime
    X['date_recorded'] = pd.to_datetime(X['date_recorded'], infer_datetime_format=True)
    
    # Extract year from date_recorded
    X['year_recorded'] = X['date_recorded'].dt.year
    
    # quantity & quantity_group are duplicates, so drop one
    X = X.drop(columns='quantity_group')
    
    # for categoricals with missing values, fill with the category 'MISSING'
    categoricals = X.select_dtypes(exclude='number').columns
    for col in categoricals:
        X[col] = X[col].fillna('MISSING')
    
    return X

In [0]:
train_features = wrangle(train_features)
test_features = wrangle(test_features)

## Majority Class Baseline

In [0]:
y_train = train_labels['status_group']
y_train.value_counts(normalize=True)

functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

`functional` is the majority class.

In [0]:
# Our baseline model
majority_class = y_train.mode()[0]
y_pred = [majority_class] * len(y_train)

In [0]:
# Check the accuracy score
accuracy_score(y_train, y_pred)

0.543080808080808

## Train/Validate/Test Split

### *RUN THIS TO RESET TRAINING AND VALIDATION SETS*

In [0]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Doublecheck Stratification
assert(y_train.value_counts(normalize=True)[0] - y_val.value_counts(normalize=True)[0] < 0.01)
print('Stratification is fine')

Stratification is fine


## Basic Logistic Regression

In [0]:
# Only take numeric data
X_train_numeric = X_train.select_dtypes('number')
X_val_numeric = X_val.select_dtypes('number')

In [0]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train_numeric, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Make predictions
y_pred = model.predict(X_val_numeric)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.5572390572390572

## Logistic Regression
* Numeric features except `id`
* OneHotEncode `quantity`

In [0]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')

# Add quantity
features = numeric_features
features.append('source')

features

['amount_tsh',
 'gps_height',
 'longitude',
 'latitude',
 'num_private',
 'region_code',
 'district_code',
 'population',
 'construction_year',
 'source']

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [0]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Make predictions
y_pred = model.predict(X_val)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.5773569023569024

## Logistic Regression
* Numeric features except `id`
* OneHotEncode ``

In [0]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')
numeric_features.remove('num_private')

# Add features to be encoded
features = numeric_features
features.append('quantity')
features.append('installer')
features.append('extraction_type')
features.append('source')

# features

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [0]:
# Instantiate model
model = LogisticRegression(solver='lbfgs',
                         multi_class='auto',
                         max_iter=10000)

# Fit model
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
# Make predictions
y_pred = model.predict(X_val)

# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.7494949494949495

In [0]:
X_val.shape

(11880, 2187)

## Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier

In [0]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')
numeric_features.remove('num_private')

# Add features to be encoded
features = numeric_features
features.append('quantity')
features.append('installer')
features.append('extraction_type')
features.append('source')

# features

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [0]:
model = DecisionTreeClassifier(min_samples_leaf = 24)

In [0]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=24, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
y_train_pred = model.predict(X_train)
y_pred = model.predict(X_val)

In [0]:
# Test the accuracy of the predictions
print("Train Accuracy:", accuracy_score(y_train, y_train_pred))
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

Train Accuracy: 0.8023400673400674
Validation Accuracy: 0.8026094276094277


## Random Forest

In [0]:
from sklearn.ensemble import RandomForestRegressor

In [0]:
# Train/Test already Split
X_train = train_features
y_train = train_labels['status_group']

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  train_size=0.8,
                                                  test_size=0.2,
                                                  stratify=y_train,
                                                  random_state=66)

# Check the shape
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((47520, 40), (11880, 40), (47520,), (11880,))

In [0]:
# Find all the numeric features
X_train_numeric = X_train.select_dtypes(include='number')
numeric_features = list(X_train_numeric.columns.values)
numeric_features.remove('id')
numeric_features.remove('num_private')

# Add features to be encoded
features = numeric_features
features.append('quantity')
features.append('installer')
features.append('extraction_type')
features.append('source')

# features

In [0]:
# Train/Test already Split
X_train = train_features[features]
y_train = train_labels['status_group']

# Remove features we don't want from X_val
X_val = X_val[features]

# OneHotEncode `quantity`
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

# Scale the data (Unit Normal)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [0]:
model = RandomForestRegressor()

In [0]:
model.fit(X_train, y_train)



ValueError: ignored

In [0]:
y_pred = model.predict(X_val)

In [0]:
# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

NameError: ignored

## XGBoost

In [0]:
!pip install xgboost



In [0]:
from xgboost import XGBClassifier

In [0]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
y_pred = model.predict(X_val)

In [0]:
# Test the accuracy of the predictions
accuracy_score(y_val, y_pred)

0.7406565656565657

## Export a Submission

In [0]:
# Create & process y_test
X_test = test_features[features]

# Encode
X_test = encoder.transform(X_test)

# Scale to Unit Normal
X_test = scaler.transform(X_test)

In [0]:
# Make predictions
y_pred = model.predict(X_test)

In [0]:
# Send predictions to df
submission_df = pd.DataFrame()

# Add columns
submission_df['id'] = True
submission_df['id'] = test_features['id'].tolist()

submission_df['status_group'] = True
submission_df['status_group'] = y_pred

submission_df.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional


In [0]:
submission_df.to_csv('tree_1.1', index=False)