# PumpItUp
## DrivenDataCompetition PumpItUp
- All data files are expected to be in the same folder as this notebook

In [1]:
import pandas as pd
import matplotlib as plt
import sklearn
#import seaborn as sns
import numpy as np

In [24]:
# Load the feature table and the label table, then join them on the shared 'id' key.
valuesFile = '4910797b-ee55-40a7-8668-10efd5c1b960.csv'
labelsFile = '0bf8bc6e-30d0-4c50-956a-603fc693d966.csv'
trainingValues = pd.read_csv(valuesFile)
trainingLabels = pd.read_csv(labelsFile)
trainingData = trainingValues.merge(trainingLabels, on='id')
# Preview the first rows of the merged frame.
trainingData.head()

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,num_private,...,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,status_group
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,0,...,soft,good,enough,enough,spring,spring,groundwater,communal standpipe,communal standpipe,functional
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,0,...,soft,good,insufficient,insufficient,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,0,...,soft,good,enough,enough,dam,dam,surface,communal standpipe multiple,communal standpipe,functional
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,0,...,soft,good,dry,dry,machine dbh,borehole,groundwater,communal standpipe multiple,communal standpipe,non functional
4,19728,0.0,2011-07-13,Action In A,0,Artisan,31.130847,-1.825359,Shuleni,0,...,soft,good,seasonal,seasonal,rainwater harvesting,rainwater harvesting,surface,communal standpipe,communal standpipe,functional


# Investigation and Cleansing of data
Possible data issues:
* `population == 0` -> Maybe delete all rows where the population is 0?
* `num_private` is not in the feature description -> Maybe delete the column?
* Object-typed (string) data needs to be converted to numeric category codes

In [3]:
def cleanData(data : pd.DataFrame, columns : list):
    """Frequency-encode the given text columns and return a new frame.

    For every column named in ``columns``:
      1. missing values are replaced by the placeholder ``'unknown'``,
      2. values are converted to lower-case strings, so spellings that differ
         only by case (e.g. ``'GRUMETI'`` / ``'Grumeti'``) count as one value,
      3. each value is replaced by the number of rows that share it.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to clean. It is not modified; a copy is returned.
    columns : list
        Names of the columns to frequency-encode.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` in which the listed columns hold integer counts.
    """
    # Work on a copy so the caller's frame is never partially modified.
    cleaned = data.copy()

    # Dropping rows with population == 0 and dropping 'num_private' were
    # considered but are intentionally not applied here.
    for columnName in columns:
        normalized = cleaned[columnName].fillna('unknown').astype(str).str.lower()
        # value_counts() is indexed by value, so mapping through it yields
        # the per-row frequency without a slow frame-wide replace().
        cleaned[columnName] = normalized.map(normalized.value_counts())
    return cleaned
    
def createCatColumnsWithCodes(data : pd.DataFrame):
    """Add an integer-coded ``<name>_cat`` column for every object column.

    Every column of dtype ``object`` is converted to pandas ``category`` dtype
    and a companion column ``<name>_cat`` holding the category codes is
    appended. Missing values receive the code ``-1`` (pandas convention).

    Parameters
    ----------
    data : pd.DataFrame
        Input frame. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the object columns as categoricals plus the appended
        ``*_cat`` code columns. If ``data`` has no object columns it is
        returned as-is.
    """
    cat_columns = data.select_dtypes(['object']).columns
    if len(cat_columns) == 0:
        return data

    # astype() with a mapping returns a new frame, so the caller's frame keeps
    # its original dtypes.
    encoded = data.astype({name: 'category' for name in cat_columns})
    cat_codes = encoded[cat_columns].apply(lambda column: column.cat.codes)
    # The code columns share names with the originals; rsuffix disambiguates.
    return encoded.join(cat_codes, rsuffix='_cat')


# Frequency-encode the high-cardinality text columns, then add integer
# category codes for every remaining object column (labels included).
frequencyColumns = ['funder', 'installer','wpt_name','basin']
trainingData = (
    trainingData
    .pipe(cleanData, frequencyColumns)
    .pipe(createCatColumnsWithCodes)
)


In [4]:
# Preview the first five rows of the cleaned and encoded frame
trainingData.head(n=5)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,...,water_quality_cat,quality_group_cat,quantity_cat,quantity_group_cat,source_cat,source_type_cat,source_class_cat,waterpoint_type_cat,waterpoint_type_group_cat,status_group_cat
0,69572,6000.0,2011-03-14,Roman,1390,Roman,34.938093,-9.856322,none,Lake Nyasa,...,6,2,1,1,8,6,0,1,1,0
1,8776,0.0,2013-03-06,Grumeti,1399,GRUMETI,34.698766,-2.147466,Zahanati,Lake Victoria,...,6,2,2,2,5,3,1,1,1,0
2,34310,25.0,2013-02-25,Lottery Club,686,World vision,37.460664,-3.821329,Kwa Mahundi,Pangani,...,6,2,1,1,0,1,1,2,1,0
3,67743,0.0,2013-01-28,Unicef,263,UNICEF,38.486161,-11.155298,Zahanati Ya Nanyumbu,Ruvuma / Southern Coast,...,6,2,0,0,3,0,0,2,1,2
5,9944,20.0,2011-03-13,Mkinga Distric Coun,0,DWE,39.172796,-4.765587,Tajiri,Pangani,...,4,4,1,1,4,2,2,2,1,0


In [None]:
# These imports are local to the cell: the notebook's import cell binds `plt`
# to the top-level matplotlib package and has seaborn commented out, so
# neither pyplot nor `sns` is guaranteed to be available from there.
from matplotlib import pyplot
import seaborn as sns

# Describe data. display() is needed because only a cell's last expression is
# rendered automatically (display is provided by the IPython kernel).
display(trainingData.describe())

# Check correlation. Only numeric/boolean columns have a defined correlation;
# calling corr() on the categorical columns raises on current pandas.
numericData = trainingData.select_dtypes(include=['number', 'bool'])
fig, ax = pyplot.subplots(figsize=(25, 25))
ax.set_title('Feature correlation')
sns.heatmap(numericData.corr(), annot = True, fmt='.2g', cmap= 'coolwarm', ax=ax);

In [6]:
from sklearn.model_selection import train_test_split

# Target vector: the integer-coded status group.
labels = trainingData['status_group_cat']

# Predictors: everything except the target and the row identifier ...
features = trainingData.drop(columns=['status_group_cat', 'id'])
# ... and without the categorical originals, which are superseded by their
# integer-coded *_cat companions.
no_use_columns = features.select_dtypes(['category']).columns
features = features.drop(columns=no_use_columns)
features_list = features.columns

# Hold out 25 % of the rows; the fixed seed keeps the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 0.25,
    random_state = 42,
)

# Report the shape of each part of the split.
for partName, part in [
    ('train_features', train_features),
    ('train_labels', train_labels),
    ('test_features', test_features),
    ('test_labels', test_labels),
]:
    print(partName + ': ', part.shape)

labels-shape: (59400,)
features-shape: (59400, 39)


# Try random forest


In [15]:
## Randomized hyper-parameter search for a random forest
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import pickle

##create random grid
# number of trees in forest
n_estimators = [int(x) for x in np.linspace(start = 1500, stop= 1900, num = 100)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 5, 6, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method for selecting samples for training each node
bootstrap = [True, False]
# max features ('auto' was only an alias of 'sqrt' for classifiers and is
# rejected by scikit-learn >= 1.3, so it is not listed)
max_features = ['log2', 'sqrt']

random_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'max_features': max_features,
              'bootstrap': bootstrap}

# Seed the forest as well so that repeated runs give the same result.
randomforest = RandomForestClassifier(random_state = 42)

randomforest_random = RandomizedSearchCV(estimator = randomforest, param_distributions=random_grid, n_iter = 100, cv= None, verbose= 20, random_state= 42, n_jobs = -1)
# Search on the training split only: fitting on all rows and then scoring on
# test_features would evaluate the model on data it has already seen.
randomforest_random.fit(train_features, train_labels)

# Show parameters
print('Best parameters: ',randomforest_random.best_params_)

# Show Accuracy on the held-out split
holdout_estimator = randomforest_random.best_estimator_
predictions = holdout_estimator.predict(test_features)
print ('Accuracy-Score of randomforest is: ', accuracy_score(test_labels, predictions))

# Model used for the competition submission: the winning configuration,
# refit on every labelled row now that the held-out score has been reported.
best_estimator = clone(holdout_estimator).fit(features, labels)




def competitionOutput(filenameIn: str, filenameOut : str ,estimator):
    """Predict the status group for a competition file and write a submission CSV.

    The input CSV is read once, passed through ``cleanData`` and
    ``createCatColumnsWithCodes``, stripped of the ``id`` column and of all
    categorical columns, and handed to ``estimator.predict``. The integer
    predictions 0/1/2 are translated to the label strings and written,
    together with the ids, to ``filenameOut``.

    Parameters
    ----------
    filenameIn : str
        Path of the CSV holding the competition features (must contain 'id').
    filenameOut : str
        Path of the CSV to create; columns are 'id' and 'status_group'.
    estimator
        Fitted model exposing ``predict(features)``.

    Returns
    -------
    None
    """
    # NOTE(review): frequency counts and category codes are derived from the
    # competition file itself, so they can differ from the encoding the
    # estimator was trained on -- confirm this is acceptable or pass the
    # training encoders in.

    # prepare competition features
    competition_features = pd.read_csv(filenameIn)
    prepared = cleanData(competition_features, ['funder', 'installer','wpt_name','basin'])
    prepared = createCatColumnsWithCodes(prepared)

    # Keep the ids of exactly the rows that are predicted, so ids and
    # predictions stay aligned without reading the file a second time.
    ids = prepared['id'].to_numpy()
    prepared = prepared.drop(columns=['id'])
    no_use_columns = prepared.select_dtypes(['category']).columns
    prepared = prepared.drop(columns=no_use_columns)

    predictions = estimator.predict(prepared)

    # Build the result as a fresh frame (no assignment on a slice).
    competition_result = pd.DataFrame({'id': ids, 'status_group': predictions})
    competition_result['status_group'] = competition_result['status_group'].replace([0,1,2],['functional','functional needs repair', 'non functional'])

    #create output csv
    competition_result.to_csv(filenameOut, index=False)

# Write the competition submission with the selected model
competitionOutput('702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv', 'predictionsV3.csv', best_estimator)


#store parameters in pkl-file
# The with-block closes the file even if pickling raises.
with open('parameters.pkl', 'wb') as file:
    pickle.dump(randomforest_random.best_params_, file)

# Predictions of the selected model on the held-out features
predictions = best_estimator.predict(test_features)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
