# PumpItUp
## DrivenDataCompetition PumpItUp
- All data is within the same folder as notebook

In [None]:
import pandas as pd
import matplotlib as plt
import sklearn
#import seaborn as sns
import numpy as np

In [None]:
# Read the feature values and the matching labels, then join them on 'id'
trainingValues, trainingLabels = (
    pd.read_csv(csvName)
    for csvName in (
        '4910797b-ee55-40a7-8668-10efd5c1b960.csv',
        '0bf8bc6e-30d0-4c50-956a-603fc693d966.csv',
    )
)
trainingData = trainingValues.merge(trainingLabels, on='id')
trainingData.head()

# Investigation and Cleansing of data
Possible data-issues
* population == 0 -> Maybe delete all rows where population is 0?
* num_private Not in feature-description -> Maybe delete column?
* object-typed columns need to be converted to numeric category codes
* Idea: funders and installers who fund and install regularly are less likely to create failing pumps than those who do it less often. Therefore the names are replaced with their number of occurrences. Same for basin, wpt_name, subvillage, scheme_name and scheme_management.

In [None]:
def cleanData(data : pd.DataFrame, columns : list):
    """Return a cleaned copy of the raw pump data.

    Steps, in order:
      * drop the 'num_private' column when it is present,
      * fill missing 'public_meeting' / 'permit' values with True and store
        both columns as real booleans,
      * for every column named in ``columns``: lower-case the text, map
        missing values, strings of at most two characters and the literal
        'none' to 'unknown', then replace each name by the number of times
        it occurs in that column,
      * split 'date_recorded' into 'year_recorded' and 'month_recorded'
        (object dtype) and drop the original date column.

    Args:
        data: frame containing at least 'public_meeting', 'permit',
            'date_recorded' (formatted as YYYY-MM-DD) and every column
            listed in ``columns``.
        columns: names of the text columns to replace by occurrence counts.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is never modified, even when
    # there is no 'num_private' column to drop.
    data = data.copy()

    # NOTE: rows with population == 0 are deliberately kept for now.
    if 'num_private' in data.columns:
        data = data.drop('num_private', axis = 1)

    # Fill with the boolean True, not the string 'True': a string would leave
    # three distinct values (True, False, 'True') of mixed type in the column.
    # NOTE(review): assumes these columns hold booleans/NaN as parsed by
    # read_csv - confirm if the input source changes.
    for flagColumn in ('public_meeting', 'permit'):
        data[flagColumn] = data[flagColumn].fillna(True).astype(bool)

    for columnName in columns:
        # Lower-case first so that 'Gov' and 'gov' are counted as one name
        # and 'None' / 'NONE' are recognised as placeholders.
        names = data[columnName].fillna('unknown').str.lower()
        isPlaceholder = (names.str.len() <= 2) | (names == 'none')
        names = names.mask(isPlaceholder, 'unknown')
        # NOTE: counts are computed on the frame that is passed in, so the
        # same name can map to different numbers in different data sets.
        data[columnName] = names.map(names.value_counts())

    # Replace the recording date by separate year and month columns.
    recorded = pd.to_datetime(data['date_recorded'], format='%Y-%m-%d')
    data['year_recorded'] = recorded.dt.year.astype('object')
    data['month_recorded'] = recorded.dt.month.astype('object')
    data = data.drop('date_recorded', axis = 1)

    return data
    
def createCatColumnsWithCodes(data : pd.DataFrame):
    """Add an integer-coded '<name>_cat' column for every object column.

    The object columns of ``data`` are converted to category dtype in place;
    the returned frame is ``data`` joined with one extra column per converted
    column that holds its category codes. When ``data`` has no object
    columns it is returned unchanged.
    """
    objectColumns = data.select_dtypes(['object']).columns
    if objectColumns.empty:
        return data

    data[objectColumns] = data[objectColumns].astype('category')
    # One code column per categorical column, aligned on the original index.
    codeColumns = pd.DataFrame(
        {name: data[name].cat.codes for name in objectColumns},
        index=data.index,
    )
    return data.join(codeColumns, rsuffix='_cat')


# Clean the merged data, then add integer-coded '_cat' columns for all
# remaining object columns (this includes the label column).
trainingData = createCatColumnsWithCodes(
    cleanData(
        trainingData,
        ['funder', 'installer','wpt_name','basin', 'subvillage','scheme_name', 'scheme_management'],
    )
)

# Keep a copy of the prepared data on disk
trainingData.to_csv('trainingData.csv')


In [None]:
# List the dtype of every column of trainingData
trainingData.dtypes

In [None]:
# Show the first 100 rows of the data frame
trainingData.head(100)

In [None]:
# Describe data
# NOTE: a notebook cell only renders its last expression, so this summary is
# computed but not displayed while the plotting code follows it.
trainingData.describe()

#check correlation
# `import matplotlib as plt` does not load the pyplot submodule by itself,
# so import it explicitly before `plt.pyplot` is used.
import matplotlib.pyplot

# Only numeric/boolean columns can be correlated; the category columns would
# make corr() fail on recent pandas versions.
correlation = trainingData.select_dtypes(include=['number', 'bool']).corr()

fig = plt.pyplot.gcf()
fig.set_size_inches(25,25)
ax = fig.gca()

# Draw the heatmap with matplotlib only (seaborn is not imported in this notebook).
heatmap = ax.imshow(correlation.to_numpy(), cmap='coolwarm')
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(range(len(correlation.columns)))
ax.set_xticklabels(correlation.columns, rotation=90)
ax.set_yticks(range(len(correlation.index)))
ax.set_yticklabels(correlation.index)

# Annotate every cell with its correlation coefficient
for rowPosition, rowValues in enumerate(correlation.to_numpy()):
    for columnPosition, coefficient in enumerate(rowValues):
        ax.text(columnPosition, rowPosition, format(coefficient, '.2g'), ha='center', va='center')

In [None]:
from sklearn.model_selection import train_test_split

# The integer-coded status column is the target; everything else except the
# row id and the raw category columns is used as a feature.
labels = trainingData['status_group_cat']
features = trainingData.drop(columns=['status_group_cat', 'id'])
no_use_columns = features.select_dtypes(['category']).columns
features = features.drop(columns=no_use_columns)
features_list = features.columns

# Hold back 25% of the rows for testing (fixed seed for reproducibility)
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size = 0.25,
    random_state = 42,
)

# Report the shape of each part of the split
for partTitle, part in (
    ('train_features: ', train_features),
    ('train_labels: ', train_labels),
    ('test_features: ', test_features),
    ('test_labels: ', test_labels),
):
    print(partTitle, part.shape)

# Try random forest


In [None]:
## Randomised hyper-parameter search for a random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import pickle

##create random grid
# number of trees in forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop= 2000, num = 100)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 5, 6, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4]
# Method for selecting samples for training each node
bootstrap = [True, False]
# max features
# ('auto' was only an alias of 'sqrt' for classifiers and is rejected by
# current scikit-learn releases, so it is no longer part of the grid)
max_features = ['log2', 'sqrt']

random_grid = {'n_estimators': n_estimators,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'max_features': max_features,
              'bootstrap': bootstrap}

randomforest = RandomForestClassifier()

randomforest_random = RandomizedSearchCV(estimator = randomforest, param_distributions=random_grid, n_iter = 100, cv= None, verbose= 20, random_state= 42, n_jobs = -1)
# Search on the training split only: fitting on all rows would let the model
# see the test rows and make the accuracy reported below meaningless.
randomforest_random.fit(train_features, train_labels)

# Show parameters
print('Best parameters: ',randomforest_random.best_params_)

# Show Accuracy on the held-out test split
predictions = randomforest_random.best_estimator_.predict(test_features)
print ('Accuracy-Score of randomforest is: ', accuracy_score(test_labels, predictions))

# Refit the winning configuration on all labelled rows; this is the model
# used for the competition predictions.
best_estimator = RandomForestClassifier(**randomforest_random.best_params_)
best_estimator.fit(features, labels)



In [None]:



def competitionOutput(filenameIn, filenameOut ,estimator):
    """Predict the pump status for a competition file and write the result.

    The input CSV is read once, prepared with ``cleanData`` and
    ``createCatColumnsWithCodes`` exactly like the training data, stripped of
    the 'id' column and of all category columns, and passed to ``estimator``.
    The output CSV contains the columns 'id' and 'status_group'.

    Args:
        filenameIn: path of the CSV file holding the competition features.
        filenameOut: path of the CSV file to create.
        estimator: fitted model offering ``predict``.

    Returns:
        None.
    """
    # Numeric prediction -> label text.
    # NOTE(review): assumes the model was trained on the category codes of
    # exactly these three labels in this order - confirm against the
    # training data.
    status_names = {0: 'functional', 1: 'functional needs repair', 2: 'non functional'}

    # prepare competition features
    competition_features = pd.read_csv(filenameIn)
    competition_features_prepared = cleanData(competition_features, ['funder', 'installer','wpt_name','basin', 'subvillage','scheme_name', 'scheme_management'])
    competition_features_prepared = createCatColumnsWithCodes(competition_features_prepared)

    # Take the ids from the prepared frame so they stay aligned with the rows
    # that are actually predicted.
    ids = competition_features_prepared['id']
    competition_features_prepared = competition_features_prepared.drop(['id'],axis=1)
    no_use_columns = competition_features_prepared.select_dtypes(['category']).columns
    competition_features_prepared = competition_features_prepared.drop(no_use_columns, axis=1)

    predictions = pd.Series(estimator.predict(competition_features_prepared), index=ids.index)

    # Build the result frame directly instead of assigning into a slice of
    # another frame (which triggers pandas' SettingWithCopy warning).
    competition_result = pd.DataFrame({
        'id': ids,
        'status_group': predictions.replace(status_names),
    })

    #create output csv
    competition_result.to_csv(filenameOut, index=False)
    return

# Write the competition predictions of the selected model
competitionOutput('702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv', 'predictionsV3.csv', best_estimator)


#store parameters in pkl-file
# The context manager closes the file even if pickling fails. The extra
# predict() call that used to sit between open() and close() only recomputed
# test-split predictions and has been removed.
with open('parameters.pkl', 'wb') as file:
    pickle.dump(randomforest_random.best_params_, file)