# Import Needed Libraries

In [22]:
# to import configured input and output paths
import configparser
# Dataframe building and manipulation
import pandas as pd
import numpy as np

# Vizualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# Importing Configuration
I like to switch between my desktop and laptop frequently. Thus, I am getting 
into the habit of creating a configuration file, config.ini. This file will
allow me to edit my paths in one location so that I don't have to constantly
update this in every notebook. This configuration file can be used for many 
purposes, such as saving usernames and passwords, and I can even add it to the
.gitignore file!

In [23]:
# import my config.ini file
config = configparser.ConfigParser()
config.read("../src/config.ini")

['../src/config.ini']

# Importing Given Data

In [24]:
# Dataframes we will use
train_labels = pd.read_csv(config['paths']['data_path'] + 'training_set_labels.csv')
train_vals = pd.read_csv(config['paths']['data_path'] + 'training_set_values.csv')

#For competition submission
test_vals = pd.read_csv(config['paths']['data_path'] + 'test_set_values.csv')
sub_form = pd.read_csv(config['paths']['data_path'] + 'SubmissionFormat.csv')

In [25]:
# Checking number of entries and features
print('train_vals:', train_vals.shape)
print('train_labels', train_labels.shape)
print('test_vals', test_vals.shape)

train_vals: (59400, 40)
train_labels (59400, 2)
test_vals (14850, 40)


# Data Cleaning

In [26]:
df = pd.merge(train_vals, train_labels,left_on='id', right_on='id',how='outer')

In [27]:
final_df = df[['amount_tsh', 'gps_height', 'longitude', 'latitude', 'population',
               'public_meeting', 'basin', 'region','scheme_management', 'permit',
               'extraction_type', 'management_group', 'payment', 'water_quality',
               'quality_group', 'quantity', 'source_type', 'waterpoint_type_group',
               'status_group']]

In [28]:
def to_other(entry):
    if entry in ['SWC', 'Trust', 'None']:
        return 'Other'
    if entry is None:
        return 'Other'
    else:
        return entry

In [29]:
final_df.scheme_management.fillna(value='Other', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [30]:
final_df.scheme_management = final_df.scheme_management.apply(to_other)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [31]:
final_df.public_meeting.fillna(value=0, inplace=True)

In [32]:
final_df.permit.fillna(value=0, inplace=True)

In [33]:
final_df.head()

Unnamed: 0,amount_tsh,gps_height,longitude,latitude,population,public_meeting,basin,region,scheme_management,permit,extraction_type,management_group,payment,water_quality,quality_group,quantity,source_type,waterpoint_type_group,status_group
0,6000.0,1390,34.938093,-9.856322,109,True,Lake Nyasa,Iringa,VWC,False,gravity,user-group,pay annually,soft,good,enough,spring,communal standpipe,functional
1,0.0,1399,34.698766,-2.147466,280,0,Lake Victoria,Mara,Other,True,gravity,user-group,never pay,soft,good,insufficient,rainwater harvesting,communal standpipe,functional
2,25.0,686,37.460664,-3.821329,250,True,Pangani,Manyara,VWC,True,gravity,user-group,pay per bucket,soft,good,enough,dam,communal standpipe,functional
3,0.0,263,38.486161,-11.155298,58,True,Ruvuma / Southern Coast,Mtwara,VWC,True,submersible,user-group,never pay,soft,good,dry,borehole,communal standpipe,non functional
4,0.0,0,31.130847,-1.825359,0,True,Lake Victoria,Kagera,Other,True,gravity,other,never pay,soft,good,seasonal,rainwater harvesting,communal standpipe,functional


In [34]:
final_df[['public_meeting', 'permit']] = final_df[['public_meeting', 'permit', ]].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [35]:
X = pd.get_dummies(final_df.iloc[:,:-1])
y = final_df.status_group

# Saving Transformed Data

In [36]:
X.to_csv(config['paths']['data_path'] + 'X.csv')
y.to_csv(config['paths']['data_path'] + 'y.csv')

# Importing Transformed Data

In [37]:
X = pd.read_csv(config['paths']['data_path'] + 'X.csv')
y = pd.read_csv(config['paths']['data_path'] + 'y.csv')