# Import Needed Libraries

In [1]:
# To import configurations from config.ini files
import configparser
# For dataframe processes
import pandas as pd
import numpy as np

# For vizualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import plot_confusion_matrix

# To display all columns
pd.set_option('display.max_columns', None)

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

# Importing Configuration
I am getting into the habit of creating a configuration file, config.ini. 
This file will allow me to edit my paths in one location so that I don't have to 
constantly update paths in every single file. This configuration file can even
be used for other purposes, such as saving usernames and passwords, and I can 
even add it to the .gitignore file.

In [2]:
# import and read my config.ini file
config = configparser.ConfigParser()
config.read("../src/config.ini")

['../src/config.ini']

# Importing Given Data
I will import the test data again later on. This is because I figure out what 
dtypes I should cast each column. I then convert this to a dictionary that can
serve as an input in *pd.read_csv()*.

In [100]:
# Output
output = config['paths']['data_path']

In [3]:
# Import training and testing data
train_lbls = pd.read_csv(config['paths']['train_labels'])
train_df = pd.read_csv(config['paths']['train_data'])
test_df = pd.read_csv(config['paths']['test_data'])

# Import
sub_form = pd.read_csv(config['paths']['sub_form'])

Viewing the shapes and heads of each dataset

In [4]:
# Checking the shape of each dataframe
print('train_values:', train_df.shape)
print('train_labels', train_lbls.shape)
print('sub_form', sub_form.shape)

train_values: (59400, 40)
train_labels (59400, 2)
sub_form (14850, 2)


# Data Cleaning

In [5]:
train_df.drop_duplicates(inplace=True)
train_df.shape

(59400, 40)

In [6]:
train_df.index = train_df['id']
train_df.drop(columns='id', inplace=True)

In [7]:
train_lbls.index = train_lbls['id']
train_lbls.drop(columns='id', inplace=True)

I see there are null values in my data set, so I will take a closer look at them
to decide what I can do. I also see that there are only three dtypes: 
float64, int64, and object. My first thought would to be see if I can cast any to
another dtype. This is so I have columns in a more efficient dtype and to save memory. However, this may not be completely necessary considering the dataset is very small(~19 MB).


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   amount_tsh             59400 non-null  float64
 1   date_recorded          59400 non-null  object 
 2   funder                 55765 non-null  object 
 3   gps_height             59400 non-null  int64  
 4   installer              55745 non-null  object 
 5   longitude              59400 non-null  float64
 6   latitude               59400 non-null  float64
 7   wpt_name               59400 non-null  object 
 8   num_private            59400 non-null  int64  
 9   basin                  59400 non-null  object 
 10  subvillage             59029 non-null  object 
 11  region                 59400 non-null  object 
 12  region_code            59400 non-null  int64  
 13  district_code          59400 non-null  int64  
 14  lga                    59400 non-null  object 
 15

Looks like I can easily cast *amount_tsh* to an int

These int64's can definitely be casted down

I can cast *date_recorded* to a datetime dtype instead of object. I also see a
few columns that I can cast to booleans as well.

In [9]:
def memory_reduction(df):
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type not in [object, 'datetime64']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            if col == 'date_recorded':
                df[col] = pd.to_datetime(df[col])
            elif col_type == object:
                df[col] = df[col].astype('category')
            elif col in ['permit', 'public_meeting']:
                df[col] = df[col].astype(int)
            elif col == 'amount_tsh':
                df[col] = df[col].astype(np.int32)
            elif col == 'scheme_name':
                df[col] = df[col].apply(lambda x: str(x))
            else:
                continue

In [10]:
memory_reduction(train_df)

The training data now takes about 1/3rd of the space from before

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59400 entries, 69572 to 26348
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   amount_tsh             59400 non-null  float32       
 1   date_recorded          59400 non-null  datetime64[ns]
 2   funder                 55765 non-null  category      
 3   gps_height             59400 non-null  int16         
 4   installer              55745 non-null  category      
 5   longitude              59400 non-null  float16       
 6   latitude               59400 non-null  float16       
 7   wpt_name               59400 non-null  category      
 8   num_private            59400 non-null  int16         
 9   basin                  59400 non-null  category      
 10  subvillage             59029 non-null  category      
 11  region                 59400 non-null  category      
 12  region_code            59400 non-null  int8          
 1

I used the training dataset dtypes to create a dtype dictionary. Thus, I can 
import the testing dataset directly as the specified dtypes. I only exclude 
*date_recorded* since it has an error when importing. The work around was to
simply import the column as default and then apply the datetime transformation.

In [12]:
train_df.shape

(59400, 39)

In [13]:
dtype_dict = train_df.dtypes.apply(lambda c: c.name).to_dict()
# Don't need these columns of will cast them separately
del dtype_dict['date_recorded']
del dtype_dict['scheme_name']
del dtype_dict['public_meeting']
del dtype_dict['permit']

In [14]:
test_df = pd.read_csv(config['paths']['test_data'], dtype=dtype_dict)
# test_df['date_recorded'] = pd.to_datetime(test_df['date_recorded'])

In [15]:
test_df.index = test_df['id']

In [16]:
test_df.drop_duplicates(inplace=True)
test_df.drop(columns='id', inplace=True)
test_df.shape

(14850, 39)

In [17]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14850 entries, 50785 to 68707
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   amount_tsh             14850 non-null  float32 
 1   date_recorded          14850 non-null  object  
 2   funder                 13981 non-null  category
 3   gps_height             14850 non-null  int16   
 4   installer              13973 non-null  category
 5   longitude              14850 non-null  float16 
 6   latitude               14850 non-null  float16 
 7   wpt_name               14850 non-null  category
 8   num_private            14850 non-null  int16   
 9   basin                  14850 non-null  category
 10  subvillage             14751 non-null  category
 11  region                 14850 non-null  category
 12  region_code            14850 non-null  int8    
 13  district_code          14850 non-null  int8    
 14  lga                    14850 non-n

# Feature Engineering + Null Values
Now I will handle missing values in the data and transform/engineer some colums.
Luckily for me, missing values are in the same columns for training and testing sets.

In [18]:
# append column to a list if there are any nulls in the column
null_cols = [[c,train_df[c].isnull().sum()/train_df.shape[0]] for c in train_df.columns if train_df[c].isnull().any()]
# show columns. They all appear to be categorical or binomial
null_cols

[['funder', 0.0611952861952862],
 ['installer', 0.061531986531986535],
 ['subvillage', 0.0062457912457912455],
 ['public_meeting', 0.05612794612794613],
 ['scheme_management', 0.06526936026936027],
 ['scheme_name', 0.4741750841750842],
 ['permit', 0.05144781144781145]]

In [19]:
# append column to a list if there are any nulls in the column
null_cols_test = [[c,test_df[c].isnull().sum()/test_df.shape[0]] for c in test_df.columns if test_df[c].isnull().any()]
# show columns. They all appear to be categorical or binomial
null_cols_test

[['funder', 0.05851851851851852],
 ['installer', 0.05905723905723906],
 ['subvillage', 0.006666666666666667],
 ['public_meeting', 0.055286195286195286],
 ['scheme_management', 0.06525252525252526],
 ['scheme_name', 0.4775757575757576],
 ['permit', 0.04962962962962963]]

In [20]:
train_df.drop(columns=['scheme_name', 'date_recorded'], inplace=True)

In [21]:
test_df.drop(columns=['scheme_name', 'date_recorded'], inplace=True)

In [22]:
for col in ['funder', 'installer', 'subvillage', 'scheme_management']:
    if col=='scheme_management':
        train_df[col] = train_df[col].cat.add_categories('Unknown')
        test_df[col] = test_df[col].cat.add_categories('Unknown')
    else:
        train_df[col] = train_df[col].cat.add_categories('Other')
        test_df[col] = test_df[col].cat.add_categories('Other')

In [23]:

train_df.fillna({'funder':'Other',
           'installer': 'Other',
           'subvillage': 'Other', 
           'public_meeting': False,
           'scheme_management': 'Unknown',
           'permit': False}
           , inplace=True)

In [24]:
test_df.fillna({'funder':'Other',
           'installer': 'Other',
           'subvillage': 'Other', 
           'public_meeting': False,
           'scheme_management': 'Unknown',
           'permit': False}
           , inplace=True)

In [25]:
train_df.isna().any().sum()

0

In [26]:
test_df.isna().any().sum()

0

In [27]:
train_df[['funder', 'installer', 'subvillage', 'public_meeting', 'scheme_management',
        'permit']]

Unnamed: 0_level_0,funder,installer,subvillage,public_meeting,scheme_management,permit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
69572,Roman,Roman,Mnyusi B,True,VWC,False
8776,Grumeti,GRUMETI,Nyamara,False,Other,True
34310,Lottery Club,World vision,Majengo,True,VWC,True
67743,Unicef,UNICEF,Mahakamani,True,VWC,True
19728,Action In A,Artisan,Kyanyamisa,True,Unknown,True
...,...,...,...,...,...,...
60739,Germany Republi,CES,Kiduruni,True,Water Board,True
27263,Cefa-njombe,Cefa,Igumbilo,True,VWC,True
37057,Other,Other,Madungulu,True,VWC,False
31282,Malec,Musa,Mwinyi,True,VWC,True


In [28]:
train_df.shape

(59400, 37)

In [29]:
cats = train_df.select_dtypes(include='category')
nums = train_df.select_dtypes(exclude='category')


In [30]:
cat_dict = {}
for col in cats:
    cat_dict[col] = len(cats[col].value_counts())

In [31]:
category_counts = {k: v for k, v in sorted(cat_dict.items(), key=lambda item: item[1], reverse=True)}
category_counts

{'wpt_name': 37400,
 'subvillage': 19288,
 'installer': 2146,
 'ward': 2092,
 'funder': 1898,
 'lga': 125,
 'region': 21,
 'extraction_type': 18,
 'scheme_management': 13,
 'extraction_type_group': 13,
 'management': 12,
 'source': 10,
 'basin': 9,
 'water_quality': 8,
 'extraction_type_class': 7,
 'payment': 7,
 'payment_type': 7,
 'source_type': 7,
 'waterpoint_type': 7,
 'quality_group': 6,
 'waterpoint_type_group': 6,
 'management_group': 5,
 'quantity': 5,
 'quantity_group': 5,
 'source_class': 3,
 'public_meeting': 2,
 'permit': 2,
 'recorded_by': 1}

In [32]:
for col in cats.columns:
    print(col)
    le = LabelEncoder()
    cats[col] = le.fit_transform(cats[col])


funder
installer
wpt_name
basin
subvillage
region
lga
ward
public_meeting
recorded_by
scheme_management
permit
extraction_type
extraction_type_group
extraction_type_class
management
management_group
payment
payment_type
water_quality
quality_group
quantity
quantity_group
source
source_type
source_class
waterpoint_type
waterpoint_type_group


In [33]:
train_df_exp = cats.merge(nums, left_index=True, right_index=True)

In [34]:
rf = RandomForestClassifier(random_state=42)

In [35]:
rf.fit(train_df_exp, train_lbls)

  rf.fit(train_df_exp, train_lbls)


In [36]:
importances = dict(zip(train_df_exp.columns, rf.feature_importances_))

In [37]:
{k: v for k, v in sorted(importances.items(), key=lambda item: item[1], reverse=True)}

{'latitude': 0.08181526878002009,
 'wpt_name': 0.0760373661646006,
 'quantity': 0.07365315338001982,
 'subvillage': 0.06540170991187305,
 'quantity_group': 0.06431780544622547,
 'longitude': 0.06141382401797456,
 'gps_height': 0.05483337193156758,
 'ward': 0.04162021020133068,
 'construction_year': 0.040097765868670504,
 'population': 0.03714835116065888,
 'waterpoint_type': 0.03503917149503383,
 'funder': 0.03450521455694876,
 'waterpoint_type_group': 0.029306038314549396,
 'installer': 0.027228593454135867,
 'lga': 0.024032228779419507,
 'extraction_type_class': 0.022020162098608673,
 'amount_tsh': 0.019323809264428064,
 'payment': 0.017621442087186683,
 'extraction_type_group': 0.01689864749420747,
 'district_code': 0.015827429370687814,
 'extraction_type': 0.015396771387403837,
 'region': 0.01473103296745933,
 'payment_type': 0.014655976757191833,
 'region_code': 0.01407618330886256,
 'source': 0.01348145486684026,
 'management': 0.013267788945915808,
 'scheme_management': 0.012990

In [38]:
corr_matrix = train_df_exp.corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
upper


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


Unnamed: 0,funder,installer,wpt_name,basin,subvillage,region,lga,ward,public_meeting,recorded_by,scheme_management,permit,extraction_type,extraction_type_group,extraction_type_class,management,management_group,payment,payment_type,water_quality,quality_group,quantity,quantity_group,source,source_type,source_class,waterpoint_type,waterpoint_type_group,amount_tsh,gps_height,longitude,latitude,num_private,region_code,district_code,population,construction_year
funder,,0.547373,0.001516,0.132456,0.021835,0.150219,0.029671,0.01164,0.029277,,0.009799,0.057965,0.022661,0.010749,0.024928,0.051476,0.026953,0.039548,0.048716,0.034334,0.024917,0.044186,0.044186,0.085366,0.092872,0.041615,0.05006,0.035129,0.003413,0.013124,0.03743,0.027201,0.010734,0.017316,0.018477,0.01024,0.099034
installer,,,0.001714,0.102048,0.024565,0.141996,0.13036,0.002677,0.004935,,0.02632,0.049227,0.003649,0.018917,0.057793,0.017739,0.042735,0.00443,0.022377,0.015132,0.008955,0.054181,0.054181,0.10124,0.09494,0.058344,0.021476,0.011787,0.010294,0.00666,0.00965,0.00158,0.01355,0.018703,0.023926,0.01189,0.039931
wpt_name,,,,0.01678,0.091068,0.043538,0.037952,0.00052,0.01877,,0.065432,0.0511,0.003091,0.000737,0.019227,0.051926,0.025429,0.047708,0.056838,0.01415,0.017761,0.020468,0.020468,0.001761,0.000277,0.013852,0.026533,0.023626,0.014664,0.055669,0.017383,0.062242,0.0027,0.011861,0.006808,0.002178,0.00834
basin,,,,,0.057696,0.11488,0.006718,0.043157,0.017717,,0.05674,0.134764,0.16745,0.189307,0.177939,0.061139,0.068413,0.017788,0.029961,0.075529,0.044721,0.026707,0.026707,0.061457,0.074808,0.031636,0.009864,0.013518,0.013977,0.161768,0.216101,0.218504,0.018325,0.139887,0.189692,0.066899,0.268151
subvillage,,,,,,0.011206,0.009567,0.06601,0.030165,,0.009136,0.03246,0.011332,0.005237,0.017299,0.003358,0.003202,0.021207,0.022473,0.002076,0.013975,0.006003,0.006003,0.01999,0.025664,0.012342,0.017225,0.019984,2.8e-05,0.003936,0.009325,0.007442,0.011122,0.018479,0.023079,0.010631,0.046751
region,,,,,,,0.193919,0.051451,0.109037,,0.029334,0.04982,0.217222,0.233877,0.210724,0.076552,0.023112,0.054115,0.056709,0.073382,0.129883,0.034766,0.034766,0.156225,0.140439,0.010899,0.279421,0.257183,0.019866,0.323833,0.142766,0.025485,0.037351,0.109343,0.021688,0.004416,0.141681
lga,,,,,,,,0.061165,0.017387,,0.05213,0.080051,0.024245,0.007504,0.002427,0.082065,0.012648,0.110791,0.182848,0.016568,0.030434,0.014416,0.014416,0.065904,0.061295,0.02199,0.030154,0.03376,0.010315,0.137375,0.227476,0.268501,0.000843,0.042477,0.110921,0.004088,0.133283
ward,,,,,,,,,0.006094,,0.006398,0.030837,0.002728,0.004033,0.024567,0.002292,0.0456,0.002814,0.003249,0.011279,0.001478,0.001793,0.001793,0.010146,0.021475,0.030208,0.002778,0.004128,9e-05,0.043386,0.037396,0.00759,0.014877,0.029512,0.044599,0.029582,0.078459
public_meeting,,,,,,,,,,,0.156408,0.140349,0.078362,0.098466,0.119903,0.09726,0.218954,0.094398,0.203768,0.025907,0.079779,0.068698,0.068698,0.071273,0.050397,0.019172,0.087268,0.090392,0.025683,0.032375,0.078823,0.034384,0.01123,0.044806,0.023855,0.000398,0.00749
recorded_by,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [39]:
# Find features with correlation greater than 0.95
potential_drops = [column for column in upper.columns if any(upper[column] > 0.6)]
potential_drops
# Drop features 


['extraction_type_group',
 'extraction_type_class',
 'management',
 'payment_type',
 'quantity_group',
 'source_type',
 'waterpoint_type_group',
 'district_code',
 'construction_year']

These are columns that I'm dropping based on correlation and feature importance.
I am including some of the features with low importance to see how they impact
the final model.

In [40]:
drop_final = ['extraction_type', 'extraction_type_group',
              'management_group',
              'payment_type',
              'quantity_group',
              'source_type','source_class', 
              'waterpoint_type_group',
              'district_code', 
              'construction_year',
              'num_private',
              'recorded_by']

In [41]:
df_final = train_df.drop(columns=drop_final)

In [42]:
test_df = test_df.drop(columns=drop_final)

In [43]:
category_counts

{'wpt_name': 37400,
 'subvillage': 19288,
 'installer': 2146,
 'ward': 2092,
 'funder': 1898,
 'lga': 125,
 'region': 21,
 'extraction_type': 18,
 'scheme_management': 13,
 'extraction_type_group': 13,
 'management': 12,
 'source': 10,
 'basin': 9,
 'water_quality': 8,
 'extraction_type_class': 7,
 'payment': 7,
 'payment_type': 7,
 'source_type': 7,
 'waterpoint_type': 7,
 'quality_group': 6,
 'waterpoint_type_group': 6,
 'management_group': 5,
 'quantity': 5,
 'quantity_group': 5,
 'source_class': 3,
 'public_meeting': 2,
 'permit': 2,
 'recorded_by': 1}

In [44]:
def lower_features(df, df2):
    for col in ['wpt_name', 'subvillage', 'installer', 'ward', 'funder', 'lga']:
        temp = df[col].value_counts().head(20).keys()
        df[col] = df[col].apply(lambda x: 'Other' if x not in temp else x)
        df2[col] = df2[col].apply(lambda x: 'Other' if x not in temp else x)

In [45]:
lower_features(df_final, test_df)

In [46]:
df_final[['public_meeting', 'permit']] = df_final[['public_meeting', 'permit']].astype('boolean')

In [47]:
df_final[['funder', 'installer', 'wpt_name', 'subvillage', 'lga', 'ward']] = df_final[
    ['funder', 'installer', 'wpt_name', 'subvillage', 'lga', 'ward']].astype('category')

In [48]:
test_df[['funder', 'installer', 'wpt_name', 'subvillage', 'lga', 'ward']] = test_df[
    ['funder', 'installer', 'wpt_name', 'subvillage', 'lga', 'ward']].astype('category')

In [49]:
df_final.shape

(59400, 25)

In [50]:
cats = df_final.select_dtypes(include='category')
nums = df_final.select_dtypes(exclude='category')

In [51]:
cats_test = test_df.select_dtypes(include='category')
nums_test = test_df.select_dtypes(exclude='category')

In [52]:
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(cats)

In [84]:
train_ohe = pd.DataFrame(ohe.transform(cats).toarray(), columns = ohe.get_feature_names(), index=cats.index)
test_ohe = pd.DataFrame(ohe.transform(cats_test).toarray(), columns = ohe.get_feature_names(), index = cats_test.index)



In [85]:
train_ohe.shape

(59400, 228)

In [86]:
nums.shape

(59400, 8)

In [87]:
test_ohe.shape

(14850, 228)

In [88]:
nums_test.shape

(14850, 8)

In [89]:
df_final = nums.merge(train_ohe, left_index=True, right_index=True, how='inner')
test_final = nums_test.merge(test_ohe, left_index=True, right_index=True, how='inner')

In [113]:
df_final.to_csv(output+'training_set_cleaned.csv')
test_final.to_csv(output+'testing_set_cleaned.csv')
train_lbls.to_csv(output+'train_lbls_cleaned.csv')

In [90]:
df_final.shape

(59400, 236)

In [91]:
train_ohe

Unnamed: 0_level_0,x0_0,x0_Danida,x0_Dhv,x0_District Council,x0_Dwsp,x0_Germany Republi,x0_Government Of Tanzania,x0_Hesawa,x0_Kkkt,x0_Ministry Of Water,x0_Norad,x0_Other,x0_Private Individual,x0_Rwssp,x0_Tasaf,x0_Tcrs,x0_Unicef,x0_Water,x0_World Bank,x0_World Vision,x1_0,x1_CES,x1_Central government,x1_Commu,x1_Community,x1_DANID,x1_DANIDA,x1_DWE,x1_District Council,x1_Government,x1_HESAWA,x1_Hesawa,x1_KKKT,x1_LGA,x1_Other,x1_RWE,x1_TASAF,x1_TCRS,x1_WEDECO,x1_World vision,x2_Bombani,x2_Hospital,x2_Kanisani,x2_Kituo Cha Afya,x2_Madukani,x2_Mbugani,x2_Mkombozi,x2_Mkuyuni,x2_Msikitini,x2_Muungano,x2_Ofisini,x2_Other,x2_School,x2_Sekondari,x2_Shule,x2_Shule Ya Msingi,x2_Shuleni,x2_Sokoni,x2_Upendo,x2_Zahanati,x2_none,x3_Internal,x3_Lake Nyasa,x3_Lake Rukwa,x3_Lake Tanganyika,x3_Lake Victoria,x3_Pangani,x3_Rufiji,x3_Ruvuma / Southern Coast,x3_Wami / Ruvu,x4_1,x4_I,x4_Kanisani,x4_Kati,x4_Kibaoni,x4_M,x4_Madukani,x4_Majengo,x4_Mapinduzi,x4_Mbuyuni,x4_Miembeni,x4_Mjimwema,x4_Mlimani,x4_Msikitini,x4_Mtakuja,x4_Muungano,x4_Other,x4_Shuleni,x4_Sokoni,x4_Songambele,x5_Arusha,x5_Dar es Salaam,x5_Dodoma,x5_Iringa,x5_Kagera,x5_Kigoma,x5_Kilimanjaro,x5_Lindi,x5_Manyara,x5_Mara,x5_Mbeya,x5_Morogoro,x5_Mtwara,x5_Mwanza,x5_Pwani,x5_Rukwa,x5_Ruvuma,x5_Shinyanga,x5_Singida,x5_Tabora,x5_Tanga,x6_Arusha Rural,x6_Bagamoyo,x6_Bariadi,x6_Kahama,x6_Karagwe,x6_Kasulu,x6_Kibondo,x6_Kigoma Rural,x6_Kilombero,x6_Kilosa,x6_Kyela,x6_Magu,x6_Maswa,x6_Mbozi,x6_Meru,x6_Moshi Rural,x6_Njombe,x6_Other,x6_Rungwe,x6_Same,x6_Singida Rural,x7_Chalinze,x7_Chanika,x7_Igosi,x7_Imalinyi,x7_Itete,x7_Kitunda,x7_Maji ya Chai,x7_Maramba,x7_Matola,x7_Mdandu,x7_Mishamo,x7_Msindo,x7_Mtwango,x7_Nduruma,x7_Ngarenanyuki,x7_Other,x7_Siha Kati,x7_Usuka,x7_Vikindu,x7_Wanging'ombe,x7_Zinga/Ikerege,x8_Company,x8_None,x8_Other,x8_Parastatal,x8_Private operator,x8_SWC,x8_Trust,x8_Unknown,x8_VWC,x8_WUA,x8_WUG,x8_Water Board,x8_Water authority,x9_gravity,x9_handpump,x9_motorpump,x9_other,x9_rope pump,x9_submersible,x9_wind-powered,x10_company,x10_other,x10_other - school,x10_parastatal,x10_private operator,x10_trust,x10_unknown,x10_vwc,x10_water authority,x10_water board,x10_wua,x10_wug,x11_never pay,x11_other,x11_pay annually,x11_pay monthly,x11_pay per bucket,x11_pay when scheme fails,x11_unknown,x12_coloured,x12_fluoride,x12_fluoride abandoned,x12_milky,x12_salty,x12_salty abandoned,x12_soft,x12_unknown,x13_colored,x13_fluoride,x13_good,x13_milky,x13_salty,x13_unknown,x14_dry,x14_enough,x14_insufficient,x14_seasonal,x14_unknown,x15_dam,x15_hand dtw,x15_lake,x15_machine dbh,x15_other,x15_rainwater harvesting,x15_river,x15_shallow well,x15_spring,x15_unknown,x16_cattle trough,x16_communal standpipe,x16_communal standpipe multiple,x16_dam,x16_hand pump,x16_improved spring,x16_other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1
69572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
34310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
67743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
19728,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60739,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
27263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
37057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
31282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [92]:
rf = RandomForestClassifier(random_state=42)

In [93]:
rf.fit(df_final, train_lbls)

  rf.fit(df_final, train_lbls)


In [94]:
preds = rf.predict(test_final)

In [97]:
sub_form['status_group'] = preds

In [102]:
import datetime as dt

In [108]:
dt.datetime.now().strftime("%d%m%Y_%I%M%p")

'24072022_1152PM'

In [110]:
current_time = dt.datetime.now().strftime("%d%m%Y_%I%M%p")
sub_form.to_csv(output+ 'my_submission'+current_time+'.csv',index=False)

In [115]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [112]:
ab = AdaBoostClassifier(random_state=42)
ab.fit(df_final, train_lbls)
preds = ab.predict(test_final)
sub_form['status_group'] = preds
dt.datetime.now().strftime("%d%m%Y_%I%M%p")
current_time = dt.datetime.now().strftime("%d%m%Y_%I%M%p")
sub_form.to_csv(output+ 'my_submission'+current_time+'.csv',index=False)

  y = column_or_1d(y, warn=True)


In [116]:
dectree = DecisionTreeClassifier(random_state=42)
dectree.fit(df_final, train_lbls)
preds = dectree.predict(test_final)
sub_form['status_group'] = preds
dt.datetime.now().strftime("%d%m%Y_%I%M%p")
current_time = dt.datetime.now().strftime("%d%m%Y_%I%M%p")
sub_form.to_csv(output+ 'my_submission'+current_time+'.csv',index=False)