In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


from sklearn.model_selection import cross_val_score
import xgboost as xgb
import lightgbm as lgb

from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler, Normalizer, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder


from sklearn.model_selection import GridSearchCV

import seaborn as sns

import missingno as msno 

# 60-character horizontal rule printed around report sections.
dashes = '-' * 60

import warnings
warnings.filterwarnings("ignore")

### Introduction
According to the World Health Organization, one out of six people in Tanzania lacks access to safe drinking water. Women and children must walk long distances to find water.  In rural Tanzania people walk 2 to 3 km daily in search of water from public taps, where available, or natural streams.  They must carry heavy containers holding about 20 to 25 litres of water on their heads on each trip.  When they find water, there are often long lines at the tap or borehole, and people spend hours waiting for their turn.  The water shortage has been caused by population growth, high levels of consumption and climate change, which has reduced water resources.  Water shortages lead to poor sanitation, a lack of safe drinking water, and overcrowding at water sources. (Source: projectzawadi.org.)

The Tanzanian Ministry of Water is seeking to solve the ongoing water crisis in Tanzania by increasing the number of functioning water wells.  It is crucial to the health and safety of communities that its residents have access to drinking water.  Tanzania has a lot of water wells that are non-functioning or in need of repair.  Predictive modeling can be used to aid in solving this problem. <br> 
<BR>
My objective is to build classification models that will predict the operating status of water wells based on features in the dataset.  The data was gathered by Taarifa from the Tanzanian Ministry of Water and consists of over 59,400 data points of water well pumps in Tanzania and 40 features. The features describe the pumps, e.g., installer, install date, location and pump type.  The data also includes a target variable indicating the functional status of each pump.  The status of a well is identified as functioning, non-functioning, or functioning but in need of repairs.  
<BR>
In addition, I will provide an analysis and visualizations of the data to give insight into how various features impact the operating status of water wells, e.g., does the location of a well impact the likelihood that it will be functioning or non-functioning?  

## The Data

In [None]:
# Show every column when a wide frame is displayed (None lifts the column limit).
pd.set_option('display.max_columns', None)  
# The features and the labels are read from two separate CSV files.
df_1 = pd.read_csv('data/water_table.csv')
labels = pd.read_csv('data/water_table_labels.csv')
# The label column is attached by index alignment, not by a key join.
# NOTE(review): this assumes both files list the wells in the same row order —
# confirm, or merge on a shared identifier column instead.
df_1['target'] = labels['status_group']

In [None]:
# Encode the three status labels as integers, then discard the 'id' column.
status_codes = {'functional': 0, 'non functional': 1, 'functional needs repair': 2}
df_1['target'] = df_1['target'].map(status_codes)
df_1 = df_1.drop(columns='id')

In [None]:
df_1.head()

In [None]:
df_1.info()

In [None]:
df_1['date_recorded'].max()

## Data Cleaning

The following are issues with the data that need to be addressed.  These issues will be addressed by issue and feature.<br>
There are several features that have Null values.  We can impute the Null values with the most frequent value of the feature or with a placeholder category.<br>
There are several columns in which the value 0 makes up a significant part of the feature.<br>
Some features are duplicates of other features in the dataset.  We will not include duplicate features in the dataset.<br>
There are several features that have object values and numerous unique values.  These features will need to be One-Hot encoded for modeling purposes. 

### Null Values
The features funder, installer, subvillage, public_meeting, scheme_management, scheme_name and permit have object values and NaN values.  Because these features are not numeric, we will impute the NaN values with the most frequent value of the feature or with an explicit placeholder category.

In [None]:
df_1.isna().sum()

In [None]:
def count_value_status(data, feature):
    """Tabulate how often each distinct value of one column occurs.

    Args:
        data (pd.DataFrame): frame that holds the column to summarise.
        feature (str): name of the column in ``data``.

    Returns:
        pd.DataFrame: indexed by the distinct values (missing values get
        their own row), with the columns ``count`` (number of rows),
        ``per`` (fraction of all rows) and ``%`` (the same fraction as a
        percentage string rounded to two decimals).
    """
    # Read from the frame that was passed in; the previous version ignored
    # ``data`` and always summarised the notebook-level ``df_2``.
    count = data[feature].value_counts(dropna=False)
    # Derive the shares from the same counts so that missing values are part
    # of the denominator and every row of the table is filled consistently.
    percent = count / count.sum()
    percent100 = percent.mul(100).round(2).astype(str) + '%'
    return pd.DataFrame({'count': count, 'per': percent, '%': percent100})

In [None]:
df_2 = df_1.copy()

#### amount_tsh
This feature is defined as the total static head, or the amount of water available to the waterpoint. As shown below, 70% of the values of this feature are 0.  This is too large a share to impute.

In [None]:
count_value_status(df_2, 'amount_tsh')

### funder
This feature has 3635 NaN values.  There is already a value of '0', so we can group all NaN values with those of '0'.  In addition, the values of this feature are non-numeric and there are 1,897 unique values.  The feature will need to be One-Hot Encoded for modeling purposes.  To keep the number of categories manageable, we will group every value that occurs fewer than 100 times into a single category named 'other'. 

In [None]:
count_value_status(df_2, 'funder')

In [None]:
# Missing funders join the existing '0' category.
df_2['funder'] = df_2['funder'].fillna('0')

In [None]:
print(f"The number of unique categories in the feature funder are: {len(df_2['funder'].unique())}") 

In [None]:
# Collapse every funder that occurs fewer than 100 times into one 'other'
# bucket. Only the 'funder' column is touched, instead of routing the whole
# frame through DataFrame.apply just to rewrite a single column.
funder_counts = df_2['funder'].map(df_2['funder'].value_counts())
df_2['funder'] = df_2['funder'].mask(funder_counts < 100, 'other')

In [None]:
#pd.set_option('display.max_rows', None)
df_2['funder'].value_counts(dropna=False)

In [None]:
df_2['funder'].isna().sum()

#### gps_height 
This feature has a value of 0 which is 34.4% of the feature.   

In [None]:
count_value_status(df_2, 'gps_height') 

In [None]:
# Replace zero heights with the mean of the non-zero heights.
# NOTE(review): 0 is also a legitimate altitude (sea level) — confirm that
# every zero in this column really is a placeholder.
gps_height_mean = df_2.loc[df_2['gps_height'] != 0, 'gps_height'].mean()
# Assign the result back: calling replace(..., inplace=True) on a column
# selection is chained assignment and leaves df_2 untouched once pandas
# Copy-on-Write is active.
df_2['gps_height'] = df_2['gps_height'].replace(0, gps_height_mean)

In [None]:
count_value_status(df_2, 'gps_height') 

#### installer
This feature is defined as the organization that installed the well.

In [None]:
count_value_status(df_2, 'installer')

#### Longitude
This feature is defined as the GPS coordinate (longitude).  Some rows of this feature have a value of 0 (see the counts below).  I used the mean of the non-zero values to impute the 0 values.

In [None]:
count_value_status(df_2, 'longitude') 

In [None]:
# Zero longitudes are overwritten with the mean of the positive longitudes.
has_positive_longitude = df_2['longitude'] > 0
longitude_mean = df_2.loc[has_positive_longitude, 'longitude'].mean()
zero_longitude = df_2['longitude'] == 0.00
df_2.loc[zero_longitude, 'longitude'] = float(longitude_mean)

In [None]:
df_2['longitude'].value_counts(normalize=True)

#### Latitude
This feature is defined as the GPS coordinate.

In [None]:
#df_2['latitude'].value_counts(normalize=True)
count_value_status(df_2, 'latitude') 

In [None]:
# Tanzania lies entirely south of the equator, so valid latitudes are negative.
# Filtering on `> 0` therefore selected no rows and produced a NaN mean; the
# mean is now taken over every latitude that is not a zero placeholder.
# NOTE(review): the placeholder may be stored as a tiny non-zero number rather
# than exactly 0 — the tolerance below covers that case; confirm against the data.
latitude_missing = np.isclose(df_2['latitude'], 0, atol=1e-6)
latitude_mean = df_2.loc[~latitude_missing, 'latitude'].mean()
df_2.loc[latitude_missing, 'latitude'] = float(latitude_mean)

#### wpt_name and num_private
wpt_name is defined as the name of the well.  This feature is an object and has 37,400 unique values.  This would be too many to One-Hot Encode for modeling purposes, so it will not be included in the dataset.<br>

For num_private, 98.7% of the feature has a value of 0.  This is too many values to impute and will not be included in the dataset.  

In [None]:
df_2['wpt_name'].nunique()

In [None]:
count_value_status(df_2, 'num_private') 

#### basin
This feature is defined as the geographic water basin.

In [None]:
count_value_status(df_2, 'basin') 

In [None]:
"""
subvillage : Geographic location
region : Geographic location

region_code : Geographic location (coded)
district_code : Geographic location (coded)

"""
count_value_status(df_2, 'region_code') 

In [None]:
count_value_status(df_2, 'district_code') 

#### LGA and Ward
These features are both defined as the geographic location of the wells.  LGA has 125 unique values and ward has 2092 unique values.

In [None]:
"""
lga                    59400 non-null  object 
 15  ward  
"""
count_value_status(df_2, 'lga') 

In [None]:
len(count_value_status(df_2, 'ward')) 

In [None]:
# Collapse every LGA that occurs fewer than 100 times into one 'Other' bucket.
# Only the 'lga' column is touched, instead of routing the whole frame through
# DataFrame.apply just to rewrite a single column.
lga_counts = df_2['lga'].map(df_2['lga'].value_counts())
df_2['lga'] = df_2['lga'].mask(lga_counts < 100, 'Other')

In [None]:
count_value_status(df_2, 'lga') 

#### Population
population is defined as the population around the well.  36% of water wells have 0 population around them.


In [None]:
count_value_status(df_2, 'population') 

In [None]:
# Zero populations are overwritten with the (truncated) mean of the positive ones.
population_mean = df_2.loc[df_2['population'].gt(0), 'population'].mean()
zero_population = df_2['population'].eq(0)
df_2.loc[zero_population, 'population'] = int(population_mean)

In [None]:
count_value_status(df_2, 'population') 

#### public_meeting

In [None]:
count_value_status(df_2, 'public_meeting') 

In [None]:
# Fill missing public_meeting flags with the most frequent value.
public_meeting_mode = df_2["public_meeting"].mode()[0]
# Assign the result back: fillna(..., inplace=True) on a column selection is
# chained assignment and leaves df_2 untouched once pandas Copy-on-Write is active.
df_2["public_meeting"] = df_2["public_meeting"].fillna(public_meeting_mode)

In [None]:
# Turn the True/False flags into 1/0 integers.
df_2['public_meeting'] = df_2['public_meeting'].astype('int64')

In [None]:
count_value_status(df_2, 'public_meeting') 

#### scheme_management
scheme_management : Who operates the waterpoint
scheme_name : Who operates the waterpoint

In [None]:
count_value_status(df_2, 'scheme_management') 

In [None]:
# Rows without a scheme manager get an explicit placeholder category.
df_2['scheme_management'] = df_2['scheme_management'].fillna("Unknown")

In [None]:
count_value_status(df_2, 'scheme_management') 

In [None]:
"""
permit is defined as if the well is permitted or not.
There are 38852 NaN value.  We can impute NaN to the category with most values.  We need to als also convet the boleean values to 
numeric.

"""
count_value_status(df_2, 'permit') 

In [None]:
#m=df_2["permit"].mode()


In [None]:
# Fill missing permit flags with the most frequent value.
permit_mode = df_2["permit"].mode()[0]
# Assign the result back: fillna(..., inplace=True) on a column selection is
# chained assignment and leaves df_2 untouched once pandas Copy-on-Write is active.
df_2["permit"] = df_2["permit"].fillna(permit_mode)

In [None]:
count_value_status(df_2, 'permit') 

In [None]:
# Turn the True/False flags into 1/0 integers.
df_2['permit'] = df_2['permit'].astype('int64')

In [None]:
count_value_status(df_2, 'permit') 

#### construction_year
The feature construction_year is defined as the year the waterpoint was constructed.  Unfortunately, 34% of the feature is recorded as 0.  I will replace the 0 values in the construction_year column with the average construction year of the remaining rows. With the cleaned construction_year feature I will use feature engineering to create a pump age feature and an average population served per year feature.

In [None]:
count_value_status(df_2, 'construction_year') 

In [None]:
"""
X_test['construction_year'] = X_test['construction_year'].replace({0:1993})

mean_yr = df_2[df_2['construction_year']>0]['construction_year'].mean()
df_2.loc[df_2['construction_year']==0, 'construction_year'] = int(mean_yr)
"""
# Replace the placeholder year 0 with the average of the known years.
known_years = df_2.loc[df_2['construction_year'] != 0, 'construction_year']
# Round to a whole year so the column keeps holding calendar years (and the
# derived age stays a whole number) instead of a fractional mean.
construction_year_mean = int(round(known_years.mean()))
# Assign the result back: replace(..., inplace=True) on a column selection is
# chained assignment and leaves df_2 untouched once pandas Copy-on-Write is active.
df_2['construction_year'] = df_2['construction_year'].replace(0, construction_year_mean)

In [None]:
count_value_status(df_2, 'construction_year') 

#### extraction_type_class
The features extraction_type, extraction_type_group, extraction_type_class are defined as the kind of extraction the waterpoint uses.  I only included extraction_type_class in the dataset.

In [None]:
count_value_status(df_2, 'extraction_type_class') 

#### management
Both management and management_group are defined as how the waterpoints are managed.  I only included management in the dataset.

In [None]:
count_value_status(df_2, 'management') 

In [None]:
df_2.groupby(['management', 'management_group']).size()

#### payment
The features payment and payment_type are defined as what the water costs.  I only included payment in the dataset. 

In [None]:
"""
payment
payment-what the water costs
payment_type
"""
count_value_status(df_2, 'payment') 

#### quality_group
Both water_quality and quality_group are defined as the quality of the water.  I only included quality_group in the dataset.

In [None]:
"""
water_quality - the quality of water
quality_group - the quality of water
"""
count_value_status(df_2, 'quality_group') 

#### quantity_group

In [None]:
count_value_status(df_2, 'quantity_group') 

#### Source
The features source, source_type and source_class are defined as the source of the water.  The features are duplicates with some variations.  I included only source in the dataset.

In [None]:
count_value_status(df_2, 'source') 

#### waterpoint_type
Both features waterpoint_type and waterpoint_type_group have the same defintion and are defined as the kind of waterpoint.  I only included waterpoint_type in the dataset. 

In [None]:
count_value_status(df_2, 'waterpoint_type') 

In [None]:
df_4 = df_2.copy()

#### age
With feature engineering I created a feature to identify the age of the wells.

In [None]:
# features['construction_year'] = features['construction_year'].replace({0:1993})
# The year of the survey is taken from the first four characters of date_recorded.
recorded_year = df_2['date_recorded'].astype(str).str[:4].astype(int)
df_2['age'] = recorded_year - df_2['construction_year']
# Zeros are swapped for 1 in both operands before dividing.
# NOTE(review): a survey year earlier than the construction year yields a
# negative age and a negative ratio — confirm whether such rows exist.
population_nonzero = df_2['population'].replace({0:1})
age_nonzero = df_2['age'].replace({0:1})
df_2['pop_per_year'] = population_nonzero / age_nonzero

In [None]:
count_value_status(df_2, 'age') 

In [None]:
# X_test['age'] = X_test['date_recorded'].astype(str).str[:4].astype(int) - X_test['construction_year']
# X_test['pop/year'] = X_test['population'].replace({0:1}) / X_test['age'].replace({0:1})
df_2.head()

## Exploratory Data Analysis 

In [None]:
df_2.describe()

In [None]:
def feature_w_hue(df, col= None, hue_col=None, rot=None, figsize=None):
    """Draw a seaborn count plot of one column, optionally split by a hue column.

    Args:
        df (pd.DataFrame): data to plot.
        col (int or str): column whose values go on the x axis.
        hue_col (int or str, optional): column used to colour the bars. When
            omitted, a plain distribution of ``col`` is drawn.
        rot (float, optional): rotation applied to the x tick labels.
        figsize (tuple, optional): figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is drawn as a side effect.
    """
    fig, ax = plt.subplots(figsize=figsize)
    sns.countplot(data=df, x=col, hue=hue_col, ax=ax)

    label_font = {'weight': 'bold', 'size': 15}
    ax.set_ylabel('Counts', fontdict=label_font)
    ax.set_xlabel(col, fontdict=label_font)

    # str() keeps the title working for integer column labels, and the hue part
    # is only mentioned when a hue column was actually supplied (calling
    # .title() on the default None used to raise AttributeError).
    col_title = str(col).title()
    if hue_col is None:
        title = f'Distribution of {col_title}'
    else:
        title = f'How {col_title} relates to {str(hue_col).title()}'
    ax.set_title(title, fontdict=label_font)

    ax.set_xticklabels(ax.get_xticklabels(), fontdict={'rotation': rot, 'ha': 'right'})
    #ax.set_xticklabels(ticklabels)

In [None]:
def feature_distribution(data, col= None, ticklabels=None, figsize=None):
    """Print the value counts of one column and draw them as a count plot.

    Args:
        data (pd.DataFrame): data to summarise and plot.
        col (int or str): column to summarise.
        ticklabels (list of str, optional): replacement x tick labels, applied
            in the order in which the bars are drawn. When omitted, the labels
            chosen by seaborn are kept.
        figsize (tuple, optional): figure size forwarded to ``plt.subplots``.

    Returns:
        None. The counts are printed and the figure is drawn as side effects.
    """
    # Absolute counts (missing values included), then relative shares.
    print(data[col].value_counts(dropna=False))
    print('\n')
    print(round(data[col].value_counts(normalize=True),3))
    print('---'*15)

    fig, ax = plt.subplots(figsize=figsize)
    sns.countplot(data=data, x=col, ax=ax)

    label_font = {'weight': 'bold', 'size': 15}
    ax.set_ylabel('Counts', fontdict=label_font)
    ax.set_xlabel(col, fontdict=label_font)
    # str() keeps the title working for integer column labels.
    ax.set_title(f'Distribution of {str(col).title()}', fontdict=label_font)

    # Only override the tick labels when the caller supplied some; passing the
    # default None straight to set_xticklabels used to raise.
    if ticklabels is not None:
        ax.set_xticklabels(ticklabels)

#### Target
The target is imbalanced: category 0 makes up 54% of the target, category 1 makes up 38% and category 2 only 7%.  The imbalance could impact the model's performance, so it will be addressed further below. 

In [None]:
feature_distribution(df_1, col='target',ticklabels=['functional / 0','non-functional / 1','functional needs repair / 2'],figsize=(8,5))

#### district_code
The plot below shows that districts 1, 2, 3 and 4 have the highest number of functioning wells.  However, they also have a large number of non-functioning wells.  

In [None]:
feature_w_hue(df_2, col='district_code', hue_col='target',rot=40,figsize=(14,5))

#### funder
We were looking to see if the top funders give us some indication of the functionality of the wells.  'other' is the largest category, which isn't helpful because it's not specific. Government Of Tanzania is the next largest group, having funded 9084 wells.  A little over 4,000 of those wells are functioning but around 5,000 aren't functioning.  

In [None]:
df_3 = df_2.copy()

In [None]:
df_2['funder'].value_counts(dropna=False).head(20)


In [None]:
# Keep only the rows whose funder is one of the most frequent categories
# (the two catch-all buckets 'other' and '0' are included).
top_funders = [
    'other', 'Government Of Tanzania', '0', 'Danida', 'Hesawa', 'Rwssp',
    'World Bank', 'Kkkt', 'World Vision', 'Unicef', 'Tasaf',
]
funder_10 = df_3[df_3['funder'].isin(top_funders)]
funder_10.head()

In [None]:
feature_w_hue(funder_10, col='funder', hue_col='target',rot=40,figsize=(14,5))

#### Does location impact functionality of a well
The below plot is really helpful because it shows where the wells are located and their functionality.  In the southeast section there is a cluster of non-functioning wells.  Further research is warranted.  Is there any correlation with district, population?

In [None]:
# Map of the wells: one point per well, coloured by its status code.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df_2, x='longitude', y='latitude', hue='target', ax=ax)
ax.set_title('Water Well Location and Functionality')
ax.legend();

#### Does region_code impact functionality?
It's interesting to see which regions have the highest number of wells and their status.  Region 
11 has the highest number of wells: around 4,200 are functioning, around 1,000 are non-functioning and
around 100 are in need of repairs.  Region 17 has 3,000 functioning wells and 1,500 non-functioning wells.  
The plot gives us a quick summary of the number of wells in each region and their status.  It shows
that regions 8, 9, 40, 60, 80, 90 and 99 have very few wells and most are non-functioning.

In [None]:
feature_w_hue(df_2, col='region_code', hue_col='target',rot=40,figsize=(14,5))

### Does the quality of the water impact functionality
The plot shows that there are 6 kinds of water quality.  The majority of the wells fall under the category 
of good, and around 39,000 of them are functioning.  Around 18,000 are non-functioning.  If the well has good water quality there is a higher chance that it is functioning, but there is still a high probability that it's not working. 

In [None]:
feature_w_hue(df_2, col='quality_group', hue_col='target',rot=40,figsize=(14,5))

#### Does quantity of water impact well functionality?
The plot below shows that wells with enough water constitute the largest number of wells and have the 
highest functionality.  There are around 24,000 functioning wells with enough water, while around 9,000 wells with 
enough water are non-functioning.  Based on the plot, it looks like having enough water 
contributes to the functionality of the well.

In [None]:
feature_w_hue(df_2, col='quantity_group', hue_col='target',rot=40,figsize=(14,5))

In [None]:
sample = count_value_status(df_2, 'age').head(20)
sample

In [None]:
feature_w_hue(df_2, col='age', hue_col='target',rot=40,figsize=(14,5))

### Pre-processing of Data
The pre-processing of the data will involve One-Hot Encoding the features with object data types because
non-numeric values can't be input into the models.  In addition, this is a ternary classification problem because the target has three classification values: functional, non functional and functional needs repair. As displayed above, the value counts are not balanced.  This could impact the accuracy of the model's performance.  I applied the SMOTE method to the training set to resample it and get equal counts for each category. 

In [None]:
# Categorical (object) features that will be one-hot encoded.
cols_obj = [
    'funder', 'installer', 'basin', 'lga', 'scheme_management',
    'extraction_type_class', 'management', 'payment', 'quality_group',
    'quantity_group', 'source', 'waterpoint_type',
]
# Numeric features that are passed to the models as they are.
cols_num = [
    'gps_height', 'latitude', 'longitude', 'district_code', 'region_code',
    'permit', 'construction_year', 'population', 'age', 'pop_per_year',
]

df_3 = df_2.loc[:, cols_obj]
df_4 = df_2.loc[:, cols_num]

In [None]:
df_3.head()

In [None]:
df_4.describe()

#### One-Hot Encoding

In [None]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid perfectly collinear dummies.
# The column list must be passed as `columns=`: the second positional parameter
# of pd.get_dummies is `prefix`, so the earlier positional call only produced
# the right names because df_3 holds exactly these columns in this order.
one_hot_df = pd.get_dummies(df_3, columns=cols_obj, drop_first=True)

In [None]:
one_hot_df.head()

In [None]:
one_hot_df.shape

In [None]:
result = pd.concat([df_4, one_hot_df], axis=1)
result.head()

In [None]:
result.shape

In [None]:
 from sklearn.model_selection import train_test_split

In [None]:
X = result.copy()
y = df_2['target']

In [None]:
from imblearn.over_sampling import SMOTE

#### Train/Test Split 

In [None]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so the
# same split is produced on every run.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.preprocessing import MinMaxScaler

#### SMOTE

In [None]:
# Oversample the minority classes of the training split only; the test split
# keeps its original class distribution.
sm = SMOTE(random_state=42)
# fit_resample is the current imbalanced-learn API; the older fit_sample alias
# has been removed from the library.
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

In [None]:
print(X_train_resampled.shape)
print(y_train_resampled.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:

# Candidate estimators and the hyper-parameter grid searched for each one.
# The insertion order matters: later cells look the results up by row position.
knn_grid = {
    "n_neighbors": [3, 5, 7, 9, 11, 15, 19],
    "weights": ['uniform', 'distance'],
}
logistic_grid = {
    'fit_intercept': [False],
    'C': [1e12],
    'solver': ['liblinear'],
}

model_dict = {
    'KNN': {'model': KNeighborsClassifier(), 'params': knn_grid},
    'logisticR': {'model': LogisticRegression(), 'params': logistic_grid},
    # Random forest candidate, currently disabled:
    # 'random_forest': {
    #     'model': RandomForestClassifier(),
    #     'params': {
    #         'bootstrap': [True],
    #         'max_features': ['auto', 'sqrt'],
    #         'min_samples_split': [5, 10],
    #         'n_estimators': [100, 200],
    #         'class_weight': ['balanced'],
    #         'criterion': ['gini', 'entropy'],
    #     },
    # },
}

In [None]:

# Run a 5-fold grid search per candidate on the (un-resampled) training split
# and record the accuracy of the refitted best estimator on the test split.
scores = []
for model_name, spec in model_dict.items():
    clf = GridSearchCV(spec['model'], spec['params'], cv=5, return_train_score=False)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    scores.append({'model': model_name, 'accuracy': accuracy, 'best params': clf.best_params_})

# Show the full parameter dictionaries instead of a truncated column.
pd.set_option('display.max_colwidth', None)
gscv_models = pd.DataFrame(scores)
gscv_models

In [None]:
def eval_classification_model(X_train_resampled, X_test, y_train_resampled, y_pred,
                              normalize='true',cmap='Blues',figsize=[10,5], y_true=None):
    """Print a classification report and draw the confusion matrix for predictions.

    Args:
        X_train_resampled, X_test, y_train_resampled: accepted for
            compatibility with existing calls; not used by the evaluation.
        y_pred (array-like): predicted class codes for the evaluation rows.
        normalize ({'true', 'pred', 'all', None}): forwarded to
            ``confusion_matrix``; ``None`` shows raw counts.
        cmap (str): colour map of the heat map.
        figsize (sequence of two numbers): figure size of the heat map.
        y_true (array-like, optional): true class codes matching ``y_pred``.
            When omitted, the notebook-level ``y_test`` is used, which is what
            the function always read before this parameter existed.

    Returns:
        None. The report is printed and the figure is drawn as side effects.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook-level hold-out labels.
        y_true = y_test

    classes = ['0/Functional','1/Non-functioning','2/Needs Repair']

    # Classification report
    print(dashes)
    print("Classification Report")
    print(dashes)
    print(metrics.classification_report(y_true, y_pred, target_names=classes))
    print(dashes)
    print('\n')

    # Confusion matrix. normalize, cmap and figsize used to be accepted but
    # ignored; they are now honoured.
    fig, ax = plt.subplots(figsize=tuple(figsize))
    cm = confusion_matrix(y_true, y_pred, normalize=normalize)
    # Integer format for raw counts, two decimals for normalised shares.
    fmt = 'd' if normalize is None else '.2f'
    sns.heatmap(cm, annot=True, fmt=fmt, cmap=cmap,
                xticklabels=classes, yticklabels=classes, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

In [None]:
# Refit KNN with the parameters chosen by the grid search, this time on the
# SMOTE-resampled training data, and predict the untouched test split.
# Row 0 of gscv_models is the KNN entry only because 'KNN' is the first key of
# model_dict; reordering that dictionary would silently pick another model.
knn_params = gscv_models.iloc[0]['best params']
knn = KNeighborsClassifier(**knn_params)
model_k = knn.fit(X_train_resampled, y_train_resampled)
y_pred_k=model_k.predict(X_test)

In [None]:
# The function only prints and plots (it returns None), so its result is not
# bound to a name; binding it to `model` would shadow the fitted estimator.
eval_classification_model(X_train_resampled, X_test,
                          y_train_resampled, y_pred_k)

In [None]:
# Refit logistic regression with the parameters chosen by the grid search, this
# time on the SMOTE-resampled training data, and predict the untouched test split.
# Row 1 of gscv_models is the logistic entry only because 'logisticR' is the
# second key of model_dict; reordering that dictionary would silently pick
# another model.
logisticR_params = gscv_models.iloc[1]['best params']
logisticR = LogisticRegression(**logisticR_params)
# fit() returns the estimator itself, so `model` is the fitted LogisticRegression.
model = logisticR.fit(X_train_resampled, y_train_resampled)
y_pred=model.predict(X_test)

In [None]:
# The function only prints and plots (it returns None). Binding its result to
# `model` replaced the fitted logistic regression with None, which broke every
# later use of `model`.
eval_classification_model(X_train_resampled, X_test,
                          y_train_resampled, y_pred)

In [None]:
# rf_params = gscv_models.iloc[2]['best params']
# rf = RandomForestClassifier(**rf_params)
# model_rf = rf.fit(X_train_resampled, y_train_resampled)
# y_pred_rf=model_rf.predict(X_test)

In [None]:
# model_rf = eval_classification_model(X_train_resampled, X_test, 
#                  y_train_resampled, y_pred_rf)

In [None]:
df_data = df_2.drop('target',axis=1)

In [None]:
# Rank the features of the fitted logistic-regression model.
# Tree ensembles expose `feature_importances_`; linear models only expose
# `coef_`, with one row of coefficients per class, so the mean absolute
# coefficient across classes is used as the score in that case.
# NOTE(review): the features were not scaled before fitting, so coefficient
# magnitudes are only a rough indication of importance.
estimator = logisticR
if hasattr(estimator, 'feature_importances_'):
    importances = np.asarray(estimator.feature_importances_)
else:
    importances = np.abs(estimator.coef_).mean(axis=0)

# Label the bars with the columns the estimator was trained on (the one-hot
# encoded matrix X), not with the columns of the pre-encoding frame.
features = np.asarray(X.columns)

# The encoded matrix is very wide, so only the strongest features are drawn.
top_n = min(30, len(importances))
indices = np.argsort(importances)[-top_n:]

fig, ax = plt.subplots(figsize=(12, 10))
ax.set_title('Feature Importances')
ax.barh(range(top_n), importances[indices], color='b', align='center')
ax.set_yticks(range(top_n))
ax.set_yticklabels(features[indices])
ax.set_xlabel('Relative Importance')
plt.show()


In [None]:
#NO don't use
# param_grid = {'learning_rate': [0.075, 0.07],
#                       'max_depth': [6, 7],
#                       'min_samples_leaf': [7,8],
#                       'max_features': [1.0],
#                       'n_estimators':[100, 200]} 

# rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1,
#                             criterion= 'entropy',max_features= 'sqrt',
#                              min_samples_split= 10,class_weight='balanced')


In [None]:
#put is null values in unknown category
# df_2['installer'] = np.where(df_2['installer'].isnull(),"Unknown",df_2['installer'])

# df_2['installer'] = np.where(df_2['installer'].isnull(),"Unknown",df_2['installer'])

# amount_tsh_mean = df_2[df_2['amount_tsh']>0]['amount_tsh'].mean()
# df_2.loc[df_2['amount_tsh']==0, 'amount_tsh'] = int(amount_tsh_mean)

# #Matt Kirby
# features['construction_year'] = features['construction_year'].replace({0:1993})
# features['age'] = features['date_recorded'].astype(str).str[:4].astype(int) - features['construction_year']
# features['pop/year'] = features['population'].replace({0:1}) / features['age'].replace({0:1})

# features['water_/_person'] = features['amount_tsh'].replace({0:1}) / features['population'].replace({0:1})