# Predicting the condition of water wells in Tanzania

<img src="./images/water_fill.jpg" 
     align="left" 
     width="400" />

### Data Preparation

In [None]:
#Imports
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from scipy.stats import chi2_contingency
from scipy.stats import chi2

In [None]:
# Read the three CSV inputs from the Data/ directory into DataFrames.
training_set_values = pd.read_csv('Data/training_set_values.csv')
training_set_labels = pd.read_csv('Data/training_set_labels.csv')
test_set_values = pd.read_csv('Data/test_set_values.csv')

#### Merging training set values and training set labels 

In [3]:
# Attach the labels to the feature rows via the shared 'id' key.
# An inner join keeps only ids that are present in both frames.
training_all = training_set_values.merge(training_set_labels, how='inner', on='id')

NameError: name 'pd' is not defined

In [4]:
# Print the column names, dtypes and non-null counts of the merged frame.
training_all.info()

NameError: name 'training_all' is not defined

### Data Cleaning - dealing with N/A values and removing unnecessary data columns

In [None]:
# Preview the first rows of the merged frame before cleaning.
training_all.head()

In [None]:
# 'num_private' is removed because the data's source website gives no
# description of what the column means.
training_all.drop('num_private', axis='columns', inplace=True)

In [None]:
# Normalise a few variant spellings in the 'installer' column so they are
# counted as the same installer.
installer_spelling_fixes = {
    'Hesawa': 'HESAWA',
    'DANID': 'DANIDA',
    'Commu': "Community",
}
training_all['installer'] = training_all['installer'].replace(installer_spelling_fixes)

In [None]:
# Give some columns more intuitive names. The renames are applied one at a
# time, in this order, because the order matters for the payment columns.
# NOTE(review): 'payment' is renamed to 'payment_type' and then every
# 'payment_type' column is renamed to 'frequency_of_payment'. If the raw data
# already contains a 'payment_type' column, this leaves two columns with the
# same name -- confirm against the CSV header.
column_renames = (
    ('gps_height', 'well_altitude'),
    ('wpt_name', 'water_point_name'),
    ('payment', 'payment_type'),
    ('payment_type', 'frequency_of_payment'),
    ('funder', 'funding_source'),
)
for old_name, new_name in column_renames:
    training_all.rename(columns={old_name: new_name}, inplace=True)

In [None]:
# Reduce the target to fewer classes: wells labelled
# 'functional needs repair' are relabelled as 'functional'.
to_replace = ['functional needs repair']
new_value = 'functional'
status_relabel = {old_label: new_value for old_label in to_replace}
training_all['status_group'] = training_all['status_group'].replace(status_relabel)

In [None]:
# Keep only rows with usable coordinates: latitude and longitude must both
# be present and non-zero.
training_all.dropna(subset=['latitude', 'longitude'], inplace=True)
has_nonzero_coordinates = (training_all['longitude'] != 0) & (training_all['latitude'] != 0)
training_all = training_all.loc[has_nonzero_coordinates]

## All Data Clean

### Construction year

In [None]:
# Remove every row whose construction_year is recorded as 0, then show how
# many wells remain for each year.
zero_year_rows = training_all.index[training_all['construction_year'] == 0]
training_all.drop(index=zero_year_rows, inplace=True)
training_all['construction_year'].value_counts()

### Installer

In [6]:
# For installer, keep only installers that appear at least 500 times

In [None]:
# Number of rows recorded for each installer.
installer_counts = training_all['installer'].value_counts()

# Installers that appear in at least 500 rows.
installers_to_keep = installer_counts.loc[installer_counts.ge(500)].index

# Keep only the rows belonging to those installers.
has_frequent_installer = training_all['installer'].isin(installers_to_keep)
training_all = training_all[has_frequent_installer]

In [None]:
# Preview the frame after the installer filter.
training_all.head()

### Scheme management

In [None]:
# Discard rows that have no scheme_management value.
training_all.dropna(axis=0, subset=['scheme_management'], inplace=True)

In [None]:
# Exclude rows whose scheme_management is the literal string 'None' or 'Other'.
uninformative_schemes = ['None', 'Other']
training_all = training_all.loc[~training_all['scheme_management'].isin(uninformative_schemes)]

### Extraction type

In [None]:
# Row count for each extraction_type value.
training_all['extraction_type'].value_counts()

In [7]:
# overwhelming majority of pumps have gravity as extraction type, so maybe not worth it to filter

In [None]:
# extraction_type_class looks like the better-organised grouping for
# comparing extraction methods, but most pumps still use gravity extraction.
training_all['extraction_type_class'].value_counts()

### Management or management group

In [None]:
# Row count for each management value.
training_all['management'].value_counts()

In [None]:
# Row count for each management_group value, for comparison with 'management'.
training_all['management_group'].value_counts()

### Quantity

In [None]:
# Row count for each quantity value.
training_all['quantity'].value_counts()

In [None]:
# Row count for each quantity_group value, for comparison with 'quantity'.
training_all['quantity_group'].value_counts()

In [None]:
# 'quantity_group' is treated as a duplicate of 'quantity' (compare the two
# value_counts outputs), so only 'quantity' is kept.
training_all.drop('quantity_group', axis='columns', inplace=True)

In [None]:
# Exclude rows whose quantity is recorded as 'unknown'.
quantity_is_known = training_all['quantity'].ne('unknown')
training_all = training_all[quantity_is_known]

### Source

In [None]:
# Row count for each source value.
training_all['source'].value_counts()

In [None]:
# Exclude rows whose source is recorded as 'unknown' or 'other'.
uninformative_sources = ['unknown', 'other']
training_all = training_all.loc[~training_all['source'].isin(uninformative_sources)]

### Waterpoint type

In [None]:
# Row count for each waterpoint_type value.
training_all['waterpoint_type'].value_counts()

In [None]:
# Row count for each waterpoint_type_group value, for comparison with 'waterpoint_type'.
training_all['waterpoint_type_group'].value_counts()

In [None]:
# Class balance of the target after all cleaning steps.
training_all['status_group'].value_counts()

## Preliminary analysis

### Installer

In [None]:
# Recount the rows per installer on the cleaned frame.
installer_counts = training_all['installer'].value_counts()

# Installers that appear in fewer than 500 rows.
installers_to_remove = installer_counts.loc[installer_counts.lt(500)].index

# Keep every row whose installer is not in that list.
has_rare_installer = training_all['installer'].isin(installers_to_remove)
installer_filtered = training_all[~has_rare_installer]

In [None]:
# Exclude rows whose installer is the literal string '0'
# (presumably a placeholder for a missing installer -- TODO confirm).
installer_filtered = installer_filtered[installer_filtered['installer'].ne('0')]

In [None]:
# Row count for each installer that survived the filters.
installer_filtered['installer'].value_counts()

In [None]:
# One-hot encode 'installer': the column is replaced by one indicator column
# per installer, each named 'installer_<value>'.
installer_dummy = pd.get_dummies(
    installer_filtered,
    prefix='installer',
    columns=['installer'],
)
installer_dummy.head()

In [None]:
# Baseline model: predict well status from the installer indicators alone.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Features are the one-hot installer indicator columns; the target is the
# status label.
installer_features = [
    'installer_CES',
    'installer_Central government',
    'installer_Community',
    'installer_DANIDA',
    'installer_DWE',
    'installer_District Council',
    'installer_Government',
    'installer_HESAWA',
    'installer_KKKT',
    'installer_RWE',
    'installer_TCRS',
]
X = installer_dummy[installer_features]
y = installer_dummy['status_group']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a logistic regression with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict the held-out rows and report the share predicted correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

## Construction year

In [None]:
# Work on an independent copy of the two columns needed for the
# construction-year analysis. Later cells drop rows from `year` and assign
# new columns to it in place; without .copy() those operations act on a
# slice of training_all and raise pandas' SettingWithCopyWarning.
year = training_all[['status_group', 'construction_year']].copy()

In [None]:
# Remove any rows whose construction_year is recorded as 0.
zero_year_rows = year.index[year['construction_year'] == 0]
year.drop(index=zero_year_rows, inplace=True)

In [None]:
# Number of wells recorded for each construction year.
year['construction_year'].value_counts()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of construction_year.
year['construction_year'].describe()

In [None]:
# Encode the target numerically for the regression:
# 'functional' -> 1, 'non functional' -> 0. Any other label becomes NaN.
status_codes = {'functional': 1, 'non functional': 0}
year['status_group'] = year['status_group'].map(status_codes)

In [None]:
import statsmodels.api as sm

# Logistic regression of well status on construction year.
# `year['status_group']` is expected to be encoded as 1 (functional) and
# 0 (non functional) by this point.

# statsmodels does not add an intercept automatically, so a constant column
# is supplied explicitly.
year['constant'] = 1

# Design matrix (intercept + construction year) and binary response.
X = year[['constant', 'construction_year']]
y = year['status_group']

# Build the Logit model, fit it, and show the fitted summary table.
logit_spec = sm.Logit(y, X)
model = logit_spec.fit()
print(model.summary())

In [None]:
# Fitted probabilities on the data the model was fit to, and the raw
# residuals (observed 0/1 outcome minus fitted probability).
predicted_values = model.predict(X)
residuals = y.sub(predicted_values)

In [None]:
import matplotlib.pyplot as plt

# Scatter the raw residuals against construction year, with a dashed
# reference line at zero.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X['construction_year'], residuals, alpha=0.6)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_title('Residual Plot')
ax.set_xlabel('Year')
ax.set_ylabel('Residuals')
plt.show()

In [None]:
# Every calendar year from the earliest to the latest construction year
# present in the data (inclusive).
years_for_prediction = np.arange(year['construction_year'].min(), year['construction_year'].max() + 1)

# Prediction frame with the same columns, in the same order, as the design
# matrix the model was fit on: ['constant', 'construction_year'].
# The column was previously misnamed '_construction_year'.
prediction_data = pd.DataFrame({'constant': 1, 'construction_year': years_for_prediction})

# Predicted probability of status_group == 1 for each year.
predicted_probs = model.predict(prediction_data)

In [None]:
# Plot the fitted probability curve over construction year.
# The Logit model predicts P(status_group == 1), and status_group was encoded
# with 'functional' -> 1, so the curve shows the probability that a well is
# functional. The axis label and title previously said status_group=0 /
# non-functional.
plt.figure(figsize=(10, 6))
plt.plot(years_for_prediction, predicted_probs, marker='o')
plt.xlabel('Year')
plt.ylabel('Predicted Probability (status_group=1)')
plt.title('Predicted Probability of Functional Water Point')
plt.show()

### Chi squared test

In [None]:
from scipy.stats import chi2_contingency
from scipy.stats import chi2

# Chi-squared test of independence between each categorical feature and the
# target (status_group), at significance level alpha.
alpha = 0.05

# List of categorical variables to test
categorical_vars = ['waterpoint_type', 'installer', 'scheme_management', 'construction_year', 'extraction_type', 'management', 'quantity', 'source']

for var in categorical_vars:
    # Observed counts of each (feature value, status) combination.
    contingency_table = pd.crosstab(training_all[var], training_all['status_group'])

    # The statistic is stored as chi2_stat so it does not overwrite the
    # imported `chi2` distribution, which is needed for the critical value.
    chi2_stat, p, dof, expected = chi2_contingency(contingency_table)

    # The critical value depends on this table's own degrees of freedom, so
    # it is recomputed for every variable rather than once up front.
    degrees_of_freedom = dof
    critical_value = chi2.ppf(1 - alpha, degrees_of_freedom)

    if chi2_stat > critical_value:
        print(f"{var} and target are dependent")
    else:
        print(f"{var} and target are independent")