**Import Important Libraries**

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
#import scipy.sparse.linalg
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [6]:
#Warnings - Ignore All
import warnings
warnings.simplefilter('ignore')

In [7]:
#Ignore FutureWarning notices (warnings about upcoming changes in library behaviour)
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

**Data Ingestion**

In [8]:
# Load the three raw CSV files; the paths are relative to the working directory.
# train_values and train_labels both carry an 'id' column (used for the merge below);
# train_labels also provides the 'status_group' target used later on.
train_values = pd.read_csv('../data/raw/Training set values.csv')
train_labels = pd.read_csv('../data/raw/Training set labels.csv')
test_values  = pd.read_csv('../data/raw/Test set values.csv')

***Data Merge***

In [9]:
# Attach each row's label through the shared 'id' key (left join keeps every
# row of train_values), then discard the key column itself.
train_data_complete = (
    train_values
    .merge(train_labels, on='id', how='left')
    .drop(columns='id')
)

**Data Cleaning**

In [None]:
# #Drop Redundant Columns
# features_to_drop = ['date_recorded']

In [None]:
#  train_data_v1 = train_data_complete.drop(features_to_drop, axis=1) 

***Missing Data Imputation***

In [10]:
def _missing_value_table(data):
    """Build a per-column table of missing counts and percentages, largest count first."""
    counts = data.isnull().sum().sort_values(ascending=False)
    percentages = 100 * counts / len(data)
    return pd.concat([counts, percentages], axis=1, keys=['Missing Values', '% Missing'])


def print_missing_values(train_data, test_data=None):
    """
    Print, for each dataset, the columns that contain missing values together with
    the number and percentage of missing entries.

    Parameters:
    train_data (pd.DataFrame): the training data
    test_data (pd.DataFrame): the test data (optional); its report is printed
                              after the training report when provided

    Returns:
    None
    """
    sections = [("TRAIN DATA:", train_data)]
    if test_data is not None:
        # Leading newline separates the second report from the first.
        sections.append(("\nTEST DATA:", test_data))

    for heading, frame in sections:
        print(heading)
        table = _missing_value_table(frame)
        # Only columns that actually have gaps are worth showing.
        print(table[table['Missing Values'] > 0])


In [11]:
# Report which columns of the raw feature sets contain missing values, and how many.
print_missing_values(train_values, test_values)

TRAIN DATA:
                   Missing Values  % Missing
scheme_name                 28166  47.417508
scheme_management            3877   6.526936
installer                    3655   6.153199
funder                       3635   6.119529
public_meeting               3334   5.612795
permit                       3056   5.144781
subvillage                    371   0.624579

TEST DATA:
                   Missing Values  % Missing
scheme_name                  7092  47.757576
scheme_management             969   6.525253
installer                     877   5.905724
funder                        869   5.851852
public_meeting                821   5.528620
permit                        737   4.962963
subvillage                     99   0.666667


In [13]:
def delete_missing_value_columns(train_df, test_df=None, threshold=0.7):
    """
    Remove columns that are mostly empty, judged on the training data only.

    A column is removed when the fraction of missing values in ``train_df`` is
    strictly greater than ``threshold``. The same columns are then removed from
    ``test_df`` (when given), skipping any that do not exist there.

    Parameters:
    train_df (pandas.DataFrame): The training DataFrame to delete columns from.
    test_df (pandas.DataFrame, optional): The test DataFrame to delete columns from. Default is None.
    threshold (float, optional): Largest tolerated fraction of missing values, on a 0-1 scale
                                 (0.7 means 70%). Default is 0.7.

    Returns:
    pandas.DataFrame or tuple: The cleaned training DataFrame when ``test_df`` is None,
                               otherwise ``(cleaned_train, cleaned_test)``. The inputs are not modified.
    """
    # Fraction of missing entries per training column.
    missing_share = train_df.isnull().mean()
    sparse_columns = [name for name, share in missing_share.items() if share > threshold]

    cleaned_train = train_df.drop(columns=sparse_columns)
    if test_df is None:
        return cleaned_train

    # The test frame may lack some of the columns, so only drop the ones it has.
    droppable_in_test = [name for name in sparse_columns if name in test_df.columns]
    cleaned_test = test_df.drop(columns=droppable_in_test)
    return cleaned_train, cleaned_test


In [14]:
# Uses the default threshold of 0.7. The report above shows at most ~47% missing
# (scheme_name), so no column is expected to be removed by this call.
train_values_v1, test_values_v1 = delete_missing_value_columns(train_values, test_values)

In [15]:
def impute_missing_values(train_data, test_data=None):
    """
    Impute missing values in the continuous and categorical columns using SimpleImputer.

    Numeric columns are filled with the training mean, object/category columns with
    the most frequent training value. The imputers are fitted on the training data
    only and then applied to the test data, so no test statistics are used.
    The input DataFrames are not modified; imputed copies are returned.

    Parameters:
    train_data (pd.DataFrame): the training data to fit the imputers on
    test_data (pd.DataFrame): the test data to transform with the fitted imputers (optional);
                              it must contain the same columns that are imputed in train_data

    Returns:
    pd.DataFrame: the transformed training data
    pd.DataFrame: the transformed test data (only when test_data is not None, returned
                  as the second element of a tuple)
    """
    # Work on copies so the caller's frames stay intact and the cell can be re-run safely.
    train_data = train_data.copy()
    if test_data is not None:
        test_data = test_data.copy()

    # Column groups are decided from the training data alone.
    continuous_columns = train_data.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = train_data.select_dtypes(include=['object', 'category']).columns.tolist()

    column_groups = (
        (continuous_columns, SimpleImputer(strategy='mean')),
        (categorical_columns, SimpleImputer(strategy='most_frequent')),
    )

    for columns, imputer in column_groups:
        # SimpleImputer cannot be fitted on zero columns, so skip empty groups.
        if not columns:
            continue
        train_data[columns] = imputer.fit_transform(train_data[columns])
        if test_data is not None:
            test_data[columns] = imputer.transform(test_data[columns])

    # Return transformed train and test data (if applicable)
    if test_data is not None:
        return train_data, test_data
    return train_data


In [16]:
# Fill the remaining gaps with statistics learned from the training set.
# (This cell previously repeated the column-deletion call, so nothing was imputed.)
train_values_v2, test_values_v2 = impute_missing_values(train_values_v1, test_values_v1)

**Data Preprocessing**

***Transforming Data***

In [17]:
class FrequencyEncoder():
    """
    FrequencyEncoder
    Conversion of category into frequencies (share of rows holding that value).

    Frequencies are learned in ``fit``. In ``transform`` a value that was seen
    during fitting receives its fitted frequency; a value that was not seen falls
    back to its relative frequency inside the frame being transformed. Missing
    values are treated as a category of their own.

    Parameters
        ----------
    cols : list of categorical features. If None, every column of the frame
           passed to ``fit`` is encoded.
    drop_invariant : not used
    """
    def __init__(self, cols=None, drop_invariant=None):
        """
        Store the configuration; no data is touched here.
        Args:
            cols=None (list or None): columns in dataset to encode
            drop_invariant=None (undefined): not used
        """
        self.cols = cols
        # Filled by fit(): {column name: {category value: relative frequency}}
        self.counts_dict = None

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for scalar missing markers such as None or NaN."""
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def fit(self, X: pd.DataFrame, y=None) -> "FrequencyEncoder":
        """
        Learn the relative frequency of every value in each encoded column.
        Args:
            X (pd.DataFrame): dataset
            y=None (not used): not used
        Returns:
            FrequencyEncoder: the fitted encoder itself, to allow call chaining
        """
        if self.cols is None:
            self.cols = list(X.columns)
        # dropna=False keeps missing values as a category with its own frequency.
        self.counts_dict = {
            col: X[col].value_counts(normalize=True, dropna=False).to_dict()
            for col in self.cols
        }
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the values of the encoded columns with frequencies.
        Args:
            X (pd.DataFrame): dataset; it is not modified
        Returns:
            pd.DataFrame: a copy of X whose encoded columns hold float frequencies
        Raises:
            ValueError: if the encoder has not been fitted yet
        """
        if self.counts_dict is None:
            raise ValueError("FrequencyEncoder must be fitted before calling transform().")

        encoded = X.copy()
        for col in self.cols:
            column = X[col]
            fitted = self.counts_dict[col]
            local = column.value_counts(normalize=True, dropna=False)

            mapping = {}
            missing_freq = 0.0
            for value, freq in local.items():
                if self._is_missing(value):
                    # NaN never compares equal to itself, so missing values are
                    # handled separately instead of through the lookup dict.
                    missing_freq += freq
                else:
                    # Fitted frequency wins; unseen values keep their local one.
                    mapping[value] = fitted.get(value, freq)

            fitted_missing = [freq for value, freq in fitted.items() if self._is_missing(value)]
            if fitted_missing:
                missing_freq = sum(fitted_missing)

            result = column.map(mapping).astype(float)
            missing_rows = column.isna()
            if missing_rows.any():
                result = result.where(~missing_rows, missing_freq)
            encoded[col] = result.to_numpy()
        return encoded

    def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """
        Fit on X and return its encoded copy.
        Args:
            X (pd.DataFrame): dataset
            y=None (undefined): not used
        Returns:
            pd.DataFrame
        """
        return self.fit(X, y).transform(X)

In [18]:
def transform_data(train_data, test_data):
    """
    Standardise numeric columns and frequency-encode object columns.

    Both transformers are fitted on the training data only and then applied to the
    test data. The input DataFrames are not modified; transformed copies are returned.

    Parameters:
    train_data (pd.DataFrame): the training data used to fit the scaler and encoder
    test_data (pd.DataFrame): the test data; must contain the columns being transformed

    Returns:
    tuple: (transformed training DataFrame, transformed test DataFrame)

    NOTE(review): every int64/float64 column is scaled, which includes identifier
    columns such as 'id' if they are still present — confirm this is intended.
    """
    # Work on copies so the caller's frames stay intact and the cell can be re-run safely.
    train_data = train_data.copy()
    test_data = test_data.copy()

    # Separate continuous and categorical columns (decided from the training data)
    continuous_features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_features = train_data.select_dtypes(include=['object']).columns.tolist()

    # Scale continuous features; skipped when there are none, because the scaler
    # cannot be fitted on zero columns.
    if continuous_features:
        scaler = StandardScaler()
        train_data[continuous_features] = scaler.fit_transform(train_data[continuous_features])
        test_data[continuous_features] = scaler.transform(test_data[continuous_features])

    # Replace categories with their training frequency; skipped when there are none.
    if categorical_features:
        encoder = FrequencyEncoder()
        train_data[categorical_features] = encoder.fit_transform(train_data[categorical_features])
        test_data[categorical_features] = encoder.transform(test_data[categorical_features])

    return train_data, test_data


In [19]:
# Scale the numeric columns and frequency-encode the object columns
# (transformers are fitted on the training frame and applied to the test frame).
train_values_v3, test_values_v3 = transform_data(train_values_v2, test_values_v2)

***Feature Selection***

In [21]:
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

def select_features(X, y, num_features):
    """
    Select the top 'num_features' features based on mutual information with the target variable.

    X: pandas dataframe of features, can include continuous and categorical variables.
       It is not modified; object columns are label-encoded on an internal copy.
    y: pandas series of target variable.
    num_features: number of features to select.

    Returns a list of column names, ordered from highest to lowest score.
    """
    # Encode on a copy so the caller's DataFrame keeps its original values.
    X = X.copy()

    # Encode categorical variables if any
    label_encoder = LabelEncoder()
    for col in X.select_dtypes(include=['object']).columns:
        X[col] = label_encoder.fit_transform(X[col])

    # Compute mutual information scores for each feature (fixed seed for repeatability)
    mi_scores = mutual_info_classif(X, y, random_state=42)

    # Combine scores with feature names
    feature_scores = dict(zip(X.columns, mi_scores))

    # Select top features based on scores
    top_features = sorted(feature_scores, key=feature_scores.get, reverse=True)[:num_features]

    return top_features

In [22]:
# Rank the features by mutual information with the target and keep the 10 best.
# NOTE(review): assumes the rows of train_values_v3 are in the same order as train_labels — TODO confirm.
top_features = select_features(train_values_v3,train_labels['status_group'], 10)

In [23]:
# Restrict both frames to the selected columns so train and test share the same feature layout.
train_values_v4, test_values_v4 = train_values_v3[top_features], test_values_v3[top_features]

***Feature Engineering***

**Modeling**

In [24]:
# Feature matrix used for model fitting.
X = train_values_v4

In [25]:
# Encode the string status labels as integers for the classifier.
# LabelEncoder numbers the classes in sorted order; le.classes_ holds the code -> label lookup.
le = LabelEncoder()

Y  = le.fit_transform(train_labels['status_group'])

In [26]:
# 70/30 split, stratified on the target so both parts keep the class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size = 0.3, random_state = 42)

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# Fit k-nearest neighbours (k = 14) on the training portion of the split only.
# NOTE(review): X_test / y_test are not scored in the visible cells, so this choice of k is unvalidated here.
classifier = KNeighborsClassifier(n_neighbors = 14)
classifier.fit(X_train, y_train)

In [29]:
# Predict the encoded class ids for the unlabeled test set.
test_predictions = classifier.predict(test_values_v4)

In [30]:
# Pair each prediction with the id taken from the raw test frame.
# NOTE(review): relies on test_predictions being in the same row order as test_values — TODO confirm.
submission_df = pd.DataFrame({'id': test_values['id'], 'target': test_predictions})

In [31]:
# Translate the encoded predictions back to label strings with the fitted encoder.
# A hand-written {0, 1, 2} lookup is error-prone: LabelEncoder assigns codes in sorted
# label order, so 1 is 'functional needs repair' and 2 is 'non functional'.
submission_df['status_group'] = le.inverse_transform(submission_df['target'])

In [32]:
# Write the submission to the current working directory, without the DataFrame index.
# NOTE(review): the file also contains the intermediate 'target' column — confirm the expected submission format.
submission_df.to_csv('submission_v2.csv', index=False)