In [None]:
import math
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
from collections import Counter
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

### Import the train, test, and label datasets

In [None]:
# Directory holding the competition CSV files. Change this single constant to
# point the notebook at a different copy of the data.
DATA_DIR = "D:/Semester 7/Ml_Project"

# Feature tables for the labelled (train) and unlabelled (test) rows, plus the
# label table that goes with the training rows.
train = pd.read_csv(f"{DATA_DIR}/Training_set_values.csv")
test = pd.read_csv(f"{DATA_DIR}/Test_set_values.csv")
train_label = pd.read_csv(f"{DATA_DIR}/Training_set_labels.csv")

In [None]:
# Preview the first ten rows of the raw training features.
train.head(10)

### plot bar chart for label set

In [None]:
# Number of training rows in each status_group class.
status_group = train_label['status_group'].value_counts().to_dict()
class_names = list(status_group)
class_counts = [status_group[name] for name in class_names]

# Bar chart of the absolute class counts.
plt.figure(figsize=(10, 5))
plt.bar(class_names, class_counts, width=0.4)
plt.xlabel("status_group")
plt.ylabel("amount")
plt.show()

# Print each class's share of all training labels.
total_rows = sum(class_counts)
for name, count in status_group.items():
    print(str(name) + ": " + str(float(count) / total_rows))

###### Therefore, the dataset is imbalanced

### find duplicate rows


In [None]:
# Every feature column except the row identifier. Selected by name so the
# result does not depend on 'id' being the first column of the file.
col_except_id = [col for col in train.columns if col != 'id']

In [None]:
# Rows that repeat an earlier row in every column except 'id'.
n_duplicates = train.duplicated(subset=col_except_id).sum()
print("Number of duplicated rows in train set: " + str(n_duplicates))

In [None]:
# Index labels of the rows flagged as duplicates on the non-id columns.
is_duplicate = train.duplicated(subset=col_except_id)
duplicate_indexes = train.index[is_duplicate].tolist()
print(duplicate_indexes)

### removing identical rows

In [None]:
# Sizes of the three tables before any rows are removed.
n_train_rows = len(train.index)
n_test_rows = len(test.index)
n_label_rows = len(train_label.index)
print("Number of rows in train set: "+ str(n_train_rows))
print("Number of rows in test set: "+ str(n_test_rows))
print("Number of rows in train labels: "+ str(n_label_rows))

In [None]:
# Drop rows that repeat an earlier row in every feature column.
train = train.drop_duplicates(subset=col_except_id)

# Keep exactly the labels whose 'id' survived. Filtering by id (rather than by
# positions computed in another cell) makes this cell safe to re-run: a second
# run finds nothing left to drop instead of removing further labels.
train_label = train_label[train_label['id'].isin(train['id'])]

# Features and labels are paired row-by-row later on, so they must stay in step.
assert len(train) == len(train_label), "train and train_label are out of sync"

In [None]:
# Sizes of the feature and label tables once duplicates are gone.
n_train_rows_after = len(train.index)
n_label_rows_after = len(train_label.index)
print("Number of rows in train set after removing duplicates: "+ str(n_train_rows_after))
print("Number of rows in train label set after removing duplicates: "+ str(n_label_rows_after))

### Verify that the identical rows were removed

In [None]:
# Re-count duplicates on the non-id columns after the removal step.
n_duplicates_left = train.duplicated(subset=col_except_id).sum()
print("Number of duplicated rows in train set: " + str(n_duplicates_left))

### Find columns which contain NaN

In [None]:
# Per-column flag: does the column contain at least one missing value?
train_has_nan = train.isna().any()
test_has_nan = test.isna().any()

# Names of the flagged columns, in the frame's column order.
train_col_with_NaN = train_has_nan[train_has_nan].index.tolist()
test_col_with_NaN = test_has_nan[test_has_nan].index.tolist()

print("columns which contain NaN in train set: " + str(train_col_with_NaN))
print("columns which contain NaN in test set: " + str(test_col_with_NaN))

### remove NaN values from identified columns using mode

In [None]:
# Columns that need imputation in either frame: train's columns first, then any
# column that has missing values only in the test set.
cols_to_fill = train_col_with_NaN + [
    col for col in test_col_with_NaN if col not in train_col_with_NaN
]

for col in cols_to_fill:
    # Most frequent value of the column in the training data. The same value is
    # used for both frames so train and test are imputed consistently.
    fill_value = train[col].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection, which does not reliably update the parent frame.
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)


### Verify that the NaN values were replaced

In [None]:
# Print both lists explicitly: a notebook cell only displays its last bare
# expression, so the train check would otherwise be computed and discarded.
print("columns which still contain NaN in train set: " + str(train.columns[train.isna().any()].tolist()))
print("columns which still contain NaN in test set: " + str(test.columns[test.isna().any()].tolist()))

In [None]:
# Preview the first rows of the training features after imputation.
train.head()

### Replace funder and installer values that occur 25 times or fewer with 'others'

In [None]:
# A category must occur more than this many times in the training set to keep
# its own level; everything else is merged into 'others'.
RARE_THRESHOLD = 25

for col in ('funder', 'installer'):
    # Frequencies are learned from the training data only.
    counts = train[col].value_counts()
    frequent = counts[counts > RARE_THRESHOLD].index

    # Apply the same bucketing to both frames so the test set uses the category
    # levels the models are trained on. Values never seen in training also
    # become 'others'. Missing values are left as they are.
    for frame in (train, test):
        keep = frame[col].isin(frequent) | frame[col].isna()
        frame[col] = frame[col].where(keep, 'others')

### Extract month and year from the "date_recorded" column and create new columns called month and year

In [None]:
# Parse the recording date of each frame.
train_recorded = pd.to_datetime(train.date_recorded)
test_recorded = pd.to_datetime(test.date_recorded)

# Derive calendar month and year features.
train['month'] = train_recorded.dt.month
test['month'] = test_recorded.dt.month
train['year'] = train_recorded.dt.year
test['year'] = test_recorded.dt.year

# The raw date column is no longer needed; also forget it in the column list.
train = train.drop(columns='date_recorded')
test = test.drop(columns='date_recorded')
col_except_id.remove('date_recorded')



In [None]:
# Preview the training features with the new month/year columns.
train.head()

In [None]:
# Preview the test features with the new month/year columns.
test.head()

### perform log normalization for population column

In [None]:
# Base-10 log of (population + 1); the +1 keeps zero populations finite.
train['population'] = np.log10(train['population'] + 1)
test['population'] = np.log10(test['population'] + 1)

### convert 0 value in installer column to "others"

In [None]:
# Alphabetically sorted list of the distinct installer values in train.
ins = sorted(train['installer'].unique())
print(ins)


In [None]:
# Treat the placeholder '0' (and any remaining missing value) as 'others'.
# Done for both frames so train and test share the same installer levels, and
# assigned back rather than replaced in place on a column selection, which does
# not reliably update the parent frame.
for frame in (train, test):
    frame['installer'] = frame['installer'].replace(to_replace=(None,'0'), value = "others")

In [None]:
# Make sure construction_year is stored as a numeric column in both frames.
for frame in (train, test):
    frame['construction_year'] = pd.to_numeric(frame['construction_year'])

### convert 0 value in funder column to "others"

In [None]:
# Alphabetically sorted list of the distinct funder values in train.
funder = sorted(train['funder'].unique())
funder

In [None]:
# Treat the placeholder '0' (and any remaining missing value) as 'others'.
# Done for both frames so train and test share the same funder levels, and
# assigned back rather than replaced in place on a column selection, which does
# not reliably update the parent frame.
for frame in (train, test):
    frame['funder'] = frame['funder'].replace(to_replace=(None, '0'), value = "others")

In [None]:
# Frequency of every scheme_management value in train, missing values included.
train.loc[:, 'scheme_management'].value_counts(dropna=False)



In [None]:
# Replace the literal string 'None' with 'others'. Done for both frames so
# train and test share the same levels, and assigned back rather than replaced
# in place on a column selection, which does not reliably update the parent
# frame.
for frame in (train, test):
    frame['scheme_management'] = frame['scheme_management'].replace(to_replace='None', value = "others")

In [None]:
# Re-check the scheme_management frequencies after the replacement.
train.loc[:, 'scheme_management'].value_counts(dropna=False)

In [None]:
# Summarise the test frame's columns, dtypes and non-null counts.
test.info()

### Calculate the uncertainty coefficient (Theil's U) to identify the most related features

In [None]:
def conditional_entropy(x,y):
    """Return the conditional entropy of ``x`` given ``y`` (natural log).

    ``x`` and ``y`` are paired sequences of hashable observations. The result
    is the sum over observed (x, y) pairs of p(x, y) * log(p(y) / p(x, y)),
    with probabilities estimated from the counts. Two empty inputs give 0.
    """
    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_obs = sum(marginal_counts.values())

    return sum(
        (n_xy / n_obs) * math.log((marginal_counts[pair[1]] / n_obs) / (n_xy / n_obs))
        for pair, n_xy in joint_counts.items()
    )

def theil_u(x,y):
    """Return Theil's U (uncertainty coefficient) of ``x`` given ``y``.

    Computed as (H(x) - H(x|y)) / H(x): the fraction of the entropy of ``x``
    that is removed by knowing ``y``. Returns 1 when ``x`` has zero entropy.
    """
    h_x_given_y = conditional_entropy(x, y)

    # Empirical distribution of x and its entropy.
    level_counts = Counter(x)
    n_obs = sum(level_counts.values())
    h_x = ss.entropy([count / n_obs for count in level_counts.values()])

    if h_x == 0:
        return 1
    return (h_x - h_x_given_y) / h_x

In [None]:
# Theil's U of the target given each training column: how much of the
# uncertainty in status_group is removed by knowing that column.
columns = train.columns
labels = train_label['status_group'].tolist()  # converted once, reused per column

# Build the one-row frame as float from the start. Filling an empty (object
# dtype) frame cell by cell leaves it non-numeric unless a later fillna happens
# to downcast it.
theilu = pd.DataFrame(
    {col: [theil_u(labels, train[col].tolist())] for col in columns},
    index=['class'],
    dtype=float,
)

plt.figure(figsize=(20,1))
sns.heatmap(theilu,annot=True,fmt='.2f')
plt.show()

### remove less related columns from train and test sets

In [None]:
# delete_columns = ['region', 'recorded_by', 'extraction_type_class', 'management_group', 'payment', 'quality_group', 'source_type', 'source_class', 'waterpoint_type_group', 'payment_type', 'quantity_group']

# delete_columns = ['date_recorded','wpt_name','num_private','subvillage','lga','ward','recorded_by','extraction_type_group','extraction_type','scheme_name','management','waterpoint_type_group','source','source_class','quantity_group','quality_group','payment_type']

# for ele in delete_columns:
    
#     print(ele+": "+str(train[ele].isnull().values.any()))
#     train = train.drop([ele], axis=1)
#     test = test.drop([ele], axis=1)
    
# train.head()

### Label set encoding

In [None]:
# Integer code for each class name, taken from its position in this list.
data_classes = ['non functional', 'functional needs repair', 'functional']
encoded_label = {name: code for code, name in enumerate(data_classes)}

# Replace the class names with their codes; missing labels are left untouched.
train_label['status_group'] = train_label['status_group'].map(encoded_label, na_action='ignore')


In [None]:
# Preview the label table after encoding.
train_label.head()

### Do target encoding for non-numerical columns

In [None]:
from category_encoders import TargetEncoder
from category_encoders import LeaveOneOutEncoder


## Independent copies of train and test for the target-encoded pipeline; the
## originals are reused later for catboost classification.
train_ = train.copy()
test_ = test.copy()

# The encoder is fitted on a 25% sample of the training rows and their labels.
train_encode = train_.sample(frac=0.25, random_state=42)
train_label_encode = train_label.loc[train_encode.index]

# Feature columns whose values are strings. .iloc[0] reads the first row by
# position, so it does not depend on a row labelled 0 still being present.
column_with_str = [
    col for col in col_except_id if isinstance(train[col].iloc[0], str)
]

# encoder = LeaveOneOutEncoder(sigma=0.05, random_state=42)
# Name the columns to encode explicitly. Without `cols` the encoder picks the
# columns itself, which can also pick up the boolean permit/public_meeting
# columns when they are stored with object dtype.
encoder = TargetEncoder(cols=column_with_str)

encoder = encoder.fit(train_encode, train_label_encode['status_group'])
train_ = encoder.transform(train_)
test_ = encoder.transform(test_)

In [None]:
# Preview the target-encoded training features.
train_.head()

In [None]:
# Preview the target-encoded test features.
test_.head()

### convert boolean value columns to 0's and 1's

In [None]:
# Cast the two flag columns to bool and then to 0/1 integers.
for flag in ('permit', 'public_meeting'):
    train_[flag] = train_[flag].astype(bool).astype(int)


### Perform onehot encoding for permit and public_meeting

In [None]:
# Expand each flag column into one indicator column per value.
train_ = pd.get_dummies(data=train_, columns=['permit', 'public_meeting'])

In [None]:
# Preview the training features after one-hot encoding the flags.
train_.head()

### removing 'id' column from both train_ and train_label set

In [None]:
# The identifier is not a feature or a target; remove it from both tables.
train_label = train_label.drop(columns='id')
train_ = train_.drop(columns='id')

### Use SMOTE to handle the imbalanced train dataset

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes. `n_jobs` is no longer passed: it is
# deprecated on SMOTE and rejected by newer imbalanced-learn releases.
# NOTE(review): resampling happens before the train/validation split, so
# synthetic rows derived from validation rows can end up in the training split
# and the scores measured later are likely optimistic. Consider splitting first
# and resampling only the training part.
oversample = SMOTE(sampling_strategy = 'auto', random_state=42)
X, y = oversample.fit_resample(train_, train_label)

### split train data set to X_train and X_test 

In [None]:
# Hold out 20% of the resampled rows, keeping the class proportions. The fixed
# random_state makes the split (and every score derived from it) reproducible
# across runs, in line with the seeds used elsewhere in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

### Normalization and classify using XGB classifier

In [None]:
# `nrounds` and `maximize` were dropped: they are not XGBClassifier parameters
# and were only passed through as unused keyword arguments.
model = XGBClassifier(objective = 'multi:softmax', booster = 'gbtree',
                      num_class = 3, eval_metric = 'merror', eta = .1,
                      max_depth = 16, colsample_bytree = .4, n_jobs = -1, random_state=42)

# Min-max scale both splits with statistics taken from the training split only,
# so the validation rows are on the same scale the model is fitted on. Scaling
# X_test with its own min/max would map the same raw value to different inputs.
feature_min = X_train.min()
# A constant column has zero range; divide by 1 instead so it becomes 0 rather
# than NaN or inf.
feature_range = (X_train.max() - feature_min).replace(0, 1)
X_train = (X_train - feature_min) / feature_range
X_test = (X_test - feature_min) / feature_range

model.fit(X_train, y_train)

### predict using X_test

In [None]:
# Predicted class codes for the held-out rows.
predictions = model.predict(X_test)

### evaluate predictions

In [None]:
# Share of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

### plot confusion matrix

In [None]:
# Store the matrix under its own name. Assigning it to `confusion_matrix` would
# shadow the imported function and make this cell fail when run a second time.
cm = confusion_matrix(y_test['status_group'], predictions)
plot_confusion_matrix(cm)
plt.show()

# y_test.head()
# predictions

### K-fold cross validation

In [None]:
# 10-fold shuffled cross-validation of the model on the training split.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# Mean and standard deviation of the per-fold accuracies.
mean_accuracy = np.mean(scores)
accuracy_spread = np.std(scores)
print('Accuracy: %.3f (%.3f)' % (mean_accuracy, accuracy_spread))

### convert boolean values in public_meeting and permit of test set to 0's and 1's 

In [None]:
# Cast the two flag columns to bool and then to 0/1 integers.
for flag in ('permit', 'public_meeting'):
    test_[flag] = test_[flag].astype(bool).astype(int)

# Keep the identifiers for the submission file, then drop them from the features.
t = test_['id']
test_ = test_.drop(columns="id")

### onehot encoding for permit and public_meeting of test_ set

In [None]:
# Expand each flag column into one indicator column per value.
test_ = pd.get_dummies(data=test_, columns=['permit', 'public_meeting'])

In [None]:
# Preview the test features after one-hot encoding the flags.
test_.head()

### predict values for  test dataset

In [None]:
# Scale the test set with the per-feature minimum and range of the rows the
# model was trained on, not with the test set's own min/max, so a given raw
# value maps to the same model input in both sets. The unscaled training rows
# are recovered from the resampled frame X through the training split's index.
train_rows_raw = X.loc[X_train.index]
feature_min = train_rows_raw.min()
# A constant column has zero range; divide by 1 instead so the result stays finite.
feature_range = (train_rows_raw.max() - feature_min).replace(0, 1)

test_ = (test_ - feature_min) / feature_range
test_predictions = model.predict(test_)

### create csv file for output predictions 

In [None]:
def _label_for(code):
    """Map a predicted class code back to its status_group name."""
    if code == 0:
        return 'non functional'
    if code == 1:
        return 'functional needs repair'
    return 'functional'


test_predictions = test_predictions.tolist()
t = list(t)

# One [id, class name] pair per test row.
list_to_csv = [[t[i], _label_for(test_predictions[i])] for i in range(len(t))]

to_csv = pd.DataFrame(list_to_csv, columns=['id', 'status_group'])
to_csv.to_csv('submission.csv', index = False)

In [None]:
# Number of rows written to the submission file.
print(len(to_csv.index))

In [None]:
# Preview the scaled training split.
X_train.head()

In [None]:
# Preview the scaled held-out split.
X_test.head()


### CatBoost classification

In [None]:
# Columns whose value in the row labelled 0 is a string; these are passed to
# CatBoost as categorical features.
cat_features = [ele for ele in train.columns if type(train[ele][0]) is str]

In [None]:
flag_columns = ['permit', 'public_meeting']

# Cast the flag columns to bool and then to 0/1 integers in both frames.
for flag in flag_columns:
    train[flag] = train[flag].astype(bool).astype(int)
    test[flag] = test[flag].astype(bool).astype(int)

# Expand each flag column into one indicator column per value.
train = pd.get_dummies(train, columns=flag_columns)
test = pd.get_dummies(test, columns=flag_columns)

In [None]:
# train_label = train_label.drop(['id'], axis=1)
# Remove the identifier from the training features.
train = train.drop(columns='id')

# Keep the test identifiers for the submission file, then drop them as well.
t = test['id']
test = test.drop(columns="id")


In [None]:
# Hold out 20% of the training rows, keeping the class proportions. The fixed
# random_state makes the split (and the accuracy reported from it) reproducible.
X_train_catboost, X_test_catboost, y_train_catboost, y_test_catboost = train_test_split(
    train, train_label, test_size=0.2, stratify=train_label, random_state=42
)


In [None]:
from catboost import CatBoostClassifier

# Constructor settings collected in one place.
catboost_params = dict(
    max_ctr_complexity=5,
    task_type='CPU',
    iterations=10000,
    eval_metric='AUC',
    od_type='Iter',
    od_wait=500,
    cat_features=cat_features,
    verbose=False,
)
clf = CatBoostClassifier(**catboost_params)

# The held-out split is passed as the evaluation set during fitting.
validation_set = (X_test_catboost, y_test_catboost)
clf.fit(
    X_train_catboost,
    y_train_catboost,
    cat_features=cat_features,
    eval_set=validation_set,
    verbose=False,
)

In [None]:
# Predicted classes for the held-out CatBoost split.
pred = clf.predict(data=X_test_catboost)

In [None]:
# Share of held-out rows whose predicted class matches the true class.
accuracy = accuracy_score(y_true=y_test_catboost, y_pred=pred)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

In [None]:
# Predicted classes for the competition test set.
testpredictions = clf.predict(test)
# train.head()

In [None]:
# Flatten before converting to a list. CatBoost returns multiclass predictions
# as an (n_samples, 1) column, so a plain tolist() yields one-element lists;
# those never compare equal to 0 or 1, and every row was written as
# 'functional'. np.ravel is a no-op if the array is already one-dimensional.
test_predictions = np.ravel(testpredictions).tolist()

# Class code -> class name; any other code falls back to 'functional'.
label_names = {0: 'non functional', 1: 'functional needs repair'}

t = list(t)
list_to_csv = [
    [row_id, label_names.get(int(code), 'functional')]
    for row_id, code in zip(t, test_predictions)
]

to_csv = pd.DataFrame(list_to_csv, columns=['id', 'status_group'])
to_csv.to_csv('submission_catboost.csv', index = False)

In [None]:
# Preview the label table as used for the CatBoost model.
train_label.head()