In [11]:
# TASK 2B
# Name: Radiance Tan
# Student ID: 10961112
##################################################################################################################
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import graphviz
import sklearn
import re

from sklearn.cluster import KMeans
from sklearn import tree, neighbors
from sklearn.decomposition import PCA
from sklearn.tree import export_graphviz
from itertools import groupby, combinations
from sklearn.metrics import accuracy_score as ACS
from sklearn.pipeline import make_pipeline as PIPE
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.preprocessing import StandardScaler as SS
from sklearn.model_selection import train_test_split as TTS


'''----------- 'block_list' blocks country code into groups from A-Z and puts them into a dictionary -----------'''
def block_list(mylist):
    mydict={}
    for k, g in itertools.groupby(mylist, lambda x: x[1][0]):
        if k in mydict:
            mydict[k] += g
        else:
            mydict[k]=list(g)
    return mydict



'''-------------------------- code below links the 2 csvs to produce a csv 'my_world' --------------------------'''
life_df = pd.read_csv('life.csv', encoding = 'ISO-8859-1', usecols = ['Country', 'Country Code', 'Life expectancy at birth (years)'])
world_df = pd.read_csv('world.csv', encoding = 'ISO-8859-1')
world_df.dropna(subset=['Country Name'], inplace=True)
world_df.drop(columns=['Time'], inplace=True)

sort_life = sorted(life_df.values.tolist(), key=lambda x: x[1])
sort_world = sorted(world_df.values.tolist(), key=lambda x: x[1])

life_dict = block_list(sort_life)
world_dict = block_list(sort_world)

myworld = []
life = ['Low', 'Medium', 'High']

for k,v in world_dict.items():
    match_val = life_dict.get(k)
    
    for row in v:
        if not re.search(r"[&(:]|dividend|income|euro|small|ida |ibrd|oecd|conflict|not |world", (row[0]).lower()):
            if match_val:
                expectant_life = [z[2] for z in match_val if z[1]==row[1]]
                if expectant_life != []:
                    expectant_life = [life.index(expectant_life[0])]
                    myworld.append(row[0:2] + expectant_life + row[2:])
                elif expectant_life == [] and row.count('..') <= 14:
                    myworld.append(row[0:2] + ['..'] + row[2:])
        

# feature names have been shortened for easy reference in the code
features = ['[EG.ELC.ACCS.RU.ZS]', '[NY.ADJ.DPEM.GN.ZS]', '[SP.DYN.CBRT.IN]', '[SH.DTH.COMM.ZS]', '[SH.DTH.NCOM.ZS]', 
            '[SH.XPD.GHED.PC.CD]', '[IT.NET.USER.ZS]', '[SH.MMR.RISK.ZS]', '[SH.MMR.RISK]', '[SH.STA.MMRT]', '[SH.DYN.NCOM.FE.ZS]', 
            '[SH.STA.AIRP.P5]', '[SH.STA.AIRP.FE.P5]', '[SH.STA.AIRP.MA.P5]', '[SH.STA.POIS.P5.FE]', '[SH.STA.WASH.P5]', '[SH.H2O.BASW.ZS]', 
            '[SH.STA.BASS.ZS]', '[SH.STA.BASS.UR.ZS]', '[SH.ANM.CHLD.ZS]']


col = ['CountryName', 'CountryCode', 'LifeExpectancy']+features
myworld_df = pd.DataFrame(myworld, columns=col)
myworld_df = myworld_df.replace('..', np.nan)
myworld_df = myworld_df.fillna(myworld_df.median())
myworld_df[features] = myworld_df[features].astype('float')


new_combi = []
combination = list(combinations(features, 2))
for combi in combination:
    name1 = (re.search(r"\[\S+\]", combi[0])).group()
    name2 = (re.search(r"\[\S+\]", combi [1])).group()
    myworld_df[name1+' x '+name2] = myworld_df[combi[0]] * myworld_df[combi[1]]
    new_combi.append(name1+' x '+name2)


'''------------------- finding optimal k for n_clusters and features for feature engineering -------------------'''
new_features = myworld_df[features+new_combi]
cluster_no = {} 
inertia = []

for k in range(1,15): 
    model = KMeans(n_clusters=k).fit(myworld_df[features]) 
    model.fit(myworld_df[features])     
    inertia.append(model.inertia_) 
    cluster_no[k] = model.inertia_ 

    
plt.figure()
plt.plot(range(1,15), inertia, 'bx-') 
plt.xlabel('Number of clusters') 
plt.ylabel('Inertia')
plt.savefig('task2b_cluster_number.png')



# replicating decision tree in task 2A. an error might occur here if graphviz is not installed to view the image
data = myworld_df[features]
X_train, X_test, y_train, y_test = TTS(data, myworld_df['LifeExpectancy'], train_size=0.66, test_size=0.34, random_state=100)

scaler = SS().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
dt = DTC(random_state=100, max_depth=5)
dt.fit(X_train, y_train)
y_pred=dt.predict(X_test)

tree.export_graphviz(dt,out_file="tree.dot",feature_names=data.columns, filled=True)
#! dot -Tpng tree.dot -o task2b_feature_selection.png




'''------------------------------- creating cluster labels and feature engineering -------------------------------'''
kmeans = KMeans(n_clusters=4, random_state=100)
kmeans.fit(myworld_df[features])
myworld_df['clusterlabel'] = kmeans.predict(myworld_df[features])


classlabel = myworld_df['LifeExpectancy']
data1 = myworld_df[['clusterlabel', '[SH.MMR.RISK]', '[SH.STA.WASH.P5]', '[SH.DYN.NCOM.FE.ZS]']]

X_train1, X_test1, y_train1, y_test1 = TTS(data1, classlabel, train_size=0.66, test_size=0.34, random_state=100)
scaler1 = SS().fit(X_train1)
X_train1 = scaler1.transform(X_train1)
X_test1 = scaler1.transform(X_test1)

knn1 = neighbors.KNeighborsClassifier(n_neighbors=5)
knn1.fit(X_train1, y_train1)
y_pred1 = knn1.predict(X_test1)
print("Accuracy of feature engineering: {:.3f}%".format(ACS(y_test1, y_pred1)*100))


'''---------------------------------------------------- PCA ----------------------------------------------------'''
data = myworld_df.iloc[:, 3:]
X_train2, X_test2, y_train2, y_test2 = TTS(data, classlabel, train_size=0.66, test_size=0.34, random_state=100)

pca = PIPE(SS(), PCA(n_components=4, random_state=100))
pca.fit(X_train2)
X_train2 = pca.transform(X_train2)
X_test2 = pca.transform(X_test2) 

knn2 = neighbors.KNeighborsClassifier(n_neighbors=5)
knn2.fit(X_train2, y_train2)
y_pred2 = knn2.predict(X_test2)
print("Accuracy of PCA: {:.3f}%".format(ACS(y_test2, y_pred2)*100))


'''--------------------------------------------- first 4 features ----------------------------------------------'''
data3 = myworld_df[["[EG.ELC.ACCS.RU.ZS]","[NY.ADJ.DPEM.GN.ZS]", "[SP.DYN.CBRT.IN]","[SH.DTH.COMM.ZS]"]]

X_train3, X_test3, y_train3, y_test3 = TTS(data3, classlabel, train_size=0.66, test_size=0.34, random_state=100)
scaler3 = SS().fit(X_train3)
X_train3 = scaler3.transform(X_train3)
X_test3 = scaler3.transform(X_test3)

knn3 = neighbors.KNeighborsClassifier(n_neighbors=5)
knn3.fit(X_train3, y_train3)
y_pred3 = knn3.predict(X_test3)
print("Accuracy of first four features: {:.3f}%".format(ACS(y_test3, y_pred3)*100))


# used to check accuracies of train set
# print("Accuracy of feature engineering (train set): {:.1f}%".format(ACS(y_train1, knn1.predict(X_train1))*100))
# print("Accuracy of PCA (train set): {:.1f}%".format(ACS(y_train2, knn2.predict(X_train2))*100))
# print("Accuracy of first 4 features (train set): {:.1f}%".format(ACS(y_train3, knn3.predict(X_train3))*100))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').