# Tugas Besar 2 Inteligensi Buatan

## Prediksi Income per Tahun

#### Anggota Kelompok:
- Devin Alvaro / 13515062
- Stevanno Hero Leadervand / 13515082
- Rizki Ihza / 13515104
- Gianfranco Fertino Hwandiano / 13515118

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing, neighbors, tree
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

%matplotlib inline

### Membaca dataset

In [None]:
# Load the raw training data. The file is read without a header row, so the
# columns arrive numbered by position and are named below.
main_df = pd.read_csv("data/CencusIncome.data.txt", header=None)

# name columns (position -> name)
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label',
]
main_df = main_df.rename(columns=dict(enumerate(column_names)))

# remove rows with null values: drop every row that carries the '?'
# placeholder in ANY column. Checking only workclass, marital-status and
# native-country lets a '?' in another column (e.g. occupation) through.
# NOTE(review): assumes '?' is the only missing-value marker in this file.
has_missing = main_df.isin(['?']).any(axis=1)
main_df = main_df[~has_missing]

# remove 'fnlwgt' column
main_df = main_df.drop(['fnlwgt'], axis=1)

main_df.head(100)

### *Preprocessing* dataset

In [None]:
# Disabled alternative preprocessing (kept for reference, not executed):
# integer-encode every column with a LabelEncoder, then split the frame into
# the target `y` ('label' column) and the feature matrix `x` (all other
# columns). The active notebook uses one-hot encoding instead.

# le = preprocessing.LabelEncoder()

# for column in main_df.columns.values:
#     le.fit(main_df[column])
#     main_df[column] = le.transform(main_df[column])

# y = np.array(main_df['label'])
# x = np.array(main_df.drop(['label'], 1))

In [None]:
# One-hot encode the categorical columns. The 'label' column is expanded
# into the two indicator columns named in `label_columns`.
main_df = pd.get_dummies(main_df)

label_columns = ['label_<=50K', 'label_>50K']

# Target vector: the '<=50K' indicator only. np.array's second positional
# parameter is `dtype`, not additional data, so the '>50K' column must not be
# passed there; a single indicator already encodes the binary label.
y = np.array(main_df['label_<=50K'])

# Feature matrix: every column except the two label indicators. `axis` is
# passed by keyword (as in the 'fnlwgt' drop) because pandas 2.x no longer
# accepts it positionally.
x = np.array(main_df.drop(label_columns, axis=1))

main_df.head(100)

## Eksperimen untuk mendapatkan model terbaik

### Naive Bayes

In [None]:
# Gaussian Naive Bayes evaluated with 10-fold cross-validation on (x, y);
# `score` receives one score per fold.
gnb = GaussianNB()
score = cross_val_score(estimator=gnb, X=x, y=y, cv=10)

In [None]:
# Print each fold's score (folds numbered from 1), a blank separator line,
# then the mean and the mean with a +/- two-standard-deviation spread.
for fold_number, fold_score in enumerate(score, start=1):
    print("Fold-%d:" % fold_number, "%0.6f" % fold_score)

print()

mean_score = score.mean()
spread = score.std() * 2
print("Mean: %0.6f" % mean_score)
print("Accuration: %0.6f (+/- %0.6f)" % (mean_score, spread))

### Decision Tree

In [None]:
# Decision tree using the entropy split criterion, evaluated with 10-fold
# cross-validation on (x, y); `score` receives one score per fold.
ID3learn = tree.DecisionTreeClassifier(criterion="entropy")
score = cross_val_score(estimator=ID3learn, X=x, y=y, cv=10)

In [None]:
# Print each fold's score (folds numbered from 1), a blank separator line,
# then the mean and the mean with a +/- two-standard-deviation spread.
for fold_number, fold_score in enumerate(score, start=1):
    print("Fold-%d:" % fold_number, "%0.6f" % fold_score)

print()

mean_score = score.mean()
spread = score.std() * 2
print("Mean: %0.6f" % mean_score)
print("Accuration: %0.6f (+/- %0.6f)" % (mean_score, spread))

### k-Nearest Neighbors

In [None]:
# k-nearest-neighbours with uniformly weighted votes, evaluated with 10-fold
# cross-validation on (x, y). `n_neighbors` stays a module-level name because
# the final-model cell reuses it.
n_neighbors = 61
KNNlearn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform')
score = cross_val_score(estimator=KNNlearn, X=x, y=y, cv=10)

In [None]:
# Print each fold's score (folds numbered from 1), a blank separator line,
# then the mean and the mean with a +/- two-standard-deviation spread.
for fold_number, fold_score in enumerate(score, start=1):
    print("Fold-%d:" % fold_number, "%0.6f" % fold_score)

print()

mean_score = score.mean()
spread = score.std() * 2
print("Mean: %0.6f" % mean_score)
print("Accuration: %0.6f (+/- %0.6f)" % (mean_score, spread))

### Multilayer Perceptron

In [None]:
# Multilayer perceptron (L-BFGS solver, two hidden layers of 5 and 2 units)
# evaluated with 10-fold cross-validation on (x, y).
MLPlearn = MLPClassifier(solver='lbfgs', hidden_layer_sizes=(5, 2))

score = cross_val_score(MLPlearn, x, y, cv=10)

# Per-fold scores, folds numbered from 1.
for fold_number, fold_score in enumerate(score, start=1):
    print("Fold-" + str(fold_number) + ":", "%0.6f" % fold_score)

print()

print("Mean: %0.6f" % score.mean())
# Six decimals here too, so this summary line is directly comparable with the
# Naive Bayes, Decision Tree and k-NN reports (it previously used %0.2f).
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))

### Memilih model terbaik

In [None]:
# Build a fresh k-NN classifier with the same settings as in the experiment
# and train it on the full (x, y); fit() returns the estimator itself, so the
# construction and training can be chained.
KNNlearn = neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights='uniform',
).fit(x, y)

### Menyimpan model

In [None]:
import os

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing the fitted classifier to disk.
os.makedirs('model', exist_ok=True)
joblib.dump(KNNlearn, 'model/KNN.pkl')

### *Loading* model

In [None]:
# Restore the classifier that was persisted by the save cell.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model_path = 'model/KNN.pkl'
KNNlearn = joblib.load(model_path)

## Evaluasi dan prediksi dengan model terpilih

### Membaca test dataset

In [None]:
# Read the test file (no header row, first line skipped), name the columns by
# position and discard 'fnlwgt', mirroring the training-set column handling.
test_column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'label',
]
test_df = (
    pd.read_csv("data/CencusIncome.test.txt", header=None, skiprows=1)
    .rename(columns=dict(enumerate(test_column_names)))
    .drop(['fnlwgt'], axis=1)
)

### *Preprocessing* test dataset

In [None]:
# Disabled alternative preprocessing for the test set (kept for reference,
# not executed): LabelEncoder-based integer encoding followed by the split
# into target `y` and feature matrix `x`.
# NOTE(review): as written, the loop iterates over main_df's columns while
# encoding test_df, and the encoder is re-fit on the test data, so the integer
# codes are presumably not guaranteed to match the training encoding.
# Confirm both points before re-enabling.

# le = preprocessing.LabelEncoder()

# for column in main_df.columns.values:
#     le.fit(test_df[column])
#     test_df[column] = le.transform(test_df[column])

# y = np.array(test_df['label'])
# x = np.array(test_df.drop(['label'],1))

In [None]:
# One-hot encode the test set the same way as the training set.
test_df = pd.get_dummies(test_df)

label_columns = ['label_<=50K', 'label_>50K']

# Target vector: the '<=50K' indicator, matching the training target.
# np.array's second positional parameter is `dtype`, not additional data, so
# the '>50K' column must not be passed there.
y = np.array(test_df['label_<=50K'])

# Feature matrix: line the test columns up with the training features.
# get_dummies only creates columns for categories present in the frame it is
# given, so an independently encoded test set can differ from the training
# matrix in both column set and column order. Reindexing against the one-hot
# training frame (`main_df`, which must already have been preprocessed) adds
# any missing indicator as all-zero, drops indicators the model never saw,
# and restores the training column order.
feature_columns = main_df.columns.drop(label_columns)
x = np.array(test_df.reindex(columns=feature_columns, fill_value=0))

test_df.head()

### Hasil prediksi

In [None]:
# Evaluate the already-trained model on the test set. The model must NOT be
# re-fit here: fitting on (x, y) of the test set and then predicting the same
# rows would report the performance of a model trained on the test data
# rather than that of the saved model.
y_pred = KNNlearn.predict(x)

# Accuracy is the fraction of test rows predicted correctly; computing it from
# y_pred avoids running the (slow) k-NN prediction a second time.
score = float(np.mean(y_pred == y))
print("Accuracy: ", score * 100 ,"%")

print("Confusion Matrix: ")
print(confusion_matrix(y, y_pred))