# Tugas Besar 2 Intelijensi Buatan

## Prediksi Income per Tahun

#### Anggota Kelompok:
- Devin Alvaro / 13515062
- Stevanno Hero Leadervand / 13515082
- Rizki Ihza / 13515104
- Gianfranco Fertino Hwandiano / 13515118

In [None]:
import pandas as pd
import numpy as np

from sklearn import preprocessing, neighbors, tree
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

%matplotlib inline

### Membaca dataset

In [None]:
# Load the raw training data. The file has no header row, so pandas labels
# the columns by position (0..14).
train_df = pd.read_csv("data/CencusIncome.data.txt", header=None)

# Give every positional column its attribute name, then discard 'fnlwgt',
# which this notebook does not use as a feature.
train_df = train_df.rename(
    columns=dict(enumerate([
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label',
    ]))
).drop('fnlwgt', axis=1)

# Preview the first rows (rendered as the cell output).
train_df.head()

### *Preprocessing* dataset

In [None]:
# Fit an encoder on the training labels, then overwrite the 'label' column
# with the resulting integer class codes.
le = preprocessing.LabelEncoder().fit(train_df['label'])
train_df['label'] = le.transform(train_df['label'])

In [None]:
# One-hot encode the categorical columns; numeric columns (including the
# already-encoded 'label') are kept as they are.
train_df = pd.get_dummies(train_df)

# Split into target vector and feature matrix. `axis` is passed by keyword,
# like the 'fnlwgt' drop earlier in this notebook: the positional form
# drop(labels, 1) is rejected by pandas >= 2.0.
y = np.array(train_df['label'])
x = np.array(train_df.drop(['label'], axis=1))

# Preview the encoded frame (rendered as the cell output).
train_df.head()

## Eksperimen untuk mendapatkan model terbaik

### Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, evaluated with 10-fold
# cross-validation on the training features and labels.
gnb = GaussianNB()

score = cross_val_score(estimator=gnb, X=x, y=y, cv=10)

In [None]:
# Report the score of every fold. Iterating over `score` itself (instead of
# indexing with a hard-coded range(10)) keeps the report correct if the
# number of cross-validation folds is changed.
for fold, fold_score in enumerate(score, start=1):
    print("Fold-%d: %0.6f" % (fold, fold_score))

print()

# Summary: the mean score, then the mean with a +/- two-standard-deviation
# spread across folds.
print("Mean: %0.6f" % score.mean())
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))

### Decision Tree

In [None]:
# Decision tree using the entropy split criterion, evaluated with 10-fold
# cross-validation on the training features and labels.
ID3learn = tree.DecisionTreeClassifier(criterion="entropy")

score = cross_val_score(estimator=ID3learn, X=x, y=y, cv=10)

In [None]:
# Report the score of every fold. Iterating over `score` itself (instead of
# indexing with a hard-coded range(10)) keeps the report correct if the
# number of cross-validation folds is changed.
for fold, fold_score in enumerate(score, start=1):
    print("Fold-%d: %0.6f" % (fold, fold_score))

print()

# Summary: the mean score, then the mean with a +/- two-standard-deviation
# spread across folds.
print("Mean: %0.6f" % score.mean())
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))

### k-Nearest Neighbors

In [None]:
# Neighbourhood size; the same value is reused later when the selected model
# is refit on the full training set.
n_neighbors = 61

# k-NN classifier with uniform weights, evaluated with 10-fold
# cross-validation on the training features and labels.
KNNlearn = neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights='uniform',
)

score = cross_val_score(estimator=KNNlearn, X=x, y=y, cv=10)

In [None]:
# Report the score of every fold. Iterating over `score` itself (instead of
# indexing with a hard-coded range(10)) keeps the report correct if the
# number of cross-validation folds is changed.
for fold, fold_score in enumerate(score, start=1):
    print("Fold-%d: %0.6f" % (fold, fold_score))

print()

# Summary: the mean score, then the mean with a +/- two-standard-deviation
# spread across folds.
print("Mean: %0.6f" % score.mean())
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))

### Multilayer Perceptron

In [None]:
# Multilayer perceptron with two hidden layers (5 and 2 units) trained with
# the L-BFGS solver, evaluated with 10-fold cross-validation.
# NOTE(review): no random_state is passed, so scores may differ between
# runs — confirm whether reproducibility is required.
MLPlearn = MLPClassifier(hidden_layer_sizes=(5, 2), solver='lbfgs')

score = cross_val_score(estimator=MLPlearn, X=x, y=y, cv=10)

In [None]:
# Report the score of every fold. Iterating over `score` itself (instead of
# indexing with a hard-coded range(10)) keeps the report correct if the
# number of cross-validation folds is changed.
for fold, fold_score in enumerate(score, start=1):
    print("Fold-%d: %0.6f" % (fold, fold_score))

print()

# Summary: the mean score, then the mean with a +/- two-standard-deviation
# spread across folds. The mean is printed with six decimals so this report
# is directly comparable with the other classifiers' reports.
print("Mean: %0.6f" % score.mean())
print("Accuration: %0.6f (+/- %0.6f)" % (score.mean(), score.std() * 2))

## Memilih model terbaik

Setelah dilakukan percobaan *learning* dengan beberapa algoritma, yaitu:

- Naive Bayes
- Decision Tree Learning
- k-Nearest Neighbors
- Multilayer Perceptron

didapat bahwa model yang memiliki akurasi tertinggi untuk dataset ini adalah
model k-Nearest Neighbors. Oleh karena itu, dipilih model k-Nearest Neighbors
untuk digunakan dalam perhitungan selanjutnya.

In [None]:
# Rebuild the selected k-NN configuration and train it on the complete
# training set (cross-validation above only scored it on folds).
KNNlearn = neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbors,
    weights='uniform',
)
KNNlearn.fit(X=x, y=y)

### Menyimpan model

In [None]:
# Persist the trained classifier twice: once under this notebook's model
# directory and once under the web application's model directory.
joblib.dump(value=KNNlearn, filename='model/best.pkl')
joblib.dump(value=KNNlearn, filename='../webapp/model/best.pkl')

### *Loading* model

In [None]:
# Reload the persisted classifier from disk, replacing the in-memory one.
# NOTE(review): joblib files are pickle-based — only load files you trust.
KNNlearn = joblib.load(filename='model/best.pkl')

## Evaluasi dan prediksi dengan model terpilih

### Membaca test dataset

In [None]:
# Load the raw test data. The file has no header row, so pandas labels the
# columns by position (0..14). The first line of the file is skipped —
# presumably a non-data line; confirm against the file.
test_df = pd.read_csv("data/CencusIncome.test.txt", header=None, skiprows=1)

# Give every positional column its attribute name, then discard 'fnlwgt',
# which this notebook does not use as a feature.
test_df = test_df.rename(
    columns=dict(enumerate([
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'label',
    ]))
).drop('fnlwgt', axis=1)

# Preview the first ten rows (rendered as the cell output).
test_df.head(10)

### *Preprocessing* test dataset

In [None]:
# Encode the test labels as integer class codes.
# NOTE(review): a fresh encoder is fit on the test labels here, so the codes
# agree with the training codes only if both label sets encode in the same
# order — confirm against the data.
le = preprocessing.LabelEncoder().fit(test_df['label'])
test_df['label'] = le.transform(test_df['label'])

In [None]:
# One-hot encode the categorical columns of the test set.
test_df = pd.get_dummies(test_df)

# Align the test columns with the encoded training frame. The model was
# fitted on a plain array, so features are matched purely by position:
# the test matrix must have the same columns in the same order as training.
# reindex() adds categories that never occur in the test set as all-zero
# columns *in their training position* (appending them at the end would
# shift every later feature), and drops categories unseen during training.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

# Split into target vector and feature matrix; `axis` is passed by keyword
# because the positional form drop(labels, 1) is rejected by pandas >= 2.0.
y = np.array(test_df['label'])
x = np.array(test_df.drop(['label'], axis=1))

# Sanity check: the row counts must match, and the feature count must equal
# the one used for training.
print(np.shape(y))
print(np.shape(x))

# Preview the first ten encoded rows (rendered as the cell output).
test_df.head(10)

### Hasil prediksi

In [None]:
# Predict once and reuse the result for both metrics. Calling
# KNNlearn.score(x, y) as well would run the whole k-NN prediction over the
# test set a second time.
y_pred = KNNlearn.predict(x)

# Accuracy is the fraction of test samples whose prediction matches the label.
score = np.mean(y_pred == y)
print("Accuracy: ", score * 100 ,"%")

print("Confusion Matrix: ")
print(confusion_matrix(y, y_pred))