In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory (recursing into sub-directories), then print one path per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the labelled training table and store its 'income>50K' target column
# as unsigned 8-bit integers.
train = (
    pd.read_csv('/kaggle/input/ilp2021f/train_final.csv')
    .astype({'income>50K': np.uint8})
)
# Read the table that the models below produce predictions for.
test = pd.read_csv('/kaggle/input/ilp2021f/test_final.csv')

In [None]:
# Integer-encode every non-numeric (object dtype) column of train and test.
#
# A single LabelEncoder per column is fitted on the values of BOTH frames, so
# a given category always maps to the same integer in train and in test.
# Fitting one encoder per frame would silently assign different codes to the
# same category whenever the two frames do not contain exactly the same set
# of values, and the models would then be scored on mis-encoded features.
object_columns = list(dict.fromkeys(
    list(train.select_dtypes(object).columns)
    + list(test.select_dtypes(object).columns)
))
for non_numeric in object_columns:
    # Only touch frames that actually hold this column as strings/objects.
    frames = [
        frame for frame in (train, test)
        if non_numeric in frame.columns and frame[non_numeric].dtype == object
    ]
    le = preprocessing.LabelEncoder()
    le.fit(pd.concat([frame[non_numeric] for frame in frames]).values)
    for frame in frames:
        frame[non_numeric] = le.transform(frame[non_numeric].values)

# Feature matrix: every training column except the target.
trainX = train.drop(columns=['income>50K'])
# Target kept as a one-column DataFrame; callers flatten it with .values.ravel().
trainY = train[['income>50K']]

# Remove the row identifier from the test features. Reassigning with
# errors='ignore' (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError because 'ID' is already gone.
test = test.drop(columns=['ID'], errors='ignore')

In [None]:
# Depth-limited decision tree (gini impurity) fitted on all training rows.
# The target is flattened to 1-D, matching how the other model cells pass it.
decision_tree = tree.DecisionTreeClassifier(criterion='gini', max_depth=5)
decision_tree = decision_tree.fit(trainX, trainY.values.ravel())

# Draw the fitted tree.
# - feature_names must describe the columns the model was fitted on (trainX);
#   train.columns also contains the target column, which is not a feature.
# - label accepts 'all' / 'root' / 'none'; the previous value True is not one
#   of them and is rejected by scikit-learn's parameter validation.
plt.figure(figsize=(30,10))
tree.plot_tree(decision_tree, feature_names=list(trainX.columns), filled=True, label='all', fontsize=10)
plt.show()

predictions = decision_tree.predict(test)

# One row per test record; IDs are 1-based positions in the test frame.
submission = pd.DataFrame({'ID': range(1,test.shape[0]+1), 'Prediction': predictions})
print(submission.info())
# NOTE(review): the file name says "entropy" but the tree above uses
# criterion='gini'; name kept unchanged so existing outputs are not renamed.
submission.to_csv('dt_entropy.csv', index=False)

In [None]:
# Random forest of 10 trees fitted on all training rows.
# random_state is pinned so that a fresh "Restart & Run All" produces the same
# forest and therefore the same submission file; without it every run differs.
random_forest = RandomForestClassifier(n_estimators=10, random_state=0)
random_forest = random_forest.fit(trainX, trainY.values.ravel())

predictions = random_forest.predict(test)

# One row per test record; IDs are 1-based positions in the test frame.
submission = pd.DataFrame({'ID': range(1,test.shape[0]+1), 'Prediction': predictions})
print(submission.info())
submission.to_csv('random_forest.csv', index=False)

In [None]:
# Gaussian naive Bayes fitted on all training rows; the one-column target
# frame is flattened to a 1-D array before fitting.
nbg = GaussianNB()
flat_target = trainY.to_numpy().ravel()
nbg.fit(trainX, flat_target)

predictions = nbg.predict(test)

# Assemble the submission table: 1-based row IDs next to the predicted labels.
test_ids = range(1, len(test) + 1)
submission = pd.DataFrame({'ID': test_ids, 'Prediction': predictions})
print(submission.info())
submission.to_csv('naive_bayes.csv', index=False)

In [None]:
# Split the labelled data into a fitting part and a held-out part.
# The seed is fixed so the split is the same on every run.
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(trainX, trainY, test_size=test_size, random_state=seed)

# XGBoost classifier with default parameters, fitted on the training part only.
xgb = XGBClassifier()
xgb = xgb.fit(X_train, Y_train)

# Actually use the held-out part: report its accuracy so the split serves a
# purpose and the model can be compared with the others before submitting.
print('holdout accuracy:', xgb.score(X_test, Y_test.values.ravel()))

predictions = xgb.predict(test)

# Derive the ID range from the test frame instead of hard-coding its length
# (range(1, 23843)): a hard-coded length raises ValueError as soon as the test
# file has a different number of rows, and the other model cells already
# compute it this way.
submission = pd.DataFrame({'ID': range(1, test.shape[0] + 1), 'Prediction': predictions})
print(submission.info())
submission.to_csv('xgb_naive.csv', index=False)

In [None]:
# Split the labelled data into a fitting part and a held-out part.
# The seed is fixed so the split is the same on every run.
seed = 7
test_size = 0.33
X_train, X_test, Y_train, Y_test = train_test_split(trainX, trainY, test_size=test_size, random_state=seed)

# Gradient boosting with 100 depth-1 trees (stumps) and learning rate 1.0,
# fitted on the training part of the split.
grad_boost = GradientBoostingClassifier(n_estimators=100, 
                                 learning_rate=1.0, 
                                 max_depth=1, 
                                 random_state=0).fit(X_train, Y_train.values.ravel())
# A bare expression is only displayed when it is the LAST line of a cell, so
# the score computed here used to be thrown away; print it explicitly.
print('holdout accuracy:', grad_boost.score(X_test, Y_test.values.ravel()))

predictions = grad_boost.predict(test)

# Derive the ID range from the test frame instead of hard-coding its length
# (range(1, 23843)): a hard-coded length raises ValueError as soon as the test
# file has a different number of rows, and the other model cells already
# compute it this way.
submission = pd.DataFrame({'ID': range(1, test.shape[0] + 1), 'Prediction': predictions})
print(submission.info())
submission.to_csv('gradient_boost.csv', index=False)