In [60]:
! pip install pydotplus

In [61]:
%matplotlib inline
from matplotlib import pyplot as plt

plt.rcParams["figure.figsize"] = (10, 8)
import collections
from io import StringIO

import numpy as np
import pandas as pd
import pydotplus  # pip install pydotplus
import seaborn as sns
from ipywidgets import Image
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [62]:
# Create dataframe with dummy variables
def create_df(dic, feature_list):
    """Build a DataFrame from ``dic`` and one-hot encode selected columns.

    Parameters
    ----------
    dic : mapping of column name -> column values
        Passed straight to ``pd.DataFrame``.
    feature_list : list of str
        Columns to replace by their ``pd.get_dummies`` indicator columns.

    Returns
    -------
    pd.DataFrame
        The columns not listed in ``feature_list`` followed by the
        indicator columns.
    """
    frame = pd.DataFrame(dic)
    indicators = pd.get_dummies(frame[feature_list])
    return pd.concat([frame, indicators], axis=1).drop(feature_list, axis=1)


# Some feature values are present in train and absent in test and vice-versa.
def intersect_features(train, test):
    """Restrict both frames to the columns they have in common.

    The shared columns are returned in the order they appear in ``train``.
    Building the list from a set intersection would make the column order
    depend on string hashing, which changes between interpreter runs and
    makes any model fitted on the result non-reproducible.

    Parameters
    ----------
    train, test : pd.DataFrame

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``train`` and ``test`` limited to the shared columns, same order.
    """
    test_columns = set(test.keys())
    common_feat = [col for col in train.keys() if col in test_columns]
    return train[common_feat], test[common_feat]

In [63]:
features = ["Looks", "Alcoholic_beverage", "Eloquence", "Money_spent"]

In [64]:
# Quick look at the integer codes LabelEncoder produces for the "+"/"-" labels.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so "+" should
# come out as 0 and "-" as 1 — confirm against the displayed output.
df_train = {}
df_train["Will_go"] = LabelEncoder().fit_transform(["+", "-", "+", "-", "-", "+", "+"])
df_train["Will_go"]

In [65]:
# Toy training set: four categorical features plus the label-encoded target.
df_train = create_df(
    {
        "Looks": [
            "handsome",
            "handsome",
            "handsome",
            "repulsive",
            "repulsive",
            "repulsive",
            "handsome",
        ],
        "Alcoholic_beverage": ["yes", "yes", "no", "no", "yes", "yes", "yes"],
        "Eloquence": ["high", "low", "average", "average", "low", "high", "average"],
        "Money_spent": ["lots", "little", "lots", "little", "lots", "lots", "lots"],
        "Will_go": LabelEncoder().fit_transform(["+", "-", "+", "-", "-", "+", "+"]),
    },
    features,
)
df_train

In [66]:
# Toy test set: same four categorical features, no target column.
df_test = create_df(
    {
        "Looks": ["handsome", "handsome", "repulsive"],
        "Alcoholic_beverage": ["no", "yes", "yes"],
        "Eloquence": ["average", "high", "average"],
        "Money_spent": ["lots", "little", "lots"],
    },
    features,
)
df_test

In [67]:
# Some feature values are present in train and absent in test and vice-versa.
# Save the target first: "Will_go" is not a column of df_test, so
# intersect_features removes it from df_train.
# NOTE(review): df_train is rebound without "Will_go", so running this cell a
# second time looks like it would fail on the lookup below — confirm.
y = df_train["Will_go"]
df_train, df_test = intersect_features(train=df_train, test=df_test)
df_train

In [68]:
df_test

In [69]:
dt = DecisionTreeClassifier(criterion='entropy',random_state=17)
dt.fit(df_train,y)

In [70]:
# Render the fitted toy tree: export to DOT, then convert to PNG via pydotplus.
dot_data = StringIO()
export_graphviz(
    decision_tree=dt,
    out_file=dot_data,
    filled=True,
    # A plain list is accepted by every scikit-learn version.
    feature_names=list(df_train.columns),
    # class_names must follow ascending class order. LabelEncoder sorts the
    # labels, so "+" (will go) is class 0 and "-" (won't go) is class 1.
    class_names=["Will go", "Won't go"],
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

In [71]:
# 9 ones followed by 11 zeros
balls = [1] * 9 + [0] * 11

In [72]:
balls

In [73]:
# two groups
balls_left = [1] * 8 + [0] * 5  # 8 blue and 5 yellow
balls_right = [1] * 1 + [0] * 6  # 1 blue and 6 yellow

In [74]:
from math import log, log2  # `log` stays importable for any cell that still uses it


def entropy(a_list):
    """Shannon entropy, in bits, of the empirical distribution of ``a_list``.

    Parameters
    ----------
    a_list : iterable of hashable values
        Observed outcomes (class labels, die faces, ...).

    Returns
    -------
    float or int
        ``-sum(p * log2(p))`` over the distinct values, where ``p`` is the
        relative frequency of a value. Returns ``0`` when there are fewer
        than two distinct values (empty or constant input).
    """
    values = list(a_list)
    size = len(values)
    # One pass over the data instead of one list.count() scan per distinct value.
    counts = collections.Counter(values)
    if len(counts) <= 1:
        return 0
    # log2 is more exact than log(x, 2), which divides two rounded logarithms.
    return -sum(occ / size * log2(occ / size) for occ in counts.values())

In [75]:
print(entropy(balls))  # 9 blue and 11 yellow ones
print(entropy(balls_left))  # 8 blue and 5 yellow ones
print(entropy(balls_right))  # 1 blue and 6 yellow ones
print(entropy([1, 2, 3, 4, 5, 6]))  # entropy of a fair 6-sided die

In [76]:
np.arange(1,7)

In [77]:
print(entropy(np.arange(1,7)))

In [78]:
# information gain calculation
def information_gain(root, left, right):
    """Entropy reduction obtained by splitting ``root`` into ``left`` and ``right``.

    Parameters
    ----------
    root : iterable
        Labels before the split.
    left, right : iterable
        Labels of the two partitions of ``root``.

    Returns
    -------
    float
        ``entropy(root)`` minus the entropies of the partitions, each
        weighted by its share of ``len(root)``.
    """
    parent = list(root)
    n_parent = len(parent)

    def weighted_entropy(part):
        part = list(part)
        return len(part) / n_parent * entropy(part)

    return entropy(parent) - weighted_entropy(left) - weighted_entropy(right)

In [79]:
print(information_gain(balls, balls_left, balls_right))

In [80]:
def information_gains(X, y):
    """Information gain of splitting ``y`` on each column of ``X``.

    Each column is treated as a binary indicator: rows where it equals 1
    form one partition and rows where it equals 0 form the other.

    Returns
    -------
    list of float
        One gain per column, in ``X.columns`` order.
    """
    return [
        information_gain(y, y[X[col] == 1], y[X[col] == 0]) for col in X.columns
    ]

In [81]:
information_gains(df_train, y)

In [82]:
def btree(X, y, feature_names):
    """Recursively print a greedy entropy-based tree built on binary features.

    At each step the column with the largest information gain is chosen.
    Rows where it equals 0 go left and rows where it equals 1 go right.
    Any side whose labels are still mixed (non-zero entropy) is split again.

    Parameters
    ----------
    X : pd.DataFrame
        Binary indicator features.
    y : pd.Series
        Labels, indexed like ``X``.
    feature_names : sequence of str
        Names looked up by column position in ``X``.

    Returns
    -------
    None
        The tree is only printed.
    """
    gains = information_gains(X, y)
    best_feat_id = gains.index(max(gains))
    best_feature = feature_names[best_feat_id]
    print(f"Best feature to split: {best_feature}")

    best_column = X.iloc[:, best_feat_id]
    left_mask = best_column == 0
    right_mask = best_column == 1

    x_left = X[left_mask]
    x_right = X[right_mask]
    print(f"Samples: {len(x_left)} (left) and {len(x_right)} (right)")

    y_left = y[left_mask]
    y_right = y[right_mask]
    entropy_left = entropy(y_left)
    entropy_right = entropy(y_right)
    print(f"Entropy: {entropy_left} (left) and {entropy_right} (right)")
    print("_" * 30 + "\n")

    # If even the best feature leaves one side empty, the other side is the
    # unchanged input; recursing on it would never terminate.
    if len(x_left) == 0 or len(x_right) == 0:
        print("No feature separates the remaining samples; stopping.")
        return

    if entropy_left != 0:
        print(f"Splitting the left group with {len(x_left)} samples:")
        btree(x_left, y_left, feature_names)
    if entropy_right != 0:
        print(f"Splitting the right group with {len(x_right)} samples:")
        btree(x_right, y_right, feature_names)

In [83]:
btree(df_train, y, df_train.columns)

In [84]:
# The data is read directly from the mlcourse.ai GitHub repository.
# To save Internet traffic, point DATA_PATH at the data/ folder of a local
# clone of https://github.com/Yorko/mlcourse.ai instead.
DATA_PATH = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/"

In [85]:
data_train = pd.read_csv(DATA_PATH + "adult_train.csv", sep=";")

In [86]:
data_train.tail()

In [87]:
data_test = pd.read_csv(DATA_PATH + "adult_test.csv", sep=";")

In [88]:
data_test.tail()

In [89]:
# necessary to remove rows with incorrect labels in test dataset
# .copy() makes data_test an independent frame, so later column assignments
# do not act on a view of the original (SettingWithCopyWarning).
data_test = data_test[data_test["Target"].isin([" >50K.", " <=50K."])].copy()

# encode target variable as integer
# An explicit map + astype(int) guarantees an integer dtype. Writing 0/1 into
# the string column via .loc can leave it as `object`, which would later be
# treated as a categorical feature.
# NOTE(review): assumes every training row carries one of the two labels; any
# other value becomes NaN and astype(int) raises — confirm.
data_train["Target"] = data_train["Target"].map({" <=50K": 0, " >50K": 1}).astype(int)
data_test["Target"] = data_test["Target"].map({" <=50K.": 0, " >50K.": 1}).astype(int)

In [90]:
data_test.describe(include="all").T

In [91]:
data_train["Target"].value_counts()

In [92]:
data_train.shape

In [93]:
# One subplot per column: bar chart of value counts for non-numeric columns,
# histogram for numeric ones.
fig = plt.figure(figsize=(25, 15))
cols = 5
# Round up so a column count that is not a multiple of `cols` still fits.
rows = -(-data_train.shape[1] // cols)
for i, column in enumerate(data_train.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    # np.object was removed in NumPy 1.24; test the dtype through pandas.
    if pd.api.types.is_numeric_dtype(data_train[column]):
        # pandas plotting takes the target axes as `ax=`, not `axes=`.
        data_train[column].hist(ax=ax)
        ax.tick_params(axis="x", labelrotation=90)
    else:
        data_train[column].value_counts().plot(kind="bar", ax=ax)
plt.subplots_adjust(hspace=0.7, wspace=0.2);

In [94]:
data_train.dtypes

In [95]:
data_test.dtypes

In [96]:
# NOTE(review): Age presumably is not an integer column in the test frame (see
# the dtypes displayed above) — confirm. Cast it explicitly.
data_test["Age"] = data_test["Age"].astype(int)

In [97]:
# Cast the remaining numeric test columns to int in a single call.
data_test = data_test.astype(
    {
        "fnlwgt": int,
        "Education_Num": int,
        "Capital_Gain": int,
        "Capital_Loss": int,
        "Hours_per_week": int,
    }
)

In [98]:
# we see some missing values
data_train.info()

In [99]:
# choose categorical and continuous features from data
# Columns with dtype "object" are categorical; all others count as numerical.

categorical_columns = [
    name for name, dtype in data_train.dtypes.items() if dtype.name == "object"
]
numerical_columns = [
    name for name, dtype in data_train.dtypes.items() if dtype.name != "object"
]

print("categorical_columns:", categorical_columns)
print("numerical_columns:", numerical_columns)

In [100]:
# we see some missing values
data_train.info()

In [101]:
# fill missing data
# Statistics come from the training set only and are reused for the test set.
# Results are assigned back to the column: `df[c].fillna(..., inplace=True)`
# is chained assignment and does not update the frame under pandas
# Copy-on-Write.

for c in categorical_columns:
    train_mode = data_train[c].mode()[0]
    data_train[c] = data_train[c].fillna(train_mode)
    data_test[c] = data_test[c].fillna(train_mode)

for c in numerical_columns:
    train_median = data_train[c].median()
    data_train[c] = data_train[c].fillna(train_median)
    data_test[c] = data_test[c].fillna(train_median)

In [102]:
# no more missing values
data_train.info()

In [103]:
# Replace the categorical columns by one-hot indicator columns, keeping the
# numerical columns first.
# Train and test are encoded independently, so a category present in only one
# of them yields a column the other frame lacks (checked in the cells below).
data_train = pd.concat(
    [data_train[numerical_columns], pd.get_dummies(data_train[categorical_columns])],
    axis=1,
)

data_test = pd.concat(
    [data_test[numerical_columns], pd.get_dummies(data_test[categorical_columns])],
    axis=1,
)

In [104]:
data_train

In [105]:
set(data_train.columns) - set(data_test.columns)

In [106]:
data_train.shape, data_test.shape

In [107]:
# Add the indicator column that the test frame lacks, filled with zeros.
# The new column is appended at the end of data_test, so its column order
# differs from data_train.
data_test["Country_ Holand-Netherlands"] = 0

In [108]:
set(data_train.columns) - set(data_test.columns)

In [109]:
data_train.head(2)

In [110]:
data_test.head(2)

In [111]:
X_train = data_train.drop(["Target"], axis=1)
y_train = data_train["Target"]

# Give X_test exactly the training column layout (same columns, same order).
# A model fitted on X_train rejects differently ordered feature names in
# recent scikit-learn and silently misreads them in older versions.
# Indicator columns missing from the test set are filled with 0.
X_test = data_test.drop(["Target"], axis=1).reindex(
    columns=X_train.columns, fill_value=0
)
y_test = data_test["Target"]

In [112]:
tree = DecisionTreeClassifier(max_depth=3, random_state=17 , min_samples_leaf=5)
tree.fit(X_train, y_train)

In [113]:
tree_predictions = tree.predict(X_test[X_train.columns])

In [114]:
accuracy_score(y_test, tree_predictions)

In [116]:
%%time

# 5-fold grid search over tree depth and minimum leaf size.
tree_params = {"max_depth": range(2, 11), "min_samples_leaf": np.arange(2, 11)}

locally_best_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=17),
    param_grid=tree_params,
    cv=5,
)

locally_best_tree.fit(X_train, y_train)

In [117]:
# Report the winning hyper-parameters and their mean cross-validation score.
print("Best params:", locally_best_tree.best_params_)
print("Best cross validation score", locally_best_tree.best_score_)

In [119]:
# NOTE(review): max_depth / min_samples_leaf are presumably the best values
# found by the grid search above — confirm against locally_best_tree.best_params_.
tuned_tree = DecisionTreeClassifier(max_depth=10, random_state=17, min_samples_leaf=10)
tuned_tree.fit(X_train, y_train)
# Align the test columns with the training layout, as the first tree's
# prediction cell does.
tuned_tree_predictions = tuned_tree.predict(X_test[X_train.columns])
accuracy_score(y_test, tuned_tree_predictions)

In [120]:
rf = RandomForestClassifier(n_estimators=100, random_state=17)
rf.fit(X_train, y_train)

In [122]:
%%time
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf,X_train, y_train,cv=3)

In [123]:
cv_scores, cv_scores.mean()

In [124]:
# Align the test columns with the training layout before predicting.
forest_predictions = rf.predict(X_test[X_train.columns])

In [125]:
accuracy_score(y_test, forest_predictions)

In [126]:
# 3-fold grid search over forest depth and the number of features per split.
forest_params = {"max_depth": range(10, 16), "max_features": range(5, 105, 20)}

locally_best_forest = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, random_state=17, n_jobs=-1),
    param_grid=forest_params,
    cv=3,
    verbose=1,
)

locally_best_forest.fit(X_train, y_train)

In [127]:
# Report the winning hyper-parameters and their mean cross-validation score.
print("Best params:", locally_best_forest.best_params_)
print("Best cross validation score", locally_best_forest.best_score_)

In [128]:
# Align the test columns with the training layout before predicting.
tuned_forest_predictions = locally_best_forest.predict(X_test[X_train.columns])
accuracy_score(y_test, tuned_forest_predictions)