In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

# Underfitting and overfitting
## Decision tree classifier

In [None]:
# Load the Titanic passenger table (path is relative to the notebook's folder).
titanic = pd.read_csv('../data/titanic.csv')

# Prepare data: drop the 'Name' column and encode 'Sex' as an integer.
titanic = titanic.drop(columns=['Name'])
is_F = (titanic['Sex'] == 'female')  # boolean Series, True where Sex is 'female'
titanic['Sex'] = is_F.astype(int)  # 1 = female, 0 = male

# Split into train and test sets.
# random_state pins the shuffle so the same rows land in each set on every run;
# with the default (None) the draw comes from NumPy's global random state and the
# scores computed further down would change from run to run.
train = titanic.sample(frac=0.8, random_state=1234)  # 80% of rows for training
test = titanic.drop(index=train.index)  # remaining rows for testing

# Split each set into features X and the target column 'Survived'.
y_train = train['Survived']
X_train = train.drop(columns=['Survived'])
print(X_train.shape, y_train.shape)

y_test = test['Survived']
X_test = test.drop(columns=['Survived'])
print(X_test.shape, y_test.shape)

In [None]:
from sklearn import tree

# Exercise placeholder: max_depth is intentionally left blank for both trees.
# The variable names and the 'max_depth 2' / 'max_depth 20' labels printed
# further down suggest 2 and 20 -- TODO confirm against the exercise text.
T2 = tree.DecisionTreeClassifier(max_depth=      )
T20 = tree.DecisionTreeClassifier(max_depth=      )

# Fit both classifiers on the same training features and labels.
T2.fit(X_train, y_train)
T20.fit(X_train, y_train)

In [None]:
# Draw the fitted tree T2 on a 10x10 inch canvas.
fig, ax = plt.subplots(1, figsize=(10, 10))
# feature_names is documented as a list of str, and some scikit-learn releases
# reject a pandas Index during parameter validation, so convert it explicitly.
# ax=ax pins the drawing to the axes created above instead of relying on
# matplotlib's "current axes" state.
# The return value is bound to p so it is not echoed as the cell's output.
p = tree.plot_tree(T2,
                   filled=True,
                   feature_names=list(X_train.columns),
                   ax=ax)

In [None]:
# Draw the fitted tree T20 on a 10x10 inch canvas.
fig, ax = plt.subplots(1, figsize=(10, 10))
# feature_names is documented as a list of str, and some scikit-learn releases
# reject a pandas Index during parameter validation, so convert it explicitly.
# ax=ax pins the drawing to the axes created above instead of relying on
# matplotlib's "current axes" state.
# The return value is bound to p so it is not echoed as the cell's output.
p = tree.plot_tree(T20,
                   filled=True,
                   feature_names=list(X_train.columns),
                   ax=ax)

In [None]:
# Exercise placeholder: the arguments of every score() call are intentionally blank.
# Each call takes two arguments; going by the printed labels, the 'Train score'
# lines presumably want the training features/labels and the 'Test score' lines
# the held-out ones -- TODO confirm against the exercise text.
print('max_depth 2')
print('Train score:',T2.score(    ,   ))
print('Test score:',T2.score(   ,  ))
print()  # blank line between the two models' reports
print('max_depth 20')
print('Train score:',T20.score(   ,  ))
print('Test score:',T20.score(   ,   ))

In [None]:
# Sweep tree depth from 1 to 30, recording one train score and one test score per depth.
depths = range(1, 31)
train_scores = []
test_scores = []

# Exercise placeholder: max_depth and the fit()/score() arguments are intentionally
# blank. Presumably max_depth is the loop variable and the two score() calls use the
# train and test splits respectively, matching the list each result is appended to
# -- TODO confirm against the exercise text.
for depth in depths:
    T = tree.DecisionTreeClassifier(max_depth=      , criterion='gini')
    T.fit(   ,   )
    train_scores.append(T.score(   ,   ))
    test_scores.append(T.score(   ,   ))

# Score versus tree depth: one scatter series per split, drawn on the same axes.
fig, ax = plt.subplots(1)
for split_label, split_scores in (('train', train_scores), ('test', test_scores)):
    sns.scatterplot(x=depths, y=split_scores, label=split_label, ax=ax)
ax.set_xlabel('Depth of tree')
ax.set_ylabel('Accuracy')

## Polynomial regression

In [None]:
# Seed NumPy's global generator so the synthetic sample is identical on every run.
np.random.seed(1234)

# Ground truth is the straight line Y = a*X + b with slope and intercept both 1.
a = 1
b = 1

n_points = 100

# Inputs are drawn with rand (uniform on [0, 1)); targets are the line plus
# randn (standard normal) noise scaled by 0.2.
X = np.random.rand(n_points)
noise = 0.2 * np.random.randn(n_points)
Y = a * X + b + noise

In [None]:
fig, ax = plt.subplots(1)

# Reference line through (0, 1) and (1, 2), followed by the sampled points.
ax.plot([0, 1], [1, 2], color="black", label="true model")
ax.scatter(X, Y, label="data")
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.legend()

In [None]:
# Collect the sample into a two-column frame and hold out 20% of the rows for testing.
df = pd.DataFrame(data={'Y': Y, 'X': X})
# An explicit random_state makes the split independent of how much of NumPy's global
# random stream has been consumed so far, so re-running this cell picks the same rows.
train = df.sample(frac=0.8, random_state=1234)  # 80% of rows for training
test = df.drop(index=train.index)  # rest of rows for testing
print(train.shape, test.shape)

In [None]:
# Separate the target column from the remaining column(s) for each split,
# printing the resulting shapes as a quick sanity check.
target = 'Y'

X_train, y_train = train.drop(columns=target), train[target]
print(X_train.shape, y_train.shape)

X_test, y_test = test.drop(columns=target), test[target]
print(X_test.shape, y_test.shape)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    """Build an unfitted pipeline: polynomial feature expansion, then linear regression.

    Parameters
    ----------
    degree : default 2
        Passed to ``PolynomialFeatures`` as its first positional argument.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    The object returned by ``make_pipeline`` for the two steps, in that order.
    """
    expand_features = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expand_features, regressor)

In [None]:
# Exercise placeholder: the PolynomialRegression arguments are intentionally blank.
# The variable names and the 'degree 1 (linear)' / 'degree 20' labels printed
# further down suggest degrees 1 and 20 -- TODO confirm against the exercise text.
# Both pipelines are fitted on the same training split.
lr1 = PolynomialRegression(      ).fit(X_train, y_train)
lr20 = PolynomialRegression(      ).fit(X_train, y_train)

In [None]:
# 1000 evenly spaced inputs from 0.01 to 1, in a frame with a single column 'X',
# used as the x-values at which both fitted models are evaluated.
prediction_inputs = pd.DataFrame(data={'X': np.linspace(0.01, 1, 1000)})

fig, ax = plt.subplots(1)

# Exercise placeholder: the method name after `lr1.` / `lr20.` is intentionally blank.
# It must map prediction_inputs to y-values for the curve; presumably the pipeline's
# predict method -- TODO confirm against the exercise text.
ax.plot(prediction_inputs, lr1.      (prediction_inputs), color = "red", label = "linear")
ax.plot(prediction_inputs, lr20.      (prediction_inputs), color = "green", label = "degree 20")

# Overlay both data splits, using a star marker for the training points.
ax.scatter(X_train, y_train, marker='*', label = "train data")
ax.scatter(X_test, y_test, label = "test data")

ax.set(xlabel='X', ylabel='Y')
plt.legend()

In [None]:
# Exercise placeholder: the arguments of every score() call are intentionally blank.
# Each call takes two arguments; going by the printed labels, the 'Train score'
# lines presumably want the training features/targets and the 'Test score' lines
# the held-out ones -- TODO confirm against the exercise text.
print('degree 1 (linear)')
print('Train score:', lr1.score(  ,    ))
print('Test score:', lr1.score(   ,   ))
print()  # blank line between the two models' reports
print('degree 20')
print('Train score:', lr20.score(   ,   ))
print('Test score:', lr20.score(   ,   ))


In [None]:
# Sweep the polynomial degree from 1 to 30, recording one train score and one
# test score per degree.
degrees = range(1, 31)
train_scores = []
test_scores = []

# Exercise placeholder: the PolynomialRegression, fit() and score() arguments are
# intentionally blank. Presumably the degree is the loop variable and the two
# score() calls use the train and test splits respectively, matching the list each
# result is appended to -- TODO confirm against the exercise text.
for degree in degrees:
    lr = PolynomialRegression(      ).fit(   ,   )
    train_scores.append(lr.score(  ,    ))
    test_scores.append(lr.score(   ,   ))

# Score versus polynomial degree: one scatter series per split, on the same axes.
fig, ax = plt.subplots(1)
for split_label, split_scores in (('train', train_scores), ('test', test_scores)):
    sns.scatterplot(x=degrees, y=split_scores, label=split_label, ax=ax)
ax.set_xlabel('Degree of polynomial regression')
ax.set_ylabel('$R^2$ score')