In [3]:
from dataclasses import dataclass, field
import h1st
import pandas as pd
import plotly.express as px
from sklearn import datasets

## Binary Classifier (setosa vs non-setosa) with 2 features (sepal_width, sepal_length)

In [51]:
# Load iris as one DataFrame (four measurements plus the target) and give the
# five columns short snake_case names; the target column is renamed 'species'.
iris_bunch = datasets.load_iris(as_frame=True)
df_raw = iris_bunch.frame
df_raw.columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'species',
]

In [49]:
# Sanity check: (rows, columns) of the loaded frame.
df_raw.shape

(150, 5)

In [53]:
# Number of samples per species code before the label is relabelled below.
df_raw['species'].value_counts()

0    50
1    50
2    50
Name: species, dtype: int64

In [58]:
# Collapse the three iris classes into a binary target using the same integer
# coding that RuleModel.predict emits: 0 = setosa, 1 = non-setosa.
# Integer labels keep y_train / y_test directly comparable with the teacher's
# predictions in sklearn.metrics and usable as numeric marker colours, and the
# mapping is idempotent, so re-running this cell does not corrupt the labels.
df_raw['species'] = (df_raw['species'] != 0).astype(int)

In [89]:
import plotly.graph_objects as go


# Bounds of the rectangle drawn on the scatter plots (notebook-level variables
# that later plotting cells read as well).
sepal_length_min, sepal_length_max = 4, 6.15
sepal_width_min, sepal_width_max = 2.95, 4.6

# All samples in sepal space, coloured by the species label.
fig = px.scatter(df_raw, x="sepal_width", y="sepal_length", color='species')

# Outline the candidate rule region.
fig.add_shape(
    type="rect",
    line=dict(color="LightSeaGreen"),
    x0=sepal_width_min,
    x1=sepal_width_max,
    y0=sepal_length_min,
    y1=sepal_length_max,
)

# Text-only trace that labels the rectangle.
fig.add_trace(go.Scatter(mode="text", text=["boolean_model"], x=[3.8], y=[6.35]))

fig.show()

In [87]:
# Randomly split the data into training and test sets.
# The shuffle is stored under a new name instead of overwriting df_raw, so
# re-running this cell always reproduces the same split.
example_test_data_ratio = 0.4
df_shuffled = df_raw.sample(frac=1, random_state=7).reset_index(drop=True)
n = df_shuffled.shape[0]
n_test = int(n * example_test_data_ratio)
# The first n_test shuffled rows form the test set; the remainder is training data.
test_data = df_shuffled.iloc[:n_test, :].reset_index(drop=True)
training_data = df_shuffled.iloc[n_test:, :].reset_index(drop=True)
# Every row must land in exactly one of the two splits.
assert len(training_data) + len(test_data) == n
print('training_data.shape:', training_data.shape, 'test_data.shape:', test_data.shape)

training_data.shape: (90, 5) test_data.shape: (60, 5)


In [107]:
# Scatter of the training split in sepal space, coloured by the species label.
fig = px.scatter(training_data, x="sepal_width", y="sepal_length", 
                 color='species', title='original training data')
# No rule-box overlay is drawn on this figure.
fig.show()

In [11]:
# Test split in sepal space with the rule rectangle and its label overlaid.
fig = px.scatter(
    test_data,
    x="sepal_width",
    y="sepal_length",
    color='species',
    title='test data',
)
fig.add_shape(
    type="rect",
    line=dict(color="LightSeaGreen"),
    x0=sepal_width_min,
    x1=sepal_width_max,
    y0=sepal_length_min,
    y1=sepal_length_max,
)
fig.add_trace(go.Scatter(mode="text", text=["boolean_model"], x=[3.8], y=[6.35]))
fig.show()

In [12]:
# Two-feature inputs and species labels for the training and test splits.
X_train, y_train = training_data[['sepal_length', 'sepal_width']], training_data['species']
X_test, y_test = test_data[['sepal_length', 'sepal_width']], test_data['species']

In [65]:
@dataclass
class RuleModel:
    """Rule-based "teacher" classifier: an axis-aligned box in sepal space.

    A sample whose sepal length and sepal width both lie inside the closed
    intervals defined by the four bounds is predicted as class 0 (setosa);
    every other sample is predicted as class 1 (non-setosa).
    """
    # Inclusive bounds of the box.
    sepal_length_max: float = 6.0
    sepal_length_min: float = 4.0
    sepal_width_min: float = 3.0
    sepal_width_max: float = 4.6

    def predict(self, df):
        """Classify every row of ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Must provide 'sepal_length' and 'sepal_width' columns.

        Returns
        -------
        pandas.Series
            Integer predictions (0 or 1) named 'prediction'. The Series carries
            the index of ``df`` so it aligns with the input in operations such
            as ``pd.concat([df, pred], axis=1)`` even when ``df`` does not have
            a default RangeIndex.
        """
        labels = [
            self.predict_setosa(length, width)
            for length, width in zip(df['sepal_length'], df['sepal_width'])
        ]
        return pd.Series(labels, index=df.index, name='prediction', dtype='int64')

    def predict_setosa(self, sepal_length, sepal_width):
        """Return 0 if the point lies inside the box (bounds inclusive), else 1."""
        length_in_range = self.sepal_length_min <= sepal_length <= self.sepal_length_max
        width_in_range = self.sepal_width_min <= sepal_width <= self.sepal_width_max
        # Logical `and` (not bitwise `&`) is the right operator for two plain booleans.
        return 0 if (length_in_range and width_in_range) else 1

In [66]:
# Teacher model using the default box bounds declared on RuleModel.
rule_model = RuleModel()

In [67]:
# Teacher predictions for the test features.
y_pred_teacher = rule_model.predict(X_test)

In [68]:
from sklearn import metrics

# Micro-averaged F1 of the teacher's predictions against the test labels.
{'micro_f1_score': metrics.f1_score(y_test, y_pred_teacher, average='micro')}

{'micro_f1_score': 0.85}

In [69]:
# Confusion matrix of the teacher on the test set (first argument: true labels).
metrics.confusion_matrix(y_test, y_pred_teacher)

array([[17,  2],
       [ 7, 34]])

## Build Student Model with Good Teacher

In [70]:
from sklearn.linear_model import LogisticRegression

### Train with Original Training Data

# Baseline: fit the student directly on the original training labels.
X_train = training_data[['sepal_length', 'sepal_width']]
y_train = training_data['species']

student = LogisticRegression(random_state=0)
student.fit(X_train, y_train)

# Baseline student's predictions for the test features.
y_pred_student = student.predict(X_test)

# Micro-averaged F1 of the baseline student on the test set.
metrics.f1_score(y_test, y_pred_student, average='micro')

# Confusion matrix of the baseline student on the test set.
metrics.confusion_matrix(y_test, y_pred_student)

### Train with Training Data generated by Teacher 

In [92]:
# Label the training features with the teacher's predictions.
X_train = training_data[['sepal_length', 'sepal_width']]
y_train_teacher = rule_model.predict(X_train)

In [93]:
# Macro-averaged F1 of the teacher's labels against the original training labels.
metrics.f1_score(y_train, y_train_teacher, average='macro')

0.9757543103448276

In [94]:
# Confusion matrix of the teacher's labels on the training set (first argument: true labels).
metrics.confusion_matrix(y_train, y_train_teacher)

array([[31,  0],
       [ 2, 57]])

In [95]:
# Training features side by side with the labels the teacher assigned to them.
df_training_data_from_teacher = pd.concat([X_train, y_train_teacher], axis=1)
fig = px.scatter(
    df_training_data_from_teacher,
    x="sepal_width",
    y="sepal_length",
    color='prediction',
    title='training_data_from_teacher')
# Draw the box from the teacher's own thresholds rather than from the
# notebook-level sepal_* plotting variables, which can hold different values
# (they are reassigned by several plotting cells), so the rectangle always
# matches the rule that produced the labels shown.
fig.add_shape(type="rect",
    x0=rule_model.sepal_width_min, y0=rule_model.sepal_length_min,
    x1=rule_model.sepal_width_max, y1=rule_model.sepal_length_max,
    line=dict(color="LightSeaGreen"),
)
# Text-only trace that labels the rectangle.
fig.add_trace(go.Scatter(
    x=[3.8],
    y=[6.35],
    text=["boolean_model"],
    mode="text",
))
fig.show()

In [75]:
# Fit a fresh student on the teacher-generated training labels.
student = LogisticRegression(random_state=0)
student.fit(X_train, y_train_teacher)

LogisticRegression(random_state=0)

In [76]:
# Student predictions for the test features.
y_pred_student = student.predict(X_test)

In [77]:
# Micro-averaged F1 of the student against the test labels.
metrics.f1_score(y_test, y_pred_student, average='micro')

1.0

In [78]:
# Confusion matrix of the student on the test set (first argument: true labels).
metrics.confusion_matrix(y_test, y_pred_student)

array([[19,  0],
       [ 0, 41]])

In [79]:
# Repeats the student's test prediction and both metric calls in one cell.
# Only the final expression (the confusion matrix) is displayed as cell output.
y_pred_student = student.predict(X_test)

metrics.f1_score(y_test, y_pred_student, average='micro')

metrics.confusion_matrix(y_test, y_pred_student)

array([[19,  0],
       [ 0, 41]])

In [85]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Three scatter traces over the same test points; they differ only in the
# values used for the marker colour.
# trace1: coloured by the test labels.
trace1 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",
    marker = dict(color = y_test.to_list()),
)
# trace2: coloured by the teacher's predictions.
trace2 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",    
    marker = dict(color = y_pred_teacher.to_list()), 
)
# trace3: coloured by the student's predictions.
trace3 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",    
    marker = dict(color = y_pred_student),    
)

# Box bounds for the overlay below. These reassign the notebook-level sepal_*
# plotting variables to the same values as the RuleModel defaults.
sepal_length_max = 6.0
sepal_length_min = 4.0
sepal_width_min = 3.0
sepal_width_max = 4.6



# One row, three panels sharing the y-axis: labels, teacher, student.
fig = make_subplots(rows=1, cols=3, shared_yaxes=True, 
                    subplot_titles=(
                        "Original Test Data", 
                        "Prediction from Teacher", 
                        "Prediction from Student"))

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)
fig.add_trace(trace3, row=1, col=3)

# Overlay the box on the teacher panel only (row 1, col 2).
fig.add_shape(
    type="rect",
    x0=sepal_width_min, y0=sepal_length_min, x1=sepal_width_max, y1=sepal_length_max,
    line=dict(color="LightSeaGreen"),
    row=1,
    col=2
)

# NOTE(review): `xaxis` presumably addresses only the first panel's x-axis;
# confirm whether the tick rotation was meant for all three panels.
fig.update_layout(xaxis=dict(tickangle=90))
fig.show()

In [86]:
# Side-by-side micro-averaged F1 on the test set: teacher vs. student trained
# on the teacher's labels.
teacher_f1 = metrics.f1_score(y_test, y_pred_teacher, average='micro')
student_f1 = metrics.f1_score(y_test, y_pred_student, average='micro')
print(f'teacher_f1: {teacher_f1:.3f}')
print(f'student_f1: {student_f1:.3f}')

teacher_f1: 0.850
student_f1: 1.000


## Build Student Model with Bad Teacher

In [96]:
import plotly.graph_objects as go


# Bounds of the wider rectangle used in this section; these reassign the
# notebook-level sepal_* plotting variables.
sepal_length_min, sepal_length_max = 4, 6.5
sepal_width_min, sepal_width_max = 2.8, 4

# All samples in sepal space; species drives both marker colour and symbol.
fig = px.scatter(
    df_raw,
    x="sepal_width",
    y="sepal_length",
    color='species',
    symbol="species",
)

# Outline the rule region.
fig.add_shape(
    type="rect",
    line=dict(color="LightSeaGreen"),
    x0=sepal_width_min,
    x1=sepal_width_max,
    y0=sepal_length_min,
    y1=sepal_length_max,
)

# Text-only trace that labels the rectangle.
fig.add_trace(go.Scatter(mode="text", text=["boolean_model"], x=[3.8], y=[6.35]))

fig.show()

In [98]:
# Teacher for this section: a RuleModel with a wider box than the defaults
# (length up to 6.5, width from 2.8 to 4.0).
rule_model = RuleModel(
    sepal_length_max=6.5, sepal_length_min=4, sepal_width_min=2.8, sepal_width_max=4.0)

In [99]:
# Predictions of the wider-box teacher for the test features.
y_pred_teacher = rule_model.predict(X_test)

In [100]:
from sklearn import metrics

# Micro-averaged F1 of the wider-box teacher against the test labels.
{'micro_f1_score': metrics.f1_score(y_test, y_pred_teacher, average='micro')}

{'micro_f1_score': 0.6833333333333333}

In [101]:
# Confusion matrix of the wider-box teacher on the test set (first argument: true labels).
metrics.confusion_matrix(y_test, y_pred_teacher)

array([[17,  2],
       [17, 24]])

In [102]:
# Relabel the training features with the wider-box teacher's predictions.
X_train = training_data[['sepal_length', 'sepal_width']]
y_train_teacher = rule_model.predict(X_train)

In [103]:
# Macro-averaged F1 of the teacher's labels against the original training labels.
metrics.f1_score(y_train, y_train_teacher, average='macro')

0.7205316109800025

In [104]:
# How many training samples the teacher assigned to each class.
y_train_teacher.value_counts()

0    52
1    38
Name: prediction, dtype: int64

In [105]:
# Confusion matrix of the teacher's labels on the training set (first argument: true labels).
metrics.confusion_matrix(y_train, y_train_teacher)

array([[29,  2],
       [23, 36]])

In [106]:
# Training features side by side with the labels the teacher assigned to them.
df_training_data_from_teacher = pd.concat([X_train, y_train_teacher], axis=1)
fig = px.scatter(
    df_training_data_from_teacher,
    x="sepal_width",
    y="sepal_length",
    color='prediction',
    title='training_data_from_teacher')
# Draw the box from the teacher's own thresholds rather than from the
# notebook-level sepal_* plotting variables, so the rectangle matches the rule
# that produced the labels regardless of which plotting cell ran last.
fig.add_shape(type="rect",
    x0=rule_model.sepal_width_min, y0=rule_model.sepal_length_min,
    x1=rule_model.sepal_width_max, y1=rule_model.sepal_length_max,
    line=dict(color="LightSeaGreen"),
)
# Text-only trace that labels the rectangle.
fig.add_trace(go.Scatter(
    x=[3.8],
    y=[6.35],
    text=["boolean_model"],
    mode="text",
))
fig.show()

In [41]:
from sklearn.ensemble import RandomForestClassifier
from autosklearn.classification import AutoSklearnClassifier

In [42]:
# Fit the student on the labels produced by the wider-box teacher.
# The commented lines are alternative student models that are not in use.
student = LogisticRegression(random_state=0)
# student = RandomForestClassifier(max_depth=2, random_state=0)
# student = AutoSklearnClassifier(time_left_for_this_task=5*60, per_run_time_limit=30, n_jobs=4)
student.fit(X_train, y_train_teacher)

LogisticRegression(random_state=0)

In [43]:
# Student predictions for the test features.
y_pred_student = student.predict(X_test)

In [44]:
# Micro-averaged F1 of the student against the test labels.
metrics.f1_score(y_test, y_pred_student, average='micro')

0.8333333333333334

In [45]:
# Confusion matrix of the student on the test set (first argument: true labels).
metrics.confusion_matrix(y_test, y_pred_student)

array([[19,  0],
       [10, 31]])

In [46]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Three scatter traces over the same test points; they differ only in the
# values used for the marker colour.
# trace1: coloured by the test labels.
trace1 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",
    marker = dict(color = y_test.to_list()),
)
# trace2: coloured by the teacher's predictions.
trace2 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",    
    marker = dict(color = y_pred_teacher.to_list()), 
)
# trace3: coloured by the student's predictions.
trace3 = go.Scatter(
    x=X_test['sepal_width'],
    y=X_test['sepal_length'],
    mode="markers",    
    marker = dict(color = y_pred_student),    
)

# One row, three panels sharing the y-axis: labels, teacher, student.
fig = make_subplots(rows=1, cols=3, shared_yaxes=True, 
                    subplot_titles=(
                        "Original Test Data", 
                        "Prediction from Teacher", 
                        "Prediction from Student"))

# NOTE(review): this placeholder title appears to be overwritten by the
# update_xaxes(..., row=1, col=1) call further down; confirm it is still needed.
fig['layout']['xaxis']['title']='Label x-axis 1'


fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=1, col=2)
fig.add_trace(trace3, row=1, col=3)

# The box below uses whatever values the notebook-level sepal_* plotting
# variables currently hold; this cell does not reassign them.
fig.add_shape(
    type="rect",
    x0=sepal_width_min, y0=sepal_length_min, x1=sepal_width_max, y1=sepal_length_max,
    line=dict(color="LightSeaGreen"),
    row=1,
    col=2
)

# Update xaxis properties
fig.update_xaxes(title_text="sepal_width", row=1, col=1)
fig.update_xaxes(title_text="sepal_width", row=1, col=2)
fig.update_xaxes(title_text="sepal_width", row=1, col=3)

# Update yaxis properties
fig.update_yaxes(title_text="sepal_length", row=1, col=1)

# NOTE(review): `xaxis` presumably addresses only the first panel's x-axis;
# confirm whether the tick rotation was meant for all three panels.
fig.update_layout(xaxis=dict(tickangle=90))
fig.show()

In [47]:
# Side-by-side micro-averaged F1 on the test set: wider-box teacher vs. the
# student trained on that teacher's labels.
teacher_f1 = metrics.f1_score(y_test, y_pred_teacher, average='micro')
student_f1 = metrics.f1_score(y_test, y_pred_student, average='micro')
print(f'teacher_f1: {teacher_f1:.3f}')
print(f'student_f1: {student_f1:.3f}')

teacher_f1: 0.683
student_f1: 0.833


In [48]:
# TODO: should this experiment be extended to a multi-class setting?
# TODO: should this experiment be extended to a multi-label setting?