**Author:** [Riccardo Guidotti](http://kdd.isti.cnr.it/people/riccardo-guidotti)  
**Python version:**  3.x

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

# Data Preparation

In [2]:
class_name = 'Occupancy'
df = pd.read_csv('occupancy_data/datatraining.txt', skipinitialspace=True, na_values='?', keep_default_na=True)
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [3]:
def daytime(h):
    if 6 <= h < 13:
        return 'morning'
    elif 13 <= h < 18:
        return 'afternoon'
    elif 18 <= h < 22:
        return 'evening'
    return 'night'

In [4]:
df['daytime'] = [daytime(d.hour) for d in pd.to_datetime(df['date'])]
df['weekend'] = [1 if d.weekday() >= 5 else 0 for d in pd.to_datetime(df['date'])]
columns2remove = ['date']
df.drop(columns2remove, inplace=True, axis=1)
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,daytime,weekend
1,23.18,27.272,426.0,721.25,0.004793,1,afternoon,0
2,23.15,27.2675,429.5,714.0,0.004783,1,afternoon,0
3,23.15,27.245,426.0,713.5,0.004779,1,afternoon,0
4,23.15,27.2,426.0,708.25,0.004772,1,afternoon,0
5,23.1,27.2,426.0,704.5,0.004757,1,afternoon,0


In [5]:
df0 = df.copy()

In [6]:
dfX = pd.get_dummies(df[[c for c in df.columns if c != class_name]], prefix_sep='=')
dfY = df[class_name]
df = pd.concat([dfX, dfY], axis=1)
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,weekend,daytime=afternoon,daytime=evening,daytime=morning,daytime=night,Occupancy
1,23.18,27.272,426.0,721.25,0.004793,0,1,0,0,0,1
2,23.15,27.2675,429.5,714.0,0.004783,0,1,0,0,0,1
3,23.15,27.245,426.0,713.5,0.004779,0,1,0,0,0,1
4,23.15,27.2,426.0,708.25,0.004772,0,1,0,0,0,1
5,23.1,27.2,426.0,704.5,0.004757,0,1,0,0,0,1


# Data Partitioning

In [7]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [8]:
attributes = [col for col in df.columns if col != class_name]
X = df[attributes].values
y = df[class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

# RIPPER

https://github.com/imoscovitz/wittgenstein

In [10]:
import wittgenstein as lw

In [11]:
ripper_clf = lw.RIPPER()

In [12]:
ripper_clf.fit(X_train, y_train)

In [13]:
ripper_clf

<RIPPER(verbosity=0, k=2, max_rules=None, dl_allowance=64, random_state=None, prune_size=0.33, max_rule_conds=None, n_discretize_bins=10, max_total_conds=None)>

In [14]:
ripper_clf.ruleset_

<Ruleset [2=433.0-474.0^3=1081.8-2028.5] V [3=811.75-1081.8^7=0.0^2=433.0-474.0^0=21.47-22.02] V [3=811.75-1081.8^7=0.0^8=1.0^2=433.0-474.0^0=21.0-21.47] V [3=811.75-1081.8^7=0.0^8=1.0^0=22.02-22.6^1=24.6-26.29] V [2=474.0-1546.33^0=21.47-22.02^1=33.4-39.12] V [3=811.75-1081.8^7=0.0^2=433.0-474.0^1=29.1-31.6] V [3=811.75-1081.8^7=0.0^2=147.5-433.0] V [2=474.0-1546.33^1=19.5-21.2] V [0=22.02-22.6^3=811.75-1081.8^1=26.29-27.1] V [2=147.5-433.0^5=0.0^3=551.25-811.75] V [2=474.0-1546.33^3=1081.8-2028.5^4=0.0-0.01] V [0=22.02-22.6^3=1081.8-2028.5] V [2=433.0-474.0^6=0.0] V [2=474.0-1546.33^1=21.2-24.6] V [2=474.0-1546.33^0=21.47-22.02^1=24.6-26.29] V [2=474.0-1546.33^1=27.1-29.1] V [2=147.5-433.0^5=0.0^0=20.6-21.0] V [2=433.0-474.0^3=811.75-1081.8^1=27.1-29.1] V [2=474.0-1546.33^1=18.89-19.5^0=21.0-21.47] V [2=474.0-1546.33^0=21.47-22.02] V [2=147.5-433.0^5=0.0^4=0.0-0.01] V [2=433.0-474.0^1=19.5-21.2] V [3=811.75-1081.8^2=433.0-474.0] V [2=147.5-433.0^5=0.0^3=468.0-551.25^1=18.89-19.5] V [

In [15]:
ripper_clf.score(X_test, y_test)

0.8215309046254605

In [16]:
ripper_clf.fit(X_train, y_train, feature_names=attributes)

In [17]:
ripper_clf.ruleset_

<Ruleset [Light=433.0-474.0^CO2=1081.8-2028.5] V [CO2=811.75-1081.8^Light=433.0-474.0^daytime=afternoon=0.0] V [CO2=811.75-1081.8^daytime=evening=0.0^Humidity=21.2-24.6] V [Light=474.0-1546.33^Humidity=19.5-21.2^CO2=551.25-811.75] V [Light=147.5-433.0^weekend=0.0^daytime=morning=0.0] V [Light=474.0-1546.33^HumidityRatio=0.0-0.01] V [CO2=811.75-1081.8^daytime=evening=0.0^Humidity=19.5-21.2] V [Light=433.0-474.0^Humidity=26.29-27.1] V [CO2=551.25-811.75^daytime=morning=1.0^Light=147.5-433.0] V [Light=433.0-474.0^daytime=afternoon=0.0] V [Light=474.0-1546.33^Humidity=27.1-29.1] V [Light=474.0-1546.33^Temperature=21.47-22.02^daytime=afternoon=0.0] V [Light=474.0-1546.33^Humidity=18.89-19.5^Temperature=21.0-21.47] V [Light=474.0-1546.33^Humidity=26.29-27.1^Temperature=22.02-22.6] V [Light=474.0-1546.33^Temperature=21.47-22.02] V [Light=147.5-433.0^weekend=0.0^Temperature=20.29-20.6] V [CO2=811.75-1081.8^Light=433.0-474.0^Humidity=27.1-29.1] V [Light=147.5-433.0^weekend=0.0^Temperature=20.6-

# Rule Lists Classifiers in Skater

https://github.com/oracle/Skater

https://github.com/oracle/Skater/blob/master/examples/rule_list_notebooks/rule_lists_titanic_dataset.ipynb

# CN2

In [19]:
import Orange

In [20]:
titanic = Orange.data.Table("titanic")

In [21]:
titanic

[[first, adult, male | yes],
 [first, adult, male | yes],
 [first, adult, male | yes],
 [first, adult, male | yes],
 [first, adult, male | yes],
 ...
]

In [22]:
cn2_learner = Orange.classification.rules.CN2Learner()
cn2_classifier = cn2_learner(titanic)

In [23]:
for r in cn2_classifier.rule_list:
    print(r)

IF sex==female AND status==first AND age!=adult THEN survived=yes 
IF sex==female AND status!=third AND age!=adult THEN survived=yes 
IF sex!=female AND status==second AND age!=adult THEN survived=yes 
IF sex==female AND status==first THEN survived=yes 
IF status!=third AND age!=adult THEN survived=yes 
IF sex!=female AND status==second THEN survived=no 
IF status==crew AND sex!=male THEN survived=yes 
IF status==second THEN survived=yes 
IF sex!=female AND status==third AND age!=child THEN survived=no 
IF status==crew THEN survived=no 
IF sex!=female AND status!=first THEN survived=no 
IF status==first THEN survived=no 
IF age!=adult THEN survived=no 
IF status==third THEN survived=no 
IF TRUE THEN survived=no 


In [26]:
Orange.data.Domain([Orange.data.Variable(c, compute_value=None) for c in df.columns])

TypeError: variables must be primitive

In [None]:
data = Orange.data.Table.from_numpy(domain=np.array(df.columns), X=X_train, Y=y_train)

In [None]:
print(data.columns)

In [None]:
data

In [None]:
data.__dict__

In [None]:
cn2_learner = Orange.classification.rules.CN2Learner()
cn2_classifier = cn2_learner(data)

In [None]:
for r in cn2_classifier.rule_list:
    print(r)