**Author:** [Riccardo Guidotti](http://kdd.isti.cnr.it/people/riccardo-guidotti)  
**Python version:**  3.x

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

# Data Preparation

In [2]:
class_name = 'Occupancy'
df = pd.read_csv('occupancy_data/datatraining.txt', skipinitialspace=True, na_values='?', keep_default_na=True)
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8143 entries, 1 to 8143
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           8143 non-null   object 
 1   Temperature    8143 non-null   float64
 2   Humidity       8143 non-null   float64
 3   Light          8143 non-null   float64
 4   CO2            8143 non-null   float64
 5   HumidityRatio  8143 non-null   float64
 6   Occupancy      8143 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 508.9+ KB


In [4]:
pd.to_datetime(df['date']).iloc[0].hour

17

In [5]:
pd.to_datetime(df['date']).iloc[0].weekday()

2

In [6]:
def daytime(h):
    if 6 <= h < 13:
        return 'morning'
    elif 13 <= h < 18:
        return 'afternoon'
    elif 18 <= h < 22:
        return 'evening'
    return 'night'

In [7]:
df['daytime'] = [daytime(d.hour) for d in pd.to_datetime(df['date'])]

In [8]:
df['weekend'] = [1 if d.weekday() >= 5 else 0 for d in pd.to_datetime(df['date'])]

In [9]:
columns2remove = ['date']
df.drop(columns2remove, inplace=True, axis=1)
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,daytime,weekend
1,23.18,27.272,426.0,721.25,0.004793,1,afternoon,0
2,23.15,27.2675,429.5,714.0,0.004783,1,afternoon,0
3,23.15,27.245,426.0,713.5,0.004779,1,afternoon,0
4,23.15,27.2,426.0,708.25,0.004772,1,afternoon,0
5,23.1,27.2,426.0,704.5,0.004757,1,afternoon,0


In [10]:
df0 = df.copy()

In [11]:
dfX = pd.get_dummies(df[[c for c in df.columns if c != class_name]], prefix_sep='=')
dfX.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,weekend,daytime=afternoon,daytime=evening,daytime=morning,daytime=night
1,23.18,27.272,426.0,721.25,0.004793,0,1,0,0,0
2,23.15,27.2675,429.5,714.0,0.004783,0,1,0,0,0
3,23.15,27.245,426.0,713.5,0.004779,0,1,0,0,0
4,23.15,27.2,426.0,708.25,0.004772,0,1,0,0,0
5,23.1,27.2,426.0,704.5,0.004757,0,1,0,0,0


In [12]:
dfY = df[class_name]
dfY.head()

1    1
2    1
3    1
4    1
5    1
Name: Occupancy, dtype: int64

In [13]:
df = pd.concat([dfX, dfY], axis=1)
df.head()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,weekend,daytime=afternoon,daytime=evening,daytime=morning,daytime=night,Occupancy
1,23.18,27.272,426.0,721.25,0.004793,0,1,0,0,0,1
2,23.15,27.2675,429.5,714.0,0.004783,0,1,0,0,0,1
3,23.15,27.245,426.0,713.5,0.004779,0,1,0,0,0,1
4,23.15,27.2,426.0,708.25,0.004772,0,1,0,0,0,1
5,23.1,27.2,426.0,704.5,0.004757,0,1,0,0,0,1


In [14]:
df0.copy()

Unnamed: 0,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,daytime,weekend
1,23.18,27.2720,426.0,721.250000,0.004793,1,afternoon,0
2,23.15,27.2675,429.5,714.000000,0.004783,1,afternoon,0
3,23.15,27.2450,426.0,713.500000,0.004779,1,afternoon,0
4,23.15,27.2000,426.0,708.250000,0.004772,1,afternoon,0
5,23.10,27.2000,426.0,704.500000,0.004757,1,afternoon,0
...,...,...,...,...,...,...,...,...
8139,21.05,36.0975,433.0,787.250000,0.005579,1,morning,0
8140,21.05,35.9950,433.0,789.500000,0.005563,1,morning,0
8141,21.10,36.0950,433.0,798.500000,0.005596,1,morning,0
8142,21.10,36.2600,433.0,820.333333,0.005621,1,morning,0


# Data Partitioning

In [15]:
from sklearn.model_selection import train_test_split, cross_val_score 

from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [16]:
attributes = [col for col in df.columns if col != class_name]
X = df[attributes].values
y = df[class_name]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, stratify=y)

# Naive Bayes

In [17]:
from sklearn.naive_bayes import GaussianNB, CategoricalNB

In [18]:
clf = GaussianNB()
clf.fit(X_train, y_train)

GaussianNB()

In [19]:
y_pred = clf.predict(X_test)

In [20]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9889480147359804
F1-score [0.99295223 0.97440758]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1924
           1       0.96      0.99      0.97       519

    accuracy                           0.99      2443
   macro avg       0.98      0.99      0.98      2443
weighted avg       0.99      0.99      0.99      2443



In [21]:
X_train_num, X_test_num, y_train, y_test = train_test_split(
    df[[c for c in df.columns if c not in ['daytime', 'weekend', 'Occupancy']]], df[class_name], 
       test_size=0.3, random_state=100, stratify=df[class_name])

In [22]:
clf = GaussianNB()
clf.fit(X_train_num, y_train)
y_pred = clf.predict(X_test_num)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9664347114203847
F1-score [0.9782839  0.92612613]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      1924
           1       0.87      0.99      0.93       519

    accuracy                           0.97      2443
   macro avg       0.93      0.98      0.95      2443
weighted avg       0.97      0.97      0.97      2443



In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
df0_cat = df0[['daytime', 'weekend']].apply(LabelEncoder().fit_transform)

In [25]:
X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    df0_cat, df[class_name], test_size=0.3, random_state=100, stratify=df[class_name])

In [26]:
clf = CategoricalNB()
clf.fit(X_train_cat, y_train)

CategoricalNB()

In [27]:
y_pred = clf.predict(X_test_cat)
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.9136307818256242
F1-score [0.94217594 0.82942603]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94      1924
           1       0.71      0.99      0.83       519

    accuracy                           0.91      2443
   macro avg       0.86      0.94      0.89      2443
weighted avg       0.94      0.91      0.92      2443

