# Test Pipeline

## Import Moduls & Globals

In [1]:
from utils.libs import *

In [2]:
packages_version_dict = \
{
    "keras": str(keras.__version__),
    "sklearn": str(sklearn.__version__),
    "tf": str(tf.__version__),
}

In [3]:
packages_version_df = pd.DataFrame(data = packages_version_dict.values(), index = packages_version_dict.keys(), columns = ['Version'])
print(packages_version_df)

        Version
keras     2.4.3
sklearn  0.23.1
tf        2.2.0


In [4]:
os.getcwd()
set_config(display='diagram')

SEED = 1234
RANDOM_STATE = 42

N_FOLDS = 10

N_COMPONENTS = 2

target_names = ['Occupancy', 'Non-Occupancy']

np.random.seed(seed = SEED)

### Fetch Train Data

In [5]:
dir_path = "C:\\Users\\Francesco\\Documents\\datasets\\occupancy_data"
file_name = "datatraining.txt"

file_path = os.path.join(dir_path, file_name)

print(file_path)

C:\Users\Francesco\Documents\datasets\occupancy_data\datatraining.txt


In [6]:
train_data = pd.read_csv(file_path)

In [7]:
train_data.describe(include="all")

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,8143,8143.0,8143.0,8143.0,8143.0,8143.0,8143.0
unique,8143,,,,,,
top,2015-02-05 18:36:59,,,,,,
freq,1,,,,,,
mean,,20.619084,25.731507,119.519375,606.546243,0.003863,0.21233
std,,1.016916,5.531211,194.755805,314.320877,0.000852,0.408982
min,,19.0,16.745,0.0,412.75,0.002674,0.0
25%,,19.7,20.2,0.0,439.0,0.003078,0.0
50%,,20.39,26.2225,0.0,453.5,0.003801,0.0
75%,,21.39,30.533333,256.375,638.833333,0.004352,0.0


In [8]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8143 entries, 1 to 8143
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           8143 non-null   object 
 1   Temperature    8143 non-null   float64
 2   Humidity       8143 non-null   float64
 3   Light          8143 non-null   float64
 4   CO2            8143 non-null   float64
 5   HumidityRatio  8143 non-null   float64
 6   Occupancy      8143 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 508.9+ KB


### Fetch Test Data

In [9]:
dir_path = "C:\\Users\\Francesco\\Documents\\datasets\\occupancy_data"
file_name = "datatest.txt"

file_path_test_set = os.path.join(dir_path, file_name)

print(file_path)

C:\Users\Francesco\Documents\datasets\occupancy_data\datatraining.txt


In [10]:
test_data = pd.read_csv(file_path_test_set)

In [11]:
test_data.describe(include="all")

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
count,2665,2665.0,2665.0,2665.0,2665.0,2665.0,2665.0
unique,2665,,,,,,
top,2015-02-02 17:22:00,,,,,,
freq,1,,,,,,
mean,,21.433876,25.353937,193.227556,717.90647,0.004027,0.364728
std,,1.028024,2.436842,250.210906,292.681718,0.000611,0.481444
min,,20.2,22.1,0.0,427.5,0.003303,0.0
25%,,20.65,23.26,0.0,466.0,0.003529,0.0
50%,,20.89,25.0,0.0,580.5,0.003815,0.0
75%,,22.356667,26.856667,442.5,956.333333,0.004532,1.0


In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2665 entries, 140 to 2804
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           2665 non-null   object 
 1   Temperature    2665 non-null   float64
 2   Humidity       2665 non-null   float64
 3   Light          2665 non-null   float64
 4   CO2            2665 non-null   float64
 5   HumidityRatio  2665 non-null   float64
 6   Occupancy      2665 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 166.6+ KB


### Define Pipeline

In [13]:
X_train, y_train = train_data.iloc[:, 1:-1], train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, 1:-1], test_data.iloc[:, -1].values

X_train, y_train = shuffle(X_train, y_train, random_state=RANDOM_STATE)

In [32]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    # ('scaler', StandardScaler())
    ('scaler', Normalizer())
    
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, selector(dtype_exclude="category")),
    ('cat', categorical_transformer, selector(dtype_include="category"))
])

transformer = random_projection.SparseRandomProjection(n_components = 5, random_state = 0) # None
# transformer = PCA(n_components = 2)

rt = RandomTreesEmbedding(max_depth=3, n_estimators=10, random_state=0)

# clf = LogisticRegression(max_iter=1000, random_state = 0)

clf = build_model(n_features = 5, n_classes = 2)
# clf = KerasClassifier(build_fn=model, verbose=0)

In [33]:
pipe = Pipeline(
    [
        ('preprocessor', preprocessor),
        # ('transformer', transformer),
    ]
)

In [34]:
pipe

In [35]:
optimizers = ['rmsprop']
init = ['glorot_uniform']
epochs = [50]
batches = [5]
param_grid = dict(clf__optimizer=optimizers, clf__epochs=epochs, clf__batch_size=batches, clf__init=init)
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)

In [36]:
clf_name = str(clf).split('(')[0]
trfm_name = str(rt).split('(')[0]

map_2_acronym = lambda xx: re.sub('[a-z]', '', str(xx[1]).split('(')[0])
pipe_elems = list(map(map_2_acronym, pipe.steps[:]))[1:]
                                  
label = ' + '.join([xx for xx in pipe_elems])
print(label)




### Train & Evaluate Model

In [37]:
pipe.fit(X_train)
X_train_tfrm = pipe.transform(X_train)
clf.fit(
    X_train_tfrm, y_train,
    epochs = 5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x21b76edda88>

In [None]:
X_ test_tfrm = pipe.transform(X_test)
y_pred_rt = pipe.predict(test_tfrm)
fpr_rt_clf, tpr_rt_clf, _ = roc_curve(y_test, y_pred_rt)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rt_clf, tpr_rt_clf, label=label)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title(f'ROC curve - {clf_name}')
plt.legend(loc='best')
plt.show()

In [None]:
results = clf.score(X_test, y_test)
# print(f"Accuracy: {results:.4f}")
print(f"Accuracy(%): {results * 100:.2f}%")

In [None]:
y_pred = pipe.predict(test_tfrm)
# matrix = metrics.confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
show_cm(y_test, y_pred, target_names, n_classes = 2, title = 'Confusion Matrix - Occupancy Dataset')

In [None]:
pprint(classification_report(y_test, y_pred))