In [1]:
# ==============================================================================
# File: IOT_Temp_Classification.ipynb
# Author: Vraj Shah
# Revisions by: Francisco Cornejo-Garcia
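# Description: Classify IoT temperature readings as 'out/in' using the temperature
#              plus character-bigram features of the room id and noted date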
# ==============================================================================

# Load models
from downstream_models import LogRegClassifier, RandForestClassifier, LinearRegression, RandForestRegressor, MLPRegressorr, MLPClassifierr

# Load libraries
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
from matplotlib import pyplot as plt

# Load scikit-learn libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import preprocessing
# Module-level LabelEncoder instance; `le` is not referenced by any cell visible in this notebook
le = preprocessing.LabelEncoder()
from sklearn.svm import LinearSVC,LinearSVR
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it would also hide any convergence or
# deprecation warnings emitted while the models below are fitted.
warnings.filterwarnings('ignore')

In [2]:
# Read the IoT readings CSV into a DataFrame
# (the path is resolved relative to the notebook's working directory)
iot = pd.read_csv(
    filepath_or_buffer='dataset/iot.csv',
)

In [3]:
# Split the raw frame into the numeric feature, the text-like columns that are
# turned into character n-gram counts, and the label column to predict.
numeric_cols = iot[['temp']]
ngram_cols = iot[['room_id/id','noted_date']]
target = iot['out/in']

# Character-level bigram counter. It is re-fit on every column in the loop
# below, so each column gets its own bigram vocabulary.
vectorizer = CountVectorizer(ngram_range = (2,2),analyzer = 'char')

# Collect one feature frame per source column and concatenate once at the end,
# instead of growing `table` with a concat on every loop iteration.
feature_frames = [numeric_cols]

# Iterate through each ngram column
for col in ngram_cols.columns:
    # Cast every value to str so the vectorizer receives string documents
    array = ngram_cols[col].astype(str).values
    X = vectorizer.fit_transform(array)
    dataframe = pd.DataFrame(
        X.toarray(),
        # Reuse the source index so the column-wise concat aligns row-for-row
        # even if `iot` does not carry a default RangeIndex.
        index = ngram_cols.index,
        # Default labels would be the integers 0..k-1 for every column, which
        # collide across the n-gram blocks and mix int labels with the str
        # label 'temp'. Prefix the position with the source column name so
        # every label is a unique string.
        columns = ['{}_{}'.format(col, i) for i in range(X.shape[1])],
    )
    feature_frames.append(dataframe)

# Final model input: 'temp' followed by the bigram counts of each text column
table = pd.concat(feature_frames, axis = 1, sort = False)

In [4]:
# Logistic Regression
# Runs the project helper from downstream_models on the feature table and the 'out/in' labels.
# The helper's four return values are unpacked below; judging by the names and the printed
# output they are presumably per-fold train, validation and hold-out accuracy lists plus a
# confusion matrix -- TODO confirm against downstream_models.
# NOTE(review): the Random Forest cell assigns to these same four names, so the values
# stored here are overwritten once that cell runs.
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = LogRegClassifier(table, target)

(46850, 103)
(15617, 103)
0.8386508076264267
0.8363322020874688
0.8380288904825325
(46850, 103)
(15617, 103)
0.836201514399603
0.8372286610744701
0.8385923573404365
(46850, 103)
(15617, 103)
0.8370499623801367
0.8344752513286803
0.8372605265853909
(46850, 103)
(15617, 103)
0.8376903004786528
0.8371646282896843
0.8386435816002459
(46851, 103)
(15617, 103)
0.8353396939232888
0.8366419057377049
0.8369019567667247
5 fold Train, Validation, and Test Accuracies:
[0.8386508076264267, 0.836201514399603, 0.8370499623801367, 0.8376903004786528, 0.8353396939232888]
[0.8363322020874688, 0.8372286610744701, 0.8344752513286803, 0.8371646282896843, 0.8366419057377049]
[0.8380288904825325, 0.8385923573404365, 0.8372605265853909, 0.8386435816002459, 0.8369019567667247]
Avg Train, Validation, and Test Accuracies:
0.8369864557616216
0.8363685297036018
0.837885462555066


In [5]:
# Random Forest
# Runs the project helper from downstream_models on the same feature table and labels.
# Four return values are unpacked; presumably per-fold train, validation and hold-out
# accuracy lists plus a confusion matrix -- TODO confirm against downstream_models.
# NOTE(review): these are the same variable names the Logistic Regression cell assigns,
# so running this cell replaces the logistic-regression results.
avgsc_train_lst, avgsc_lst, avgsc_hld_lst, cnf_matrix = RandForestClassifier(table, target)

(46850, 103)
(15617, 103)
0.9511742199881538
0.9341102644554011
0.9374039545128573
(46850, 103)
(15617, 103)
0.9505979156994894
0.934814625088045
0.9378137485913328
(46850, 103)
(15617, 103)
0.951766532729281
0.9339821988858296
0.9380698698903801
(46850, 103)
(15617, 103)
0.9501816959354539
0.9364794774924762
0.9375064030324762
(46851, 103)
(15617, 103)
0.9511589934046232
0.9360271516393442
0.9376600758119045
5 fold Train, Validation, and Test Accuracies:
[0.9511742199881538, 0.9505979156994894, 0.951766532729281, 0.9501816959354539, 0.9511589934046232]
[0.9341102644554011, 0.934814625088045, 0.9339821988858296, 0.9364794774924762, 0.9360271516393442]
[0.9374039545128573, 0.9378137485913328, 0.9380698698903801, 0.9375064030324762, 0.9376600758119045]
Avg Train, Validation, and Test Accuracies:
0.9509758715514002
0.9350827435122191
0.9376908103677902


In [None]:
# MLP Classifier (only the classifier helper is invoked in this cell; no regressor is called)
# Three values are unpacked here, whereas the Logistic Regression and Random Forest cells
# unpack four (there is no confusion-matrix name in this assignment).
# NOTE(review): the cell has no execution count and only partial printed output, so it
# presumably never finished running -- confirm before relying on these variables.
avgsc_train_lst, avgsc_lst, avgsc_hld_lst = MLPClassifierr(table, target)

(46850, 103)
(15617, 103)
