In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
import warnings
# Install a blanket "ignore" filter: no warning of any category is shown
# from here on.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells — consider narrowing the filter.
warnings.simplefilter("ignore")

In [3]:
# Make the Gas_Sensor folder the working directory and list its entries in
# sorted order.
path = os.path.join("/home/dftml/Project/Sensor_Gas", "Gas_Sensor/")
os.chdir(path)
files = os.listdir()
files.sort()

In [93]:
def _channel_mean(frame, columns):
    """Return the row-wise mean of `columns`, adding them left to right.

    Plain addition is used (rather than ``DataFrame.mean``) so that a missing
    reading in any channel makes the whole row's mean missing.
    """
    total = frame[columns[0]]
    for name in columns[1:]:
        total = total + frame[name]
    return total / len(columns)


# Build one cleaned frame per CSV file and publish it, together with its
# feature / target split, as actual_<n>, df_ind<n> and df_dep<n> (n from 1).
for i in range(len(files)):
    df = pd.read_csv(f'/home/dftml/Project/Sensor_Gas/Gas_Sensor/{files[i]}')
    # Keep only the first whitespace-separated token of every column header.
    col = {name: name.split()[0] for name in df.columns}
    df.rename(columns=col, inplace=True)

    # Keep rows with 235 <= Flow <= 246 and Temperature >= 21. The explicit
    # copy makes df_actual independent of df, so the column assignments below
    # are plain writes instead of writes into a slice of df.
    in_range = (df["Flow"] >= 235) & (df["Flow"] <= 246) & (df["Temperature"] >= 21)
    df_actual = df[in_range].copy()

    df_actual["Heater"] = df_actual["Heater"].apply(lambda x: round(x, 1))
    df_actual["Flow"] = df_actual["Flow"].apply(lambda x: int(x))
    df_actual["Humidity"] = df_actual["Humidity"].apply(lambda x: int(x))
    df_actual.drop("Time", axis=1, inplace=True)

    # Turn the numeric CO reading into class labels. The bands are evaluated
    # on the untouched numeric values; readings that fall in no band (negative
    # or missing) are left unchanged. The result is assigned back explicitly
    # because an in-place `where` on `df_actual["CO"]` is a chained assignment
    # that does not reliably reach the frame.
    co_level = df_actual["CO"]
    co_label = co_level.astype(object)
    co_label = co_label.mask((co_level >= 0) & (co_level < 6), "Low")
    co_label = co_label.mask((co_level >= 6) & (co_level < 13), "Moderate")
    co_label = co_label.mask(co_level >= 13, "High")
    df_actual["CO"] = co_label

    # Average channels R1..R7 and R8..R14 into one value per group.
    df_actual["Sensor_1"] = _channel_mean(df_actual, [f"R{n}" for n in range(1, 8)])
    df_actual["Sensor_2"] = _channel_mean(df_actual, [f"R{n}" for n in range(8, 15)])

    df_ind = df_actual[["Humidity", "Temperature", "Flow", "Heater", "Sensor_1", "Sensor_2"]]
    df_dep = df_actual["CO"]

    # Other cells look these frames up by their numbered names, so they are
    # still published as notebook-level variables — via the namespace dict
    # rather than by executing generated code.
    globals()[f"actual_{i+1}"] = df_actual
    globals()[f"df_ind{i+1}"] = df_ind
    globals()[f"df_dep{i+1}"] = df_dep

In [95]:
from sklearn.tree import DecisionTreeClassifier

In [101]:
def optimization(X_train, y_train, n_days=13, random_state=None):
    """Fit one decision tree per (criterion, splitter) pair and score each.

    Every combination of criterion ("gini", "entropy", "log_loss") and
    splitter ("best", "random") is fitted on the training data and then
    scored on the notebook-level frames ``df_ind<k>`` / ``df_dep<k>`` for
    k = 1 .. n_days.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``DecisionTreeClassifier.fit``.
    n_days : int, default 13
        Number of per-day frames to score against.
    random_state : optional
        Forwarded to ``DecisionTreeClassifier``; the default ``None`` keeps
        the unseeded behaviour.

    Returns
    -------
    list of dict
        One dict per fitted tree with the keys ``criterion``, ``splitter``,
        ``depth``, ``leaves``, ``train_score`` and ``testscore_day<k>``.

    Raises
    ------
    NameError
        If ``df_ind<k>`` or ``df_dep<k>`` is not defined for some k.
    """
    criteria = ["gini", "entropy", "log_loss"]
    splitters = ["best", "random"]
    # The per-day frames live in the notebook namespace under numbered names;
    # read them from the namespace dict instead of exec()-ing generated code.
    namespace = globals()

    results = []
    for criterion in criteria:
        for splitter in splitters:
            tree = DecisionTreeClassifier(
                criterion=criterion, splitter=splitter, random_state=random_state
            )
            tree.fit(X_train, y_train)
            record = {
                "criterion": criterion,
                "splitter": splitter,
                "depth": tree.get_depth(),
                "leaves": tree.get_n_leaves(),
                "train_score": tree.score(X_train, y_train),
            }

            for day in range(1, n_days + 1):
                try:
                    X_day = namespace[f"df_ind{day}"]
                    y_day = namespace[f"df_dep{day}"]
                except KeyError as exc:
                    # Same exception type an undefined variable raised before.
                    raise NameError(f"name {exc.args[0]!r} is not defined") from exc
                record[f"testscore_day{day}"] = tree.score(X_day, y_day)

            results.append(record)

    return results

In [3]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Connection settings are read from the environment so that no secret is
# stored in the notebook. The client id / secret that used to be written
# here in clear text must be treated as leaked and rotated.
#   CASSANDRA_SECURE_BUNDLE  path to the secure-connect bundle (optional,
#                            defaults to the path used so far)
#   CASSANDRA_CLIENT_ID      first argument of PlainTextAuthProvider
#   CASSANDRA_CLIENT_SECRET  second argument of PlainTextAuthProvider
cloud_config = {
    'secure_connect_bundle': os.environ.get(
        'CASSANDRA_SECURE_BUNDLE',
        '/home/dftml/Project/Sensor_Gas/secure-connect-dftml.zip',
    )
}

_missing_env = [
    name
    for name in ('CASSANDRA_CLIENT_ID', 'CASSANDRA_CLIENT_SECRET')
    if not os.environ.get(name)
]
if _missing_env:
    raise RuntimeError(
        "set these environment variables before connecting: " + ", ".join(_missing_env)
    )

auth_provider = PlainTextAuthProvider(
    os.environ['CASSANDRA_CLIENT_ID'],
    os.environ['CASSANDRA_CLIENT_SECRET'],
)

cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session = cluster.connect()

In [233]:
def _to_builtin(value):
    """Return numpy scalars as plain Python objects; pass other values through."""
    return value.item() if isinstance(value, np.generic) else value


def load_optimization(table : str, X_train, y_train, keyspace : str = "tree", prefix : str = "optim_"):
    """Run `optimization` on the training data and store the result rows.

    Rows are written to ``<keyspace>.<prefix><table>`` (``tree.optim_<table>``
    by default); the table is created first if it does not exist. Each row
    uses its position in the result list as ``id`` and gets a ``Deviation``
    value: the population standard deviation of its test scores.

    Parameters
    ----------
    table : str
        Suffix of the target table name, e.g. ``"Day1"``.
    X_train, y_train
        Training features and labels, passed unchanged to `optimization`.
    keyspace, prefix : str
        Keyspace and table-name prefix of the target table.

    Raises
    ------
    ValueError
        If the keyspace or the table name is not a plain identifier.

    NOTE(review): the cells that read results back query
    ``sensor.tree_optim_<day>``, not ``tree.optim_<day>`` — confirm which
    location is intended and pass ``keyspace`` / ``prefix`` accordingly.
    """
    # Identifiers cannot be bound as query parameters, so only plain
    # letters / digits / underscores are allowed into the statement text.
    table_name = f"{prefix}{table}"
    for identifier in (keyspace, table_name):
        if not (identifier.isascii() and identifier.replace("_", "").isalnum()):
            raise ValueError(f"not a plain CQL identifier: {identifier!r}")
    target = f"{keyspace}.{table_name}"

    columns = """id, criterion, splitter, depth,leaves, train_score,
       testscore_day1, testscore_day2, testscore_day3, testscore_day4,
       testscore_day5, testscore_day6, testscore_day7, testscore_day8,
       testscore_day9, testscore_day10, testscore_day11,
       testscore_day12, testscore_day13, Deviation"""
    column_names = [name.strip() for name in columns.split(",")]
    score_columns = [name for name in column_names if name.startswith("testscore_day")]

    optim = pd.DataFrame(optimization(X_train,y_train))
    # Spread of the test scores of each configuration, selected by name.
    optim["Deviation"] = np.std(optim[score_columns], axis=1)

    columns_schema = """id int, criterion text,splitter text,depth int,leaves int, train_score float,
       testscore_day1 float, testscore_day2 float,testscore_day3 float,testscore_day4 float,
       testscore_day5 float, testscore_day6 float, testscore_day7 float, testscore_day8 float,
       testscore_day9 float, testscore_day10 float, testscore_day11 float,
       testscore_day12 float,testscore_day13 float, Deviation float, primary key (id)"""

    session.execute(f"create table if not exists {target} ({columns_schema})")

    # Values are bound through the driver's %s placeholders instead of being
    # formatted into the statement, and are picked by column name so that they
    # cannot drift out of step with the column list.
    placeholders = ", ".join(["%s"] * len(column_names))
    insert_cql = f"insert into {target} ({columns}) values ({placeholders})"
    value_columns = column_names[1:]  # everything except the generated id

    for row_id, record in enumerate(optim[value_columns].to_dict("records")):
        values = (row_id, *(_to_builtin(record[name]) for name in value_columns))
        session.execute(insert_cql, values)
        

In [234]:
# Call load_optimization once per day (1..13), training on that day's frames
# and writing to the table suffixed "Day<i>".
for i in range(1,14):
    table = "Day" + str(i)
    # Fetch the numbered per-day frames from the notebook namespace directly
    # instead of exec()-ing a generated assignment.
    X_train = globals()[f"df_ind{i}"]
    y_train = globals()[f"df_dep{i}"]

    load_optimization(table, X_train,y_train)

In [3]:
# Load every row of sensor.tree_optim_day3 into a DataFrame.
df = pd.DataFrame(
    data=[*session.execute("select * from sensor.tree_optim_day3")]
)

In [6]:
def best_param(dayno:str):
    """Store the best-scoring configuration of one day in sensor.tree_best_param.

    Reads ``sensor.tree_optim_<dayno>``, computes the mean and the population
    standard deviation of every ``testscore_day*`` column per row, takes the
    first row with the highest mean and inserts it into
    ``sensor.tree_best_param`` (created if it does not exist).

    Parameters
    ----------
    dayno : str
        Table suffix such as ``"day3"``; the characters after the first three
        are parsed as the integer row id.

    Raises
    ------
    ValueError
        If `dayno` is not a plain identifier, its numeric part cannot be
        parsed, or the source table yields no test-score columns (for example
        because it is empty).
    """
    # The suffix becomes part of a table name, which cannot be a bound
    # parameter, so only letters / digits / underscores are accepted.
    if not (dayno.isascii() and dayno.replace("_", "").isalnum()):
        raise ValueError(f"not a plain CQL identifier: {dayno!r}")
    day_id = int(dayno[3:])

    df_optim = pd.DataFrame(list(session.execute(f"select * from sensor.tree_optim_{dayno}")))

    # Pick the score columns by name. A positional slice depends on the column
    # order of `select *`, which need not match the order in which the table
    # was declared (NOTE(review): Cassandra is expected to return non-key
    # columns in name order, which would put a text column inside the old
    # 5:18 slice and leave one test score out — confirm against the table).
    score_columns = [name for name in df_optim.columns if str(name).startswith("testscore_day")]
    if not score_columns:
        raise ValueError(f"no testscore_day* columns found in sensor.tree_optim_{dayno}")

    df_optim["test_mean"] = np.mean(df_optim[score_columns], axis = 1)
    df_optim["deviation"] = np.std(df_optim[score_columns], axis=1)

    # Configuration with the highest mean test score; the first one wins a tie.
    top_rows = df_optim[df_optim["test_mean"] == df_optim["test_mean"].max()]
    best = top_rows.iloc[0]

    columns_schema = """id int, day text, criterion text, depth int, deviation float, leaves int, 
                    splitter text, test_mean float, train_score float, primary key (id) """

    session.execute(f"create table if not exists sensor.tree_best_param ({columns_schema})")

    columns = """id, day, criterion, depth, deviation, leaves, splitter,
                test_mean, train_score"""

    # Same order as `columns`; casts give the driver plain Python values that
    # match the declared column types.
    values = (
        day_id,
        dayno,
        str(best["criterion"]),
        int(best["depth"]),
        float(best["deviation"]),
        int(best["leaves"]),
        str(best["splitter"]),
        float(best["test_mean"]),
        float(best["train_score"]),
    )
    placeholders = ", ".join(["%s"] * len(values))
    session.execute(
        f"insert into sensor.tree_best_param ({columns}) values ({placeholders})",
        values,
    )

In [7]:
# Call best_param for "day1" through "day13".
for i in range(1, 14):
    dayno = f"day{i}"
    best_param(dayno)

In [4]:
# Make `tree` the session's current keyspace, i.e. the one used for table
# names that are not qualified with a keyspace.
session.execute("use tree")

<cassandra.cluster.ResultSet at 0x7f55e2b93640>

In [4]:
# Write each sensor.tree_optim_day<i> table (i = 1..13) to its own CSV file,
# with rows ordered by id and id used as the index.
for i in range(1, 14):
    table = f"day{i}"
    df = (
        pd.DataFrame(list(session.execute(f"select * from sensor.tree_optim_{table}")))
        .sort_values(by="id")
        .set_index("id")
    )
    df.to_csv(f"/home/dftml/Project/Sensor_Gas/Tree/optim_{table}.csv")