In [1]:
#This code uses a best feature select of 12 and calculates the results for using
#a moving average plus the original data in the feature set

from sklearn.datasets import fetch_openml
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import sklearn.linear_model as skl_lm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score

from sklearn.model_selection import train_test_split, LeaveOneOut, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

from sklearn import utils
from sklearn import preprocessing
from sklearn import neighbors
from pandas import read_csv

from sklearn.feature_selection import SelectKBest, f_classif

from sklearn import preprocessing

from copy import deepcopy

import csv

#This part of the code defines the fit models  

lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
kfld = KFold(n_splits=3,random_state=42)

In [2]:
#Reads data in from data folder.

data = read_csv('data/ethylene_methane.txt',delim_whitespace=True)
data.head(7)

Unnamed: 0,Time(sec),Methane(ppm),Ethylene(ppm),sensor1,sensor2,sensor3,sensor4,sensor5,sensor6,sensor7,sensor8,sensor9,sensor10,sensor11,sensor12,sensor13,sensor14,sensor15,sensor16
0,0.0,0.0,0.0,-41.98,2067.64,-37.13,2.28,8.63,-26.62,-8.46,-0.33,3437.73,2728.14,4054.03,4007.89,4478.27,5056.98,3639.09,3128.49
1,0.01,0.0,0.0,-46.5,2067.88,-28.56,13.69,-12.35,-25.81,-5.04,-5.04,3432.44,2734.47,4038.62,4019.4,4496.72,5051.81,3636.97,3115.03
2,0.02,0.0,0.0,-36.16,2055.81,-10.89,8.63,-2.93,-30.34,-9.27,-2.12,3438.61,2719.97,4030.92,4025.48,4489.54,5057.35,3641.81,3105.24
3,0.03,0.0,0.0,-50.36,2053.68,-31.96,-0.65,-8.29,-21.6,7.98,2.28,3429.51,2720.5,4040.22,4000.87,4485.44,5049.6,3642.72,3124.84
4,0.04,0.0,0.0,-37.3,2081.17,-36.16,3.26,5.05,-26.14,-7.48,-0.65,3436.85,2719.71,4029.64,4007.25,4499.12,5057.35,3674.3,3147.59
5,0.05,0.0,0.0,-48.43,2058.64,-32.61,18.59,9.61,-28.89,-12.19,0.65,3463.35,2748.74,4037.97,4023.88,4486.47,5058.09,3650.6,3111.96
6,0.06,0.0,0.0,-46.17,2065.99,-33.74,7.82,-2.12,-38.1,10.59,2.93,3426.87,2731.83,4027.72,3996.72,4507.33,5056.61,3644.54,3122.32


In [3]:
#creates a new attribute thats the % Methane, in cases where we divide by zero
#we recieve a NAN which is replaced by a 0
#the 4th line drops that attributes that we defined to be unnecasry from our K
#best select we ran in a previous part of our code
#this code also finds all unique PPM for methane

data['Methane%'] = (data['Methane(ppm)']/(data['Methane(ppm)']+data['Ethylene(ppm)']))
data['Methane%'] = data['Methane%'].fillna(0)

ppm = data['Methane(ppm)'].unique()

data = data.drop(columns = ['Time(sec)', 'sensor1', 'sensor2','sensor9','sensor10'])

sensor_name = ['sensor3', 'sensor4', 'sensor5','sensor6', 'sensor7', 'sensor8', 'sensor11','sensor12', 'sensor13', 'sensor14', 'sensor15', 'sensor16']


In [4]:
#This block searches for Outliers for each PPM of Methane and then drops them
#from the file.


for j in sensor_name:
    print(j)
    for i in np.sort(ppm):    
        data2 = data[data["Methane(ppm)"] == i] #only select a specific PPM to begin trimming
        
        results = data2[j].quantile([0.125,0.875]) #Next lines calculate the range for non-Outliers
        IQR_15 = 1.5*(results.loc[0.875] - results.loc[0.125])
        fq = results.loc[0.125] - IQR_15
        tq = results.loc[0.875] + IQR_15
                
        index = data2[(fq>data2[j]) | (tq<data2[j])] #Finds Outliers
        data = data.drop(index.index) #Uses index of Outliers to drop rows from data 
        

sensor3
sensor4
sensor5
sensor6
sensor7
sensor8
sensor11
sensor12
sensor13
sensor14
sensor15
sensor16


In [5]:
#This takes each remaining sensor and computes a moving average of 6
#and drops any NAN values that get created

df = deepcopy(data)

for k in sensor_name:
    data[k+'_avg6'] = df[k].rolling(window=6).mean()
data = data.dropna()

In [6]:
#Defines our X and y and sets y equal to 1 if the Methane % is greater than 50%


X_best = data.drop(columns = ['Methane(ppm)','Ethylene(ppm)','Methane%'])
y = np.where(data['Methane%'] >= 0.5, 1, 0)

start =time.time()
X_train, X_test, y_train, y_test = train_test_split(X_best, y, test_size=0.50, random_state=42) 
end = time.time()
print('Time (sec) = ',(end-start))

Time (sec) =  1.325202226638794


In [7]:
#these next blocks of code are the model fitting, each block attempts to fit 
#one model and records the start time starting at the fitting and ends after 
#finishing the CV
#The score and time printed once the code finishes


start =time.time()
clf = LogisticRegression(random_state=42, solver='lbfgs',max_iter=10000,n_jobs = -1).fit(X_train, y_train)
score = cross_val_score(clf, X_test, y_test, cv=kfld, scoring='r2').mean()
end = time.time()
print(score)
print('Time (sec) = ',(end-start))

0.9201802111880616
Time (sec) =  825.3219792842865


In [8]:
start =time.time()
lda.fit(X_train, y_train)
score = cross_val_score(lda, X_test, y_test, cv=kfld, scoring='r2').mean()
end = time.time()
print(score)
print('Time (sec) = ',(end-start))

0.9075915697271077
Time (sec) =  20.45657515525818


In [9]:
start =time.time()
qda.fit(X_train, y_train)
score = cross_val_score(qda, X_test, y_test, cv=kfld, scoring='r2').mean()
end = time.time()
print(score)
print('Time (sec) = ',(end-start))

0.8723988987696352
Time (sec) =  8.765960693359375


In [10]:
start =time.time()
knn = neighbors.KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
score = cross_val_score(knn, X_test, y_test, cv=kfld, scoring='r2').mean()
end = time.time()
print(score)
print('Time (sec) = ',(end-start))

0.9844602861559967
Time (sec) =  441.485399723053


In [11]:
#these next lines take the previously fitted models and computes
#recall and precision

clf_y = clf.predict(X_test)
print(recall_score(y_test, clf_y))
print(precision_score(y_test, clf_y))

0.9573546580544733
0.9991935032073128


In [12]:
lda_y = lda.predict(X_test)
print(recall_score(y_test, lda_y))
print(precision_score(y_test, lda_y))

0.9529799739666039
0.996566583832773


In [13]:
qda_y = qda.predict(X_test)
print(recall_score(y_test, qda_y))
print(precision_score(y_test, qda_y))

0.9580580517894605
0.9724639013653973


In [14]:
knn_y = knn.predict(X_test)
print(recall_score(y_test, knn_y))
print(precision_score(y_test, knn_y))

0.995178006700692
0.9968333835526367
