 In particular, the ethylene concentration ranges from 0-20 ppm, the CO concentration from 0-600 ppm, and the methane concentration from 0-300 ppm.
 
 https://archive.ics.uci.edu/ml/datasets/Gas+sensor+array+under+dynamic+gas+mixtures#
 
 Note: to use this script, download data.zip from the page above and extract it so that ethylene_CO.txt and ethylene_methane.txt are in ./data/. The data files are not included in the GitHub repository.

In [18]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

try:
    # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

In [19]:
# Load the ethylene + CO recording from the txt file.
# Note: the data should be unzipped into ./data/ first.
#
# The first line of the file is skipped and explicit column names are
# supplied instead. The sensor columns get a running suffix per sensor model
# so that every name is unique: recent pandas releases reject duplicate
# entries in `names`.
with open('./data/ethylene_CO.txt', 'rb') as f:
    co = pd.read_table(f, sep=r'\s+', header=None, skiprows=1,
                       names=['time', 'co', 'ethylene',
                              'TGS2602_1', 'TGS2602_2',
                              'TGS2600_1', 'TGS2600_2',
                              'TGS2610_1', 'TGS2610_2',
                              'TGS2620_1', 'TGS2620_2',
                              'TGS2602_3', 'TGS2602_4',
                              'TGS2600_3', 'TGS2600_4',
                              'TGS2610_3', 'TGS2610_4',
                              'TGS2620_3', 'TGS2620_4'])

# Drop the time column; only the concentrations and sensor readings are kept.
# `axis` is passed by keyword because the positional form was removed in
# pandas 2.0.
co = co.drop('time', axis=1)

In [None]:
# Load the ethylene + methane recording from the txt file.
# Note: the data should be unzipped into ./data/ first.
#
# The first line of the file is skipped and explicit column names are
# supplied instead. The sensor columns get a running suffix per sensor model
# so that every name is unique: recent pandas releases reject duplicate
# entries in `names`.
with open('./data/ethylene_methane.txt', 'rb') as f:
    methane = pd.read_table(f, sep=r'\s+', header=None, skiprows=1,
                            names=['time', 'methane', 'ethylene',
                                   'TGS2602_1', 'TGS2602_2',
                                   'TGS2600_1', 'TGS2600_2',
                                   'TGS2610_1', 'TGS2610_2',
                                   'TGS2620_1', 'TGS2620_2',
                                   'TGS2602_3', 'TGS2602_4',
                                   'TGS2600_3', 'TGS2600_4',
                                   'TGS2610_3', 'TGS2610_4',
                                   'TGS2620_3', 'TGS2620_4'])

# Drop the time column; only the concentrations and sensor readings are kept.
# `axis` is passed by keyword because the positional form was removed in
# pandas 2.0.
methane = methane.drop('time', axis=1)

For now, we'll use only the ethylene/CO dataset.

In [20]:
# Separate the targets (y) from the sensor readings (x).
# `data` is just another name for `co`, and pop() removes each target column
# in place, so `co` loses the 'co' and 'ethylene' columns as well.
data = co
y = pd.concat([data.pop(column) for column in ('co', 'ethylene')], axis=1)
x = data

In [21]:
# Standard-scale the data
# Disabled: kept for reference only; x is min-max scaled instead
#y = StandardScaler().fit_transform(y)
#x = StandardScaler().fit_transform(x)

In [22]:
# Min-max scale every feature column of x into [0, 1].
# A constant column has max == min; its range is replaced by 1 so the column
# maps to 0 instead of becoming NaN through 0/0.
# y is deliberately left unscaled.
col_min = x.min()
col_range = (x.max() - col_min).replace(0, 1)
x = (x - col_min) / col_range

In [23]:
# Convert pandas dataframes to numpy arrays.
# DataFrame.as_matrix() was removed in pandas 1.0; the .values attribute
# yields the same ndarray and also exists in the older releases.
x = x.values
y = y.values

In [24]:
# Collect the distinct gas-concentration pairs (the categories), keeping the
# order in which each pair first appears in y.
tuples = [tuple(row) for row in y]
seen = set()
cats = []
for pair in tuples:
    if pair in seen:
        continue
    seen.add(pair)
    cats.append(pair)

In [25]:
# Encode the data with numerical category values in place of gas
# concentration tuples.
# A dict built once from `cats` replaces cats.index(item), which rescans the
# list for every row; the entries of `cats` are unique, so the resulting
# indices are identical.
cat_to_index = {cat: position for position, cat in enumerate(cats)}
y_enc = np.array([cat_to_index[item] for item in tuples]).reshape(-1, 1)

In [30]:
# One hot encode the data.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so no keyword is passed: the encoder's default sparse result
# is cast to int8 first and only then expanded to a dense ndarray. This gives
# the same int8 array without building a dense float matrix in between.
y_one_hot = OneHotEncoder().fit_transform(y_enc).astype('int8').toarray()

In [32]:
# Hold out 1000 rows as the test set, then take another 1000 rows from the
# remainder as the validation set; everything left over is the training set.
# Both splits use the same fixed seed so they are reproducible.
x_all, xtest, y_all, ytest = train_test_split(
    x, y_one_hot,
    test_size=1000,
    random_state=0,
)
xtrain, xval, ytrain, yval = train_test_split(
    x_all, y_all,
    test_size=1000,
    random_state=0,
)

In [33]:
# Drop the names of the intermediates that are not needed past the split.
del co, data, x, y
del tuples, seen, cats
del y_enc, y_one_hot

In [34]:
# Pickle the data
# Create the output directory first: open() in 'wb' mode does not create
# missing parent directories and would fail if ./data/pickled is absent.
if not os.path.isdir('./data/pickled'):
    os.makedirs('./data/pickled')

# Write xtrain with the highest available pickle protocol, then drop the
# name so the array can be freed.
with open('./data/pickled/xtrain.pickle', 'wb') as f:
    pickle.dump(xtrain, f, pickle.HIGHEST_PROTOCOL)
del xtrain

In [35]:
# Write ytrain to disk with the highest available pickle protocol, then drop
# the name so the array can be freed.
with open('./data/pickled/ytrain.pickle', mode='wb') as out:
    pickle.dump(ytrain, out, protocol=pickle.HIGHEST_PROTOCOL)
del ytrain

In [36]:
# Write xtest to disk with the highest available pickle protocol, then drop
# the name so the array can be freed.
with open('./data/pickled/xtest.pickle', mode='wb') as out:
    pickle.dump(xtest, out, protocol=pickle.HIGHEST_PROTOCOL)
del xtest

In [37]:
# Write ytest to disk with the highest available pickle protocol, then drop
# the name so the array can be freed.
with open('./data/pickled/ytest.pickle', mode='wb') as out:
    pickle.dump(ytest, out, protocol=pickle.HIGHEST_PROTOCOL)
del ytest

In [38]:
# Write xval to disk with the highest available pickle protocol, then drop
# the name so the array can be freed.
with open('./data/pickled/xval.pickle', mode='wb') as out:
    pickle.dump(xval, out, protocol=pickle.HIGHEST_PROTOCOL)
del xval

In [39]:
# Write yval to disk with the highest available pickle protocol, then drop
# the name so the array can be freed.
with open('./data/pickled/yval.pickle', mode='wb') as out:
    pickle.dump(yval, out, protocol=pickle.HIGHEST_PROTOCOL)
del yval