In [1]:
import glob
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Beijing Multi-Site Air-Quality

[data link](https://archive.ics.uci.edu/ml/datasets/Beijing+Multi-Site+Air-Quality+Data)

**Pre-Processing**

In [3]:
# Read raw data: every file in the raw-data folder is loaded into a DataFrame,
# keyed by its path.
# glob returns paths in arbitrary, OS-dependent order; sorting them keeps the
# per-file axis of the tensor built later identical on every run.
files = sorted(glob.glob('Raw Data/Beijing Multi-Site Air-Quality/*'))
assert files, 'No files found under "Raw Data/Beijing Multi-Site Air-Quality/"'
data_dict = {file: pd.read_csv(file) for file in files}

In [4]:
# Drop the columns that are not used downstream ('No' and 'station')
for file in files:
    data_dict[file] = data_dict[file].drop(columns=['No', 'station'])

In [5]:
# Replace the year/month/day/hour columns by a single datetime index
time_cols = ['year', 'month', 'day', 'hour']
for file in files:
    # Build the timestamps from the original frame before its time columns go away
    frame = data_dict[file].drop(columns=time_cols)
    frame.index = pd.to_datetime(data_dict[file][time_cols])
    data_dict[file] = frame

In [6]:
# One-hot encode the categorical 'wd' column; the dummy columns are placed
# in front of the remaining columns
for file in files:
    frame = data_dict[file]
    wd_dummies = pd.get_dummies(frame['wd'], prefix='wd')
    remaining = frame.drop(columns='wd')
    data_dict[file] = pd.concat([wd_dummies, remaining], axis=1)

In [7]:
# Impute missing values with the median of their own column (computed per file)
for file in files:
    frame = data_dict[file]
    column_medians = frame.median(axis=0)
    data_dict[file] = frame.fillna(column_medians)

In [8]:
# Rescale every column to the [0, 1] range; a fresh scaler is fitted per file
for file in files:
    frame = data_dict[file]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(frame)
    # fit_transform returns a bare array, so re-attach the original labels
    data_dict[file] = pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)

In [9]:
# Check that all columns align.
# Index.equals is used instead of an element-wise `==` so that a frame with a
# different number of columns fails the assertion with a readable message
# instead of raising a length-mismatch error from the comparison itself.
for file in files:
    assert data_dict[file].columns.equals(data_dict[files[0]].columns), (
        f'Columns of {file} do not match those of {files[0]}'
    )

In [10]:
# Check that all indices align.
# Index.equals is used instead of an element-wise `==` so that a frame with a
# different number of rows fails the assertion with a readable message
# instead of raising a length-mismatch error from the comparison itself.
for file in files:
    assert data_dict[file].index.equals(data_dict[files[0]].index), (
        f'Index of {file} does not match that of {files[0]}'
    )

**Create X and Y**

In [11]:
# Data params (consumed by the X/Y generation cell below)
lbw = 6  # look-back window: number of consecutive time-steps in each X sample
skip = 2  # stride between the target time-steps of consecutive samples
y_var = 'PM2.5'  # name of the column used as the prediction target

In [12]:
# Create data tensor: stack the per-file value matrices along a new axis 1,
# so the array is indexed as (time-step, file, column)
data = np.stack([data_dict[file].values for file in files], axis=1)
data.shape

(35064, 12, 27)

In [13]:
# Generate X and Y from data tensor.
# The target column is located on the first frame explicitly (all frames were
# asserted to share the same columns) rather than through a loop variable left
# over from an earlier cell. list.index raises if y_var is absent, whereas
# np.argmax over an all-False mask would silently return column 0.
y_var_idx = list(data_dict[files[0]].columns).index(y_var)
iter_range = range(lbw, data.shape[0], skip)
# X[k] holds the lbw time-steps preceding target step i; Y[k] is the target
# column at step i for every file.
X = np.array([data[i-lbw:i] for i in iter_range])
Y = np.array([data[i, :, y_var_idx] for i in iter_range])
print(X.shape, Y.shape)

(17529, 6, 12, 27) (17529, 12)


In [14]:
# Save data as an (X, Y) tuple.
# The output folder is created if missing so a fresh checkout can run end to
# end, and the tuple is dumped directly so the notebook-level names `data` and
# `file` are not rebound to unrelated objects.
os.makedirs('ML DATA', exist_ok=True)
with open('ML DATA/air_quality.pkl', 'wb') as pkl_file:
    pickle.dump((X, Y), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# Climate Change

[data link](https://www.kaggle.com/berkeleyearth/climate-change-earth-surface-temperature-data)

In [15]:
# Read the major-city temperature CSV into a DataFrame
data = pd.read_csv('Raw Data/Climate Change/GlobalLandTemperaturesByMajorCity.csv')

In [16]:
# Parse the 'dt' column into datetimes (year-first); the index itself is left untouched
data['dt'] = data['dt'].pipe(pd.to_datetime, yearfirst=True)

In [17]:
# Keep only the rows whose 'Country' is India
data = data.loc[data['Country'].eq('India')]

In [18]:
# Generate Features.
# `data` is a filtered selection of the raw frame, so adding columns with
# item assignment triggers pandas' SettingWithCopyWarning; `assign` builds a new
# frame instead and appends the columns in the same order.
data = data.assign(
    # Temperature plus / minus its reported uncertainty
    UpperTemp=data['AverageTemperature'] + data['AverageTemperatureUncertainty'],
    LowerTemp=data['AverageTemperature'] - data['AverageTemperatureUncertainty'],
    # Cyclical encoding of the calendar month on a 12-step period
    month=-np.cos(data['dt'].dt.month.values * 2 * np.pi / 12),
)

In [19]:
# Generate tensor slices
def _monthly_city_slice(frame, value_col):
    """Return a date-by-city table of ``value_col`` on a monthly grid.

    The long-format ``frame`` is pivoted so rows are 'dt' and columns are
    'City', sorted by date, resampled to '1M' bins keeping the first value of
    each bin, and remaining gaps are filled forward and then backward.

    ``pivot`` is called with keyword arguments because recent pandas releases
    no longer accept ``index``/``columns``/``values`` positionally.
    """
    # NOTE(review): newer pandas prefers the 'ME' alias over 'M'; '1M' is kept
    # for compatibility with the version this notebook was written against.
    return (
        frame.pivot(index='dt', columns='City', values=value_col)
        .sort_index()
        .resample('1M')
        .first()
        .ffill()
        .bfill()
    )


t_slice = _monthly_city_slice(data, 'AverageTemperature')
ut_slice = _monthly_city_slice(data, 'UpperTemp')
lt_slice = _monthly_city_slice(data, 'LowerTemp')
m_slice = _monthly_city_slice(data, 'month')

In [20]:
# Every feature slice must share the index and columns of the temperature
# slice before they are stacked into one tensor.
# Index.equals (rather than element-wise `==`) makes a size mismatch fail the
# assertion with a readable message instead of raising from the comparison.
for slice_name, other_slice in (('ut_slice', ut_slice), ('lt_slice', lt_slice), ('m_slice', m_slice)):
    assert t_slice.index.equals(other_slice.index), f'{slice_name} index differs from t_slice'
    assert t_slice.columns.equals(other_slice.columns), f'{slice_name} columns differ from t_slice'

In [21]:
# Generate final tensor: standardise each slice (the scaler is re-fitted for
# every slice) and stack the results on a trailing feature axis
scaler = StandardScaler()
feature_slices = (t_slice, ut_slice, lt_slice, m_slice)
data = np.stack(
    [scaler.fit_transform(feature_slice.values) for feature_slice in feature_slices],
    axis=-1,
)
data.shape

(2613, 14, 4)

In [22]:
# Build the supervised samples from the data tensor
lbw = 6  # look-back window
skip = 1  # skip time-steps
y_var_idx = 0  # prediction variable
iter_range = range(lbw, data.shape[0], skip)
# Each X sample is the lbw steps that end just before a target step
X = np.array([data[stop - lbw:stop] for stop in iter_range])
# data[lbw::skip] visits exactly the target steps listed in iter_range
Y = np.array([step[:, y_var_idx] for step in data[lbw::skip]])
print(X.shape, Y.shape)

(2607, 6, 14, 4) (2607, 14)


In [23]:
# Save data as an (X, Y) tuple.
# The output folder is created if missing so a fresh checkout can run end to
# end, and the tuple is dumped directly so the notebook-level names `data` and
# `file` are not rebound to unrelated objects.
os.makedirs('ML DATA', exist_ok=True)
with open('ML DATA/climate_change.pkl', 'wb') as pkl_file:
    pickle.dump((X, Y), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# UK House Price Index

[data link](http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/Average-prices-Property-Type-2020-10.csv?utm_medium=GOV.UK&utm_source=datadownload&utm_campaign=average_price_property_price&utm_term=9.30_16_12_20)

In [24]:
# Download the CSV, index it by the parsed 'Date' column and sort chronologically
data_url = 'http://publicdata.landregistry.gov.uk/market-trend-data/house-price-index-data/Average-prices-Property-Type-2020-10.csv?utm_medium=GOV.UK&utm_source=datadownload&utm_campaign=average_price_property_price&utm_term=9.30_16_12_20'
data = pd.read_csv(data_url).set_index('Date')
data.index = pd.to_datetime(data.index)
data = data.sort_index()

In [25]:
# Filter by region: keep only the rows for Liverpool
data = data.loc[data['Region_Name'].eq('Liverpool')]

In [26]:
# Drop the region identifiers, then every column whose name mentions
# 'Price' or 'Index'
data = data.drop(columns=['Region_Name', 'Area_Code'])
kept_columns = [c for c in data.columns if not ('Price' in c or 'Index' in c)]
data = data[kept_columns]

In [27]:
# Replace missing values with 0
data = data.fillna(0)

In [28]:
# Standardise every column, keeping the original index and column labels
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, index=data.index, columns=data.columns)

In [29]:
# Formulate the tensor: one slice per property type, stacked along axis 1
property_types = ['Detached', 'Semi', 'Terraced', 'Flat']
# A column belongs to a property type when the type appears in the part of
# its name before the first underscore
first_tokens = [c.split('_')[0] for c in data.columns]

data_tensor = []
for pt in property_types:
    pt_columns = [c for c, token in zip(data.columns, first_tokens) if pt in token]
    data_slice = data[pt_columns]
    print(data_slice.shape)
    data_tensor.append(data_slice.values)

data = np.stack(data_tensor, axis=1)
print(data.shape)

(310, 2)
(310, 2)
(310, 2)
(310, 2)
(310, 4, 2)


In [30]:
# Build the supervised samples from the data tensor
lbw = 6  # look-back window
skip = 1  # skip time-steps
y_var_idx = 0  # prediction variable
iter_range = range(lbw, data.shape[0], skip)
# Each X sample is the lbw steps that end just before a target step
X = np.array([data[stop - lbw:stop] for stop in iter_range])
# data[lbw::skip] visits exactly the target steps listed in iter_range
Y = np.array([step[:, y_var_idx] for step in data[lbw::skip]])
print(X.shape, Y.shape)

(304, 6, 4, 2) (304, 4)


In [31]:
# Save data as an (X, Y) tuple.
# The output folder is created if missing so a fresh checkout can run end to
# end, and the tuple is dumped directly so the notebook-level names `data` and
# `file` are not rebound to unrelated objects.
os.makedirs('ML DATA', exist_ok=True)
with open('ML DATA/house_price.pkl', 'wb') as pkl_file:
    pickle.dump((X, Y), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

# Activity Recognition

[data link](http://archive.ics.uci.edu/ml/datasets/Activity+Recognition+system+based+on+Multisensor+data+fusion+%28AReM%29)

**Pre-Processing**

In [32]:
# Read raw data into data_dict[category][recording number] -> DataFrame.
# Each entry of the raw-data folder is treated as one activity category and
# every file inside it as one recording.
# Names are derived with os.path.basename instead of split('/') so the parsing
# also works where glob returns backslash-separated paths, and both listings
# are sorted so the sample order is reproducible across machines.
data_dict = {}
folders = sorted(glob.glob('Raw Data/Activity Recognition Multisensor/*'))
for folder in folders:

    category = os.path.basename(folder)
    data_dict[category] = {}

    category_files = sorted(glob.glob(f'{folder}/*'))
    for file in category_files:
        # header=4: column names are read from row 4 (0-based) of the file;
        # the first column becomes the index and is renamed to 'time'
        file_df = pd.read_csv(file, header=4, index_col=0)
        file_df.index = file_df.index.rename('time')
        # e.g. 'dataset7.csv' -> '7'
        data_n = os.path.basename(file).replace('.csv', '').replace('dataset', '')
        data_dict[category][data_n] = file_df
        print(category, data_n, file_df.shape)

bending1 7 (480, 6)
bending1 6 (480, 6)
bending1 4 (480, 6)
bending1 5 (480, 6)
bending1 1 (480, 6)
bending1 2 (480, 6)
bending1 3 (480, 6)
walking 7 (480, 6)
walking 6 (480, 6)
walking 4 (480, 6)
walking 5 (480, 6)
walking 1 (480, 6)
walking 2 (480, 6)
walking 3 (480, 6)
walking 10 (480, 6)
walking 11 (480, 6)
walking 13 (480, 6)
walking 12 (480, 6)
walking 15 (480, 6)
walking 14 (480, 6)
walking 8 (480, 6)
walking 9 (480, 6)
bending2 6 (480, 6)
bending2 5 (480, 6)
bending2 1 (480, 6)
bending2 2 (480, 6)
bending2 3 (480, 6)
standing 7 (480, 6)
standing 6 (480, 6)
standing 4 (480, 6)
standing 5 (480, 6)
standing 1 (480, 6)
standing 2 (480, 6)
standing 3 (480, 6)
standing 10 (480, 6)
standing 11 (480, 6)
standing 13 (480, 6)
standing 12 (480, 6)
standing 15 (480, 6)
standing 14 (480, 6)
standing 8 (480, 6)
standing 9 (480, 6)
sitting 7 (480, 6)
sitting 6 (480, 6)
sitting 4 (480, 6)
sitting 5 (480, 6)
sitting 1 (480, 6)
sitting 2 (480, 6)
sitting 3 (480, 6)
sitting 10 (480, 6)
sitting 11

In [33]:
# Format data tensor: sliding windows of lbw steps per recording (X) and the
# one-hot encoded category of the recording each window came from (Y)
lbw = 24
scaler = StandardScaler()
sensor_pairs = ('rss12', 'rss13', 'rss23')
X, Y = [], []
for category, recordings in data_dict.items():

    # Categories whose name contains 'bending' are left out
    if 'bending' in category:
        continue

    for df in recordings.values():
        # Missing values are replaced by the column mean of the same recording
        df = df.fillna(df.mean())

        pair_blocks = []
        for pair in sensor_pairs:
            avg_col, var_col = f'avg_{pair}', f'var_{pair}'
            # Three features per sensor pair: avg, avg + var, avg - var
            block = df[[avg_col]].copy()
            block[f'upper_{avg_col}'] = df[avg_col] + df[var_col]
            block[f'lower_{avg_col}'] = df[avg_col] - df[var_col]
            # The shared scaler is re-fitted on every block
            pair_blocks.append(scaler.fit_transform(block))

        df_tensor = np.stack(pair_blocks, axis=1)  # T x S x F
        # NOTE(review): the range stops at T - 1, so the window ending on the
        # final row of a recording is never emitted; confirm this is intended.
        df_X = np.array([df_tensor[stop - lbw:stop] for stop in range(lbw, df_tensor.shape[0])])
        df_Y = np.array([category] * df_X.shape[0])
        X.append(df_X)
        Y.append(df_Y)

X = np.concatenate(X)
Y = np.concatenate(Y)
Y = pd.get_dummies(Y).values
X.shape, Y.shape

((34199, 24, 3, 3), (34199, 5))

In [34]:
# Save data as an (X, Y) tuple.
# The output folder is created if missing so a fresh checkout can run end to
# end, and the tuple is dumped directly so the notebook-level names `data` and
# `file` are not rebound to unrelated objects.
os.makedirs('ML DATA', exist_ok=True)
with open('ML DATA/activity_recognition.pkl', 'wb') as pkl_file:
    pickle.dump((X, Y), pkl_file, protocol=pickle.HIGHEST_PROTOCOL)