# Dimensionality Reduction

Perform principal component analysis (PCA) on the data to extract important features.

---

### 1. Load packages and open dataset

In [1]:
import pandas as pd
import numpy as np
from pycaret.clustering import *

In [None]:
# Choose which point to analyse: 'upstream', 'middle', or 'terminus'
point_name = 'upstream'

# Build the path to that point's clean velocity series and read it in
csv_path = '../data/clean/' + point_name + '_velocity_series_clean.csv'
df = pd.read_csv(csv_path)

# Preview the first rows
df.head()

### 2. Clustering analysis

Perform a clustering analysis on the dataset using k-means to determine whether there are any trends in the data.

In [7]:
# Tidy the frame in one chain:
#   1. rename 'VelocitySeries' to 'velocity'
#   2. add 'series', a running index over the rows starting at 1
#   3. parse 'time' into datetimes
#   4. keep only the three columns used below, in this order
df = (
    df.rename(columns={'VelocitySeries': 'velocity'})
      .assign(
          series=lambda frame: np.arange(1, len(frame) + 1),
          time=lambda frame: pd.to_datetime(frame['time']),
      )
      [['series', 'time', 'velocity']]
)

# Preview the result
df.head()

Unnamed: 0,series,time,velocity
0,1,2015-01-30 12:00:00,2033.7096
1,2,2015-02-11 12:00:00,2026.7332
2,3,2015-02-23 12:00:00,2071.6174
3,4,2015-06-11 12:00:00,2092.837
4,5,2015-06-23 12:00:00,2095.4255


In [8]:
# Cut-off between the training and test periods. Written in ISO 8601
# (year-month-day) form so the month/day order cannot be misread, and
# defined once so both sides of the split are guaranteed to use the same value.
split_date = pd.to_datetime('2021-06-01')

# split data into train-test set:
# rows strictly before the cut-off go to train, rows on or after it go to test
train = df[df['time'] < split_date]
test = df[df['time'] >= split_date]

# check shape
print("Shape of training data:", train.shape, "Shape of test data:", test.shape)

# calculate each split's share of the full dataset, in percent
train_percentage = (len(train) / len(df)) * 100
test_percentage = (len(test) / len(df)) * 100

print(f"Training data percentage: {train_percentage:.2f}%")
print(f"Testing data percentage: {test_percentage:.2f}%")

Shape of training data: (325, 3) Shape of test data: (80, 3)
Training data percentage: 80.25%
Testing data percentage: 19.75%


In [9]:
# Initialise the PyCaret clustering experiment on the training split.
# A fixed session_id is passed so repeated runs use the same session.
# NOTE(review): 'time' is declared numeric here, yet the setup summary printed
# by this cell also reports one date feature -- confirm this is intended.
s = setup(
    data=train,
    numeric_features=['time', 'series'],
    session_id=123,
)

Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(325, 3)"
2,Transformed data shape,"(325, 5)"
3,Numeric features,2
4,Date features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,CPU Jobs,-1


In [10]:
# Create a k-means model within the experiment initialised by setup()
kmeans = create_model('kmeans')

# Draw the 'cluster' plot for the fitted model
plot_model(kmeans, plot='cluster')

Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.497,534.5362,0.6695,0,0,0


### 3. Repeat for all points

In [23]:

point_names = ['upstream', 'middle', 'terminus']

# Train/test cut-off in ISO 8601 (year-month-day) form so the month/day order
# cannot be misread. Parsed once, since it is identical for every point.
split_date = pd.to_datetime('2021-06-01')

for point in point_names:
    # Label the output so each block of results can be attributed to its point
    print(f"\n=== Point: {point} ===")

    # Load this point's clean velocity series
    df = pd.read_csv('../data/clean/' + point + '_velocity_series_clean.csv')

    # Rename the velocity column, add a 1-based running index, parse the
    # timestamps, and keep only the three columns used below
    df = df.rename(columns={'VelocitySeries': 'velocity'})
    df['series'] = np.arange(1, len(df) + 1)
    df['time'] = pd.to_datetime(df['time'])
    df = df[['series', 'time', 'velocity']]

    # Rows strictly before the cut-off go to train, rows on or after it to test
    train = df[df['time'] < split_date]
    test = df[df['time'] >= split_date]
    print("Shape of training data:", train.shape, "Shape of test data:", test.shape)

    # Share of each split relative to the full dataset, in percent
    train_percentage = (len(train) / len(df)) * 100
    test_percentage = (len(test) / len(df)) * 100
    print(f"Training data percentage: {train_percentage:.2f}%")
    print(f"Testing data percentage: {test_percentage:.2f}%")

    # Initialise the PyCaret clustering experiment on the training split,
    # with the same settings for every point
    s = setup(
        data=train,
        numeric_features=['time', 'series'],
        session_id=123,
    )

    # Create the k-means model and draw its 'cluster' plot
    kmeans = create_model('kmeans')
    plot_model(kmeans, plot='cluster')

Shape of training data: (250, 3) Shape of test data: (78, 3)
Training data percentage: 76.22%
Testing data percentage: 23.78%


Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(250, 3)"
2,Transformed data shape,"(250, 5)"
3,Numeric features,2
4,Date features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,CPU Jobs,-1


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4513,715.7403,0.7056,0,0,0


Shape of training data: (272, 3) Shape of test data: (92, 3)
Training data percentage: 74.73%
Testing data percentage: 25.27%


Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(272, 3)"
2,Transformed data shape,"(272, 5)"
3,Numeric features,2
4,Date features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,CPU Jobs,-1


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.4411,285.5963,0.8695,0,0,0


Shape of training data: (325, 3) Shape of test data: (80, 3)
Training data percentage: 80.25%
Testing data percentage: 19.75%


Unnamed: 0,Description,Value
0,Session id,123
1,Original data shape,"(325, 3)"
2,Transformed data shape,"(325, 5)"
3,Numeric features,2
4,Date features,1
5,Preprocess,True
6,Imputation type,simple
7,Numeric imputation,mean
8,Categorical imputation,mode
9,CPU Jobs,-1


Unnamed: 0,Silhouette,Calinski-Harabasz,Davies-Bouldin,Homogeneity,Rand Index,Completeness
0,0.497,534.5362,0.6695,0,0,0
