In [None]:
# Install wandb (and the `tree` utility) when running on Google Colab; uncomment the lines below on a fresh runtime
#!pip install wandb -qqq
#!apt install tree

[K     |████████████████████████████████| 1.9 MB 10.8 MB/s 
[K     |████████████████████████████████| 182 kB 53.1 MB/s 
[K     |████████████████████████████████| 173 kB 64.1 MB/s 
[K     |████████████████████████████████| 62 kB 1.7 MB/s 
[K     |████████████████████████████████| 168 kB 59.5 MB/s 
[K     |████████████████████████████████| 168 kB 65.6 MB/s 
[K     |████████████████████████████████| 166 kB 71.9 MB/s 
[K     |████████████████████████████████| 166 kB 69.5 MB/s 
[K     |████████████████████████████████| 162 kB 71.4 MB/s 
[K     |████████████████████████████████| 162 kB 75.6 MB/s 
[K     |████████████████████████████████| 158 kB 74.7 MB/s 
[K     |████████████████████████████████| 157 kB 76.5 MB/s 
[K     |████████████████████████████████| 157 kB 80.7 MB/s 
[K     |████████████████████████████████| 157 kB 76.5 MB/s 
[K     |████████████████████████████████| 157 kB 51.8 MB/s 
[K     |████████████████████████████████| 157 kB 75.3 MB/s 
[K     |█████████████████

In [None]:
import wandb
# Log in to Weights & Biases so the cells below can create runs and log artifacts.
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
import random
import torch
import torchvision
from torch.utils.data import TensorDataset
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from torch.autograd import Variable
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from math import sqrt
import math, time
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error
from torch.utils.data import DataLoader

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


## RAW DATA VERSIONING

The raw dataset of Xiaomi stock prices is uploaded to the W&B platform for storage and versioning.

In [None]:
# Ensure deterministic behavior: seed every random number generator the
# notebook imports, using one shared seed value.
SEED = 0
torch.backends.cudnn.deterministic = True
random.seed(SEED)
# NumPy keeps its own global RNG, independent of `random` and torch, so it
# has to be seeded separately.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Device configuration: use the first GPU when CUDA is available, else the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def load(path="/content/drive/***", filename="Data_Xiaomi.csv"):
    """
    Load the raw stock-price CSV into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the CSV file. The default is a placeholder:
        the data comes from Google Drive, so enter your own folder here.
        A trailing slash is optional.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pd.read_csv``.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import os

    # os.path.join inserts the separator only when it is missing, so both
    # "/some/dir" and "/some/dir/" resolve to the same file.
    return pd.read_csv(os.path.join(path, filename))
dataset = load()

# Upload the raw dataset as an artifact to the Weights & Biases platform.
with wandb.init(
    project="DSS",
    job_type="load-data",
    name="rawdata-load-XIAOMI",
):
    # Artifact named 'Xiaomi' of type 'Data'; the table inside it is stored
    # under the key "rawdata".
    artifact = wandb.Artifact(name='Xiaomi', type='Data')
    table = wandb.Table(
        data=dataset.to_numpy(),
        columns=list(dataset.columns),
    )
    artifact.add(table, name="rawdata")
    wandb.log_artifact(artifact)

## MISSING DATA HANDLING

In this section, missing values are filled using a 10-day moving average.

In [None]:
def preprocess(data):
    """
    Convert the text columns of the raw price table to typed columns.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called again on the same raw frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "Change %" (strings with a "%" sign),
        "Date" (anything ``pd.to_datetime`` accepts) and "Vol." (strings with
        a one-character suffix).

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` in which "Change %" is numeric, "Date" is datetime
        and a new float column "Volume" holds "Vol." without its last
        character. Missing (NaN) "Vol." entries stay NaN.
    """
    data = data.copy()

    # "%" is a literal character here, not a pattern.
    data["Change %"] = pd.to_numeric(data["Change %"].str.replace("%", "", regex=False))
    data["Date"] = pd.to_datetime(data["Date"])

    # NOTE(review): the last character is dropped without any scaling. If the
    # column mixes unit suffixes (e.g. K / M / B) the resulting numbers are
    # not on a common scale -- confirm against the raw data.
    data["Volume"] = data["Vol."].str[:-1].astype(float)

    return data

In [None]:
def create_date_table(start='2018-07-10', end='2022-10-31'):
    """
    Build a one-column calendar frame covering the whole time period.

    Parameters
    ----------
    start, end : str
        First and last day of the period, passed to ``pd.date_range``.

    Returns
    -------
    pandas.DataFrame
        A frame with a single "Date" column holding one row per day and no
        other data.
    """
    every_day = pd.date_range(start=start, end=end)
    return pd.DataFrame(data={"Date": every_day})

In [None]:
def fill_missing(data, window=10):
    """
    Put the data on a complete daily calendar and fill the gaps with a moving average.

    The data is left-joined onto the calendar returned by
    ``create_date_table()``, so every day of that period gets a row. Missing
    values in each value column are then replaced by the rolling mean of that
    column over the last ``window`` rows (the current row included, computed
    on the values as they were before filling). Rows whose whole window is
    missing stay NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Date", "Price", "Open", "High", "Low", "Change %" and
        "Volume". It is not modified.
    window : int
        Number of rows in the moving-average window (default 10).

    Returns
    -------
    pandas.DataFrame
        The seven columns above, one row per calendar day, with "Date"
        converted to strings.
    """
    columns = ["Date", "Price", "Open", "High", "Low", "Change %", "Volume"]
    value_columns = columns[1:]

    # Explicit copy: assigning into a column-sliced view would raise
    # SettingWithCopyWarning and could leave the assignment without effect.
    data = data[columns].copy()
    data["Date"] = pd.to_datetime(data["Date"])

    df = create_date_table().merge(data, on="Date", how="left")[columns]

    for column in value_columns:
        moving_average = df[column].rolling(window, min_periods=1).mean()
        # Assign the result back: `df[column].fillna(..., inplace=True)` works
        # on a temporary Series and does not update `df` under Copy-on-Write.
        df[column] = df[column].fillna(moving_average)

    df["Date"] = df["Date"].astype(str)
    return df

In [None]:
def preprocess_and_log():
    """
    Log a new version of the dataset, with the missing days filled in, to W&B.

    Downloads the "rawdata" table of artifact 'metu_datacraft/DSS/Xiaomi:v0',
    runs ``preprocess`` and ``fill_missing`` on it, and logs the result as the
    "filled_data" table of a new 'Xiaomi' artifact of type 'Data'.

    A single run both declares the input artifact and logs the output one.
    Opening a second ``wandb.init`` while the first run is still active either
    ends the first run early or hands back the already-active run, depending
    on the W&B settings in effect.
    """
    raw_columns = ["Date", "Price", "Open", "High", "Low", "Vol.", "Change %"]

    with wandb.init(project="DSS", job_type="preprocess-data",
                    name="filleddata-load-XIAOMI") as r:

        # ✔️ declare which artifact we'll be using
        raw_artifact = r.use_artifact('metu_datacraft/DSS/Xiaomi:v0', type='Data')
        raw_table = raw_artifact.get('rawdata')

        # Rebuild a DataFrame from the table, one column at a time.
        data = pd.DataFrame(
            {column: raw_table.get_column(column) for column in raw_columns}
        )
        filled_data = fill_missing(preprocess(data))

        artifact = wandb.Artifact('Xiaomi', 'Data')
        table = wandb.Table(columns=list(filled_data.columns), data=filled_data.to_numpy())
        artifact.add(table, "filled_data")
        r.log_artifact(artifact)
# Run the preprocessing once; this logs the filled dataset to W&B as a side effect.
preprocess_and_log()

## DATA VERSION 3 (Removing Weekends)

In this section, the filled weekend days are removed from the dataset, and the result is versioned on the W&B platform as the 3rd version of the dataset.

In [None]:
# A single run both declares the input artifact and logs the output one.
# Opening a second wandb.init while the first run is still active either ends
# the first run early or hands back the already-active run, depending on the
# W&B settings in effect.
with wandb.init(project="DSS", job_type="preprocess-data",
                name="weekdata-load-XIAOMI") as r:

    # ✔️ declare which artifact we'll be using
    artifact = r.use_artifact('metu_datacraft/DSS/Xiaomi:v1', type='Data')
    table = artifact.get('filled_data')

    # Output column name -> column name in the stored table.
    # "Change %" is renamed to "Change" from this version on.
    column_sources = {
        "Date": "Date", "Price": "Price", "Open": "Open", "High": "High",
        "Low": "Low", "Volume": "Volume", "Change": "Change %",
    }
    dataset = {name: table.get_column(source) for name, source in column_sources.items()}

    data = pd.DataFrame(dataset)

    #remove weekends from dataset
    data['Date'] = pd.to_datetime(data['Date'])
    # dayofweek: Monday=0 ... Sunday=6, so values below 5 are weekdays.
    # .copy() makes the filtered frame independent, so the assignment below
    # does not write into a slice of the unfiltered frame.
    data = data[data['Date'].dt.dayofweek < 5].copy()
    data['Date'] = data['Date'].astype(str)

    #add to WB
    artifact = wandb.Artifact('Xiaomi', 'Data')
    table = wandb.Table(columns=list(data.columns), data=data.to_numpy())
    artifact.add(table, "weekday_data")
    r.log_artifact(artifact)