In [53]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import scipy.stats as stats
import copy

#for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
# Ask IPython to render inline matplotlib figures in SVG format.
# NOTE(review): the recorded cell output echoes this call, which looks like the
# tail of a deprecation warning — confirm against the installed IPython version.
set_matplotlib_formats('svg')

  set_matplotlib_formats('svg')


In [54]:
# Compute device: use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [55]:
from ucimlrepo import fetch_ucirepo

# Fetch the dataset registered under id 165.
concrete_compressive_strength = fetch_ucirepo(id=165)

# The fetched object exposes its tables under `.data`:
# `features` become the predictors X, `targets` become y (pandas dataframes).
uci_tables = concrete_compressive_strength.data
X, y = uci_tables.features, uci_tables.targets

In [56]:
# Show the column labels of the feature table.
feature_names = X.keys()
print(f'features keys:\n{feature_names}')

features keys:
Index(['Cement', 'Blast Furnace Slag', 'Fly Ash', 'Water', 'Superplasticizer',
       'Coarse Aggregate', 'Fine Aggregate', 'Age'],
      dtype='object')


In [57]:
# Place features and target side by side in a single dataframe
data = pd.concat((X, y), axis='columns')
print(data)

      Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
0      540.0                 0.0      0.0  162.0               2.5   
1      540.0                 0.0      0.0  162.0               2.5   
2      332.5               142.5      0.0  228.0               0.0   
3      332.5               142.5      0.0  228.0               0.0   
4      198.6               132.4      0.0  192.0               0.0   
...      ...                 ...      ...    ...               ...   
1025   276.4               116.0     90.3  179.6               8.9   
1026   322.2                 0.0    115.6  196.0              10.4   
1027   148.5               139.4    108.6  192.7               6.1   
1028   159.1               186.7      0.0  175.6              11.3   
1029   260.9               100.5     78.3  200.6               8.6   

      Coarse Aggregate  Fine Aggregate  Age  Concrete compressive strength  
0               1040.0           676.0   28                          79.99  
1    

In [58]:
# Step 0: z-score every column (subtract the column mean, divide by the
# sample standard deviation, ddof=1)
column_means = data.mean()
column_stds = data.std(ddof=1)
data = data.sub(column_means, axis='columns').div(column_stds, axis='columns')

In [59]:
# Independent snapshot of the normalized frame, taken before any values are
# overwritten with NaN.
dataOrig = data.copy(deep=True)

In [60]:
### Replace some elements with NAN - method # 1
# Blank out `number2NAN` distinct rows of the 'Superplasticizer' column.
# `values2NAN` keeps the positions of the blanked rows and `rowsWithValidData`
# the positions of the rows that still hold a value.
number2NAN = 10
# replace=False: np.random.choice draws WITH replacement by default, so a
# repeated position would leave fewer than number2NAN NaNs and duplicate rows
# in anything indexed by values2NAN.
values2NAN = np.random.choice(len(data), size=number2NAN, replace=False)
# values2NAN holds row positions, so assign positionally (.iloc) rather than by
# index label (.loc); the two only coincide for a default 0..N-1 index.
data.iloc[values2NAN, data.columns.get_loc('Superplasticizer')] = np.nan
# Row positions whose 'Superplasticizer' entry is not NaN.
rowsWithValidData = np.where(~data['Superplasticizer'].isna())[0]

In [61]:
### Replace some elements with NAN - method # 2
# if 'Superplasticizer' in data.columns:
#     # Select 10 random row indices from data
#     random_indices = data.sample(n=10, random_state=42).index
#     print(random_indices)
#     data.loc[random_indices, 'Superplasticizer'] = np.nan
#     print(data.loc[random_indices])
# else:
#     print("The 'Superplasticizer' column does not exist in the DataFrame.")

In [62]:
# Step 1: Convert from pandas data to tensor
# Step 2: Split the data into Train & Test
#   TRAIN = rows whose 'Superplasticizer' value is still present (rowsWithValidData)
#   TEST  = rows whose 'Superplasticizer' value was replaced with NaN (values2NAN)

# Predictors: every column except 'Superplasticizer', which is the label
cols2keep = data.keys().drop('Superplasticizer')
featuresT = torch.tensor( data[cols2keep].values ).float()

# TRAIN -> predictors + labels (labels get a trailing axis: N x 1)
train_dataT = featuresT[ rowsWithValidData ]

train_labelT = torch.tensor( data['Superplasticizer'].values ).float()
train_labelT = train_labelT[ rowsWithValidData, None ]

# TEST -> predictors + labels (labels get a trailing axis: N x 1)
test_dataT = featuresT[ values2NAN ]

# The true test labels must come from dataOrig: in `data` these exact rows were
# overwritten with NaN, so reading them from `data` yields NaN-only labels.
test_labelT = torch.tensor( dataOrig['Superplasticizer'].values ).float()
test_labelT = test_labelT[ values2NAN, None ]

In [63]:
# Report the shape of the full frame and of each train/test tensor
size_report = [
    f'original data: {data.shape}',
    f'Train data   : {train_dataT.shape}',
    f'Train label  : {train_labelT.shape}',
    f'Test  data   : {test_dataT.shape}',
    f'Test label   : {test_labelT.shape}',
]
print('\n'.join(size_report))

original data: (1030, 9)
Train data   : torch.Size([1020, 8])
Train label  : torch.Size([1020, 1])
Test  data   : torch.Size([10, 8])
Test label   : torch.Size([10, 1])
