# NNPT-7-Chr1-HL10
## Neural network using PyTorch, 7 environments, chromosome 1 as the cross-validation set, 10 hidden layers (decreasing size)

## Import packages

In [None]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Global plotting / display defaults: seaborn theme with colour codes and an
# A4-landscape figure size, and wide DataFrame display in pandas.
sns.set(color_codes=True, rc={'figure.figsize': (11.7, 8.27)})
pd.options.display.max_columns = 999



## Neural net

In [None]:
# Load the whitespace-delimited predictor and target tables.
# `sep=r'\s+'` replaces the deprecated `delim_whitespace=True` flag and
# parses the files the same way.
predictors = pd.read_csv('Input/Predictors_7locs.csv', sep=r'\s+')
target = pd.read_csv('Input/Target_7locs.csv', sep=r'\s+')

# Place both tables side by side on a fresh 0..n-1 index so that rows are
# aligned by position rather than by whatever index the files produced.
entire = pd.concat([predictors.reset_index(drop=True), target.reset_index(drop=True)], axis=1, sort=False)
entire

In [None]:
# Cast every categorical predictor to pandas' category dtype in one call.
cat_cols = ['ann']
entire = entire.astype(dict.fromkeys(cat_cols, 'category'))

In [None]:
# Substring that identifies the rows of the chromosome held out for cross
# validation; later cells match it against the 'rs' column.
# NOTE(review): `chr` shadows the Python builtin of the same name.
chr = "1_"

# Drop the first column by position (described as a duplicate — TODO confirm
# which column it duplicates).
entire2 = entire.iloc[:,1:]
entire2

In [None]:
# Held-out rows: every entry whose 'rs' value contains the hold-out substring.
# The shared `chr` variable is used instead of a repeated literal so that this
# selection always stays the exact complement of the rows kept for training.
# NOTE(review): this is a substring match, so "1_" would also match an id such
# as "11_..." — confirm the 'rs' id format.
chr1 = entire2[entire2['rs'].str.contains(chr)]
chr1

In [None]:
# Mask of rows whose 'rs' value contains the hold-out substring.
chr1bool = entire2['rs'].str.contains(chr)
# Rows outside the mask.
rest = entire2.loc[~chr1bool]
rest


In [None]:
# Training target: the 'rFitness' column of the non-held-out rows.
y_train = rest['rFitness']
# Training predictors: every column except the last three, chosen by position.
# NOTE(review): the inline note names only rs and fitness, yet three columns
# are dropped — confirm what the third one is.
X_train = rest.iloc[:, :-3].copy()    # without rs and fitness column
X_train;

In [None]:
# Held-out target: the 'rFitness' column of the chromosome-1 selection.
y_test = chr1['rFitness']
# Held-out predictors: same positional slice as the training predictors
# (all columns except the last three).
X_test = chr1.iloc[:, :-3].copy()
y_test

In [None]:
# Make sure the categorical predictors carry the category dtype in both the
# held-out and the training feature frames.
for frame in (X_test, X_train):
    for cat in cat_cols:
        frame[cat] = frame[cat].astype('category')


In [None]:
# prepare dataset for plotting
# Expose the original row index as an 'index' column so every held-out row can
# be mapped to the location block it came from.
y_testlocat = y_test.copy().reset_index()

# Inclusive row-index ranges per location, checked in this order.
# Any index outside these ranges is labelled 'UKI' (noted range: 131896-148382).
_LOCAT_RANGES = (
    (0, 16486, 'MLP'),
    (16487, 32973, 'MLI'),
    (32974, 49460, 'THP'),
    (49461, 65947, 'THI'),
    (65948, 82434, 'AND'),
    (82435, 98921, 'GER'),
    (98922, 115408, 'FIN'),
    (115409, 131895, 'SPA'),
)


def _locat_for_index(idx):
    """Return the label of the first range containing idx, or 'UKI' if none does."""
    for low, high, label in _LOCAT_RANGES:
        if low <= idx <= high:
            return label
    return 'UKI'


y_testlocat['locat'] = [_locat_for_index(idx) for idx in y_testlocat['index']]
y_testlocat

In [None]:
# Column bookkeeping: target name, all non-categorical predictor names, and
# the continuous predictors (the same list, since categoricals and the target
# are already excluded from X_col).
y_col = ['rFitness']
_excluded = set(cat_cols + y_col)
X_col = [name for name in X_train.columns if name not in _excluded]
cont_cols = list(X_col)
cont_cols

In [None]:
# Integer category codes of the 'ann' predictor for the train and held-out sets.
# The codes (0 .. n_categories-1) are used rather than the raw category values
# because the values are later consumed as embedding indices, which must lie
# in that range whatever labels the column holds.
cats_train = X_train['ann'].cat.codes.values
cats_test = X_test['ann'].cat.codes.values

In [None]:
def _as_index_column(values):
    """Convert a 1-D array of category indices into an (n, 1) int64 tensor."""
    return torch.tensor(values, dtype=torch.int64).reshape(-1, 1)


# convert categorical vars to tensor (one column per categorical variable)
cats_train = _as_index_column(cats_train)
cats_test = _as_index_column(cats_test)

In [None]:
# Preview the first three rows of the training categorical tensor.
cats_train[:3]

In [None]:
# Number of training rows in the categorical tensor.
len(cats_train)

In [None]:
def _columns_as_matrix(frame, columns):
    """Stack the named columns of frame side by side into one 2-D array."""
    return np.stack([frame[name].values for name in columns], 1)


# Continuous predictors as (rows, features) arrays for train and held-out sets.
conts_train = _columns_as_matrix(X_train, cont_cols)
conts_test = _columns_as_matrix(X_test, cont_cols)

In [None]:
# Standardise the continuous features. The scaler is fitted on the training
# rows only and then applied unchanged to the held-out rows.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(conts_train)
conts_train, conts_test = [scaler.transform(block) for block in (conts_train, conts_test)]

In [None]:
# Scaled continuous features as float32 tensors.
conts_train, conts_test = [
    torch.tensor(block, dtype=torch.float32) for block in (conts_train, conts_test)
]

In [None]:
# Display the full training tensor of continuous features.
conts_train

In [None]:
# First three rows; the trailing semicolon suppresses the notebook output.
conts_train[:3];

In [None]:
# Number of training rows.
len(conts_train)

In [None]:
# Number of held-out rows.
len(conts_test)

In [None]:
def _as_target_column(series):
    """Convert a pandas Series of targets into an (n, 1) float64 tensor."""
    return torch.tensor(series.to_numpy(dtype=np.float64)).reshape(-1, 1)


# Convert labels to column tensors for the training and held-out sets.
y_train = _as_target_column(y_train)
y_test = _as_target_column(y_test)

In [None]:
# Number of training targets.
len(y_train)

In [None]:
def _embedding_dim(n_levels):
    """Embedding width for a categorical variable: half its level count (rounded up), capped at 50."""
    return min(50, (n_levels + 1) // 2)


# Number of levels per categorical column, and the matching
# (num_levels, embedding_dim) pair for each of them.
cat_szs = [len(entire[name].cat.categories) for name in cat_cols]
emb_szs = [(n_levels, _embedding_dim(n_levels)) for n_levels in cat_szs]
emb_szs

In [None]:
class TabularModel(nn.Module):
    """Feed-forward network over embedded categorical and continuous inputs.

    Each categorical variable goes through its own embedding and the
    continuous variables through batch normalisation. The two are
    concatenated and fed through a stack of Linear -> ReLU -> BatchNorm ->
    Dropout blocks, followed by a final linear output layer.

    Args:
        emb_szs: sequence of (num_categories, embedding_dim) pairs, one per
            categorical variable. May be empty.
        n_cont: number of continuous input features.
        out_sz: number of output units.
        layers: sequence of hidden-layer widths. May be empty, in which case
            the model reduces to a single linear layer on the inputs.
        p: dropout probability for the embeddings and every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the concatenated (embeddings + continuous) input vector.
        n_in = sum(nf for _, nf in emb_szs) + n_cont

        layerlist = []
        for width in layers:
            layerlist.append(nn.Linear(n_in, width))
            layerlist.append(nn.ReLU(inplace=True))
            layerlist.append(nn.BatchNorm1d(width))
            layerlist.append(nn.Dropout(p))
            n_in = width
        # n_in is the last hidden width, or the raw input width when no hidden
        # layers were requested (indexing layers[-1] would fail in that case).
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run the network.

        Args:
            x_cat: integer tensor of shape (batch, n_categorical); column i
                holds the indices for the i-th embedding.
            x_cont: float tensor of shape (batch, n_cont).

        Returns:
            Tensor of shape (batch, out_sz).
        """
        if len(self.embeds) > 0:
            embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeds)]
            x = self.emb_drop(torch.cat(embedded, 1))
            x_cont = self.bn_cont(x_cont)
            x = torch.cat([x, x_cont], 1)
        else:
            # No categorical variables: torch.cat on an empty list would raise.
            x = self.bn_cont(x_cont)
        return self.layers(x)

In [None]:
# Fix the RNG seed so that weight initialisation is reproducible, then build
# the network with ten hidden layers shrinking from 100 to 10 units.
torch.manual_seed(42)
hidden_sizes = list(range(100, 0, -10))  # 100, 90, ..., 10
model = TabularModel(emb_szs, conts_train.shape[1], 1, hidden_sizes, p=0.4)

In [None]:
# Display the module structure of the network.
model

In [None]:
# Mean-squared-error criterion and an Adam optimiser over all model parameters.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
import time

# Full-batch training: every epoch runs the whole training set through the
# model once and takes one optimiser step on the RMSE.
start_time = time.time()

epochs = 400
losses = []

# Make sure dropout and batch-norm are in training mode, even if an
# evaluation cell switched the model to eval mode earlier.
model.train()

for i in range(1, epochs + 1):
    # Predictions are float32 while the targets are float64; cast so that the
    # criterion receives matching dtypes.
    y_pred = model(cats_train, conts_train).double()
    loss = torch.sqrt(criterion(y_pred, y_train)) # RMSE
    # Store a plain float: keeping the tensor would retain every epoch's
    # autograd graph and cannot be plotted directly.
    losses.append(loss.item())

    # Report every 25th epoch only, to save screen space.
    if i % 25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [None]:
# Training-loss curve: one RMSE value per epoch.
sns.set(rc={'figure.figsize': (13, 10)})
sns.set_style("whitegrid")

ax = plt.gca()
ax.plot(range(epochs), losses)
ax.set_title("RMSE loss", size=20, pad=25)
ax.set_xlabel("Epoch", size=16)
ax.set_ylabel("RMSE", size=16)
#plt.savefig('Output/09CVchr_RMSEloss.png', bbox_inches='tight')

In [None]:
# EVALUATE Training set
# Switch to evaluation mode first: otherwise dropout stays active and
# batch-norm uses (and updates) batch statistics, so the reported error is
# noisy and not the error of the trained model.
model.eval()
with torch.no_grad():
    # Cast to float64 to match the targets, as in the training loop.
    y_pred_train = model(cats_train, conts_train).double()
    loss = torch.sqrt(criterion(y_pred_train, y_train)) # RMSE
print(f'RMSE of training set: on average, the predicted values are within +/- {loss:.8f} (RMSE) of the actual value.')

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination on the training set; r2_score takes the true
# values first and the predictions second.
train_r2 = r2_score(y_train, y_pred_train)
print("R2 score: ", train_r2)

In [None]:
#  EVALUATE TEST SET
# Evaluation mode disables dropout and makes batch-norm use its running
# statistics, so the held-out predictions are deterministic.
model.eval()
with torch.no_grad():
    # Cast to float64 to match the targets, as in the training loop.
    y_pred_test = model(cats_test, conts_test).double()
    loss = torch.sqrt(criterion(y_test, y_pred_test))
print(f'RMSE: on average, the predicted values are within +/- {loss:.8f} (RMSE) of the actual value.')
print("R2 score: ", r2_score(y_test,y_pred_test))

In [None]:
# Print predicted vs actual values for the first rows of the held-out set.
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
# Show at most 50 rows, and never more than are available.
n_shown = min(50, len(y_pred_test), len(y_test))
for i in range(n_shown):
    predicted = y_pred_test[i].item()
    actual = y_test[i].item()
    diff = abs(predicted - actual)
    print(f'{i+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')

In [None]:
# Flatten the (n, 1) prediction and target tensors into plain Python lists of floats.
y_pred_test2plot = y_pred_test.reshape(-1).tolist()
y_test2plot = y_test.reshape(-1).tolist()

In [None]:
# Comparing actual and predicted values: one row per held-out entry with its
# location label, written to CSV for the plotting cells.
df = pd.DataFrame({
    'Actual values': y_test2plot,
    'Predicted values': y_pred_test2plot,
    'Location': y_testlocat['locat'],
})
df.to_csv('Output/09_pytorch_CVchr_predictedValues.csv')

In [None]:
# Reload the saved predictions and sort by location so that the hue levels
# appear in alphabetical order, matching the palette order listed below.
df = pd.read_csv('Output/09_pytorch_CVchr_predictedValues.csv')
df = df.sort_values('Location')

# Order AND       FIN        GER       MLI        MLP        SPA        THI        THP          UKI
col =['#008cf9','#878500','#00bbad', '#006e00', '#984ea3', '#ff9287', '#b80058', '#ebac23', '#5954d6']

sns.set(rc={'figure.figsize':(13,10)})
sns.set_style("whitegrid")
sns.set_palette(col)
# NOTE(review): `sizes=(20)` is just the integer 20 and no `size=` variable is
# given, so it presumably has no effect on marker size — confirm whether
# `s=20` was intended.
s = sns.scatterplot(x='Predicted values', y='Actual values', hue='Location', sizes=(20), data=df) 
plt.title("Neural network | PyTorch | 1,000 - 1,000 SNPs", size= 16, pad=25)
plt.suptitle("Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)
#plt.ylim(min(entire['rFitness']),max(entire['rFitness']))
# Same fixed range on both axes so the plot is square in data units.
plt.xlim(-1,3)
plt.ylim(-1,3)
plt.xlabel("Predicted", size=16)
plt.ylabel("Actual", size=16)
plt.setp(s.get_legend().get_texts(), fontsize='16') # for legend text
plt.setp(s.get_legend().get_title(), fontsize='18') # for legend title
#plt.savefig('Output/09chrCV_PredActual.png', bbox_inches='tight')


In [None]:
# Sort by location so the facets / hue levels below follow alphabetical order.
df = df.sort_values('Location')

In [None]:
# Order AND       FIN        GER       MLI        MLP        SPA        THI        THP          UKI
col =['#008cf9','#878500','#00bbad', '#006e00', '#984ea3', '#ff9287', '#b80058', '#ebac23', '#5954d6']

# plot separated: one predicted-vs-actual panel per location, five per row
sns.set_style("whitegrid")
sns.set_palette(col)

p =sns.relplot(
    data=df, x='Predicted values', y='Actual values',
    col="Location", hue="Location",
    kind="scatter", col_wrap=5)
# Leave room above the panels for the figure-level title.
plt.subplots_adjust(top=0.9)
# The title carries this notebook's own identifier (NNPT-7-Chr1-HL10).
p.fig.suptitle(" NNPT-7-Chr1-HL10 | Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)

#plt.savefig('Output/09chrCV_PredActual_grid.png', bbox_inches='tight')

In [None]:
# Persist the weights only when a full training run has been recorded,
# i.e. one loss value per epoch; otherwise just tell the user.
if len(losses) != epochs:
    print('Model has not been trained. Consider loading a trained model instead.')
else:
    torch.save(model.state_dict(), 'pytorch_model_Chr1CV2.pt')