In [23]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/NLP


In [24]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import v_measure_score
from sklearn.cluster import KMeans, DBSCAN
import numpy as np
from torch.utils.data import TensorDataset
import torch.nn.functional as F
import random

In [25]:
USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [26]:
data = pd.read_csv("bibledata.csv")

In [27]:
means = torch.load("bible_similar_mean.pt", map_location = device)
embeddings = torch.load("bible_embeddings.pt", map_location = device)

styles = embeddings - means
target = F.normalize(styles)

In [28]:
similarity_matrix = embeddings.matmul(embeddings.T)

In [29]:
top_n = similarity_matrix.sort()[1][:,-7:-1]

In [30]:
random.seed(42)
dataset = TensorDataset(embeddings, target, torch.arange(len(data)).to(device))
train_dataloader = DataLoader(dataset, batch_size=64, shuffle=True, drop_last = True)

In [31]:
l1 = 256
l2 = 128
l3 = 64
z_dim = 32

In [32]:
class L2Norm(nn.Module):
    def __init__(self, dim=1, eps=1e-12):
        super(L2Norm, self).__init__()
        self.dim = dim
        self.eps = eps

    def forward(self, x):
        self.dim = len(x.shape) - 1
        return F.normalize(x, p=2, dim=self.dim, eps=self.eps)

In [33]:
def Encoder(x_dim, z_dim):
  model = nn.Sequential(
    nn.Linear(x_dim, l1),
    nn.ReLU(),
    nn.Linear(l1, l2),
    nn.ReLU(),
    nn.Linear(l2, l3),
    nn.ReLU(),
    nn.Linear(l3, z_dim))
  return model

In [34]:
def Decoder(x_dim, z_dim):
  model = nn.Sequential(
  nn.Linear(z_dim, l3),
    nn.ReLU(),
    nn.Linear(l3, l2),
    nn.ReLU(),
    nn.Linear(l2, l1),
    nn.ReLU(),
    nn.Linear(l1, x_dim),
    L2Norm())
  return model

In [35]:
class Model(nn.Module):
  def __init__(self, x_dim, z_dim, eps = 0.01):
      super(Model, self).__init__()
      self.Encoder = Encoder(x_dim, z_dim)
      self.Decoder = Decoder(x_dim, z_dim)
      self.eps = eps
  
  def forward(self, x):
      x += self.eps*torch.randn(x.shape).to(device)
      z = self.Encoder(x)
      out = self.Decoder(z)
        
      return out

In [36]:
epochs = 50
batch_size = 64

In [37]:
model = Model(384, z_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [38]:
for epoch in tqdm(range(epochs)):
    overall_loss = 0
    for batch_idx, (x, label, index) in enumerate(train_dataloader):
        x = x.view(batch_size, 384)
        nearest = embeddings[top_n[index]]

        optimizer.zero_grad()
        predict = model(x)
        nears = model.Encoder(nearest)

        loss = torch.sum(torch.square(1 - F.cosine_similarity(predict, label))) + torch.sum(torch.square(F.cosine_similarity(model.Encoder(x).unsqueeze(1), nears, dim=2)))/6
      
        overall_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    print("\tEpoch", epoch + 1, "complete!", "\tAverage Loss: ", overall_loss / (batch_idx*64))

  2%|▏         | 1/50 [00:00<00:48,  1.01it/s]

	Epoch 1 complete! 	Average Loss:  1.3445456523493111


  4%|▍         | 2/50 [00:02<00:48,  1.02s/it]

	Epoch 2 complete! 	Average Loss:  0.962127679801849


  6%|▌         | 3/50 [00:04<01:09,  1.48s/it]

	Epoch 3 complete! 	Average Loss:  0.8694246888878834


  8%|▊         | 4/50 [00:06<01:27,  1.90s/it]

	Epoch 4 complete! 	Average Loss:  0.8168718351657132


 10%|█         | 5/50 [00:08<01:22,  1.83s/it]

	Epoch 5 complete! 	Average Loss:  0.7723173157996442


 12%|█▏        | 6/50 [00:10<01:25,  1.94s/it]

	Epoch 6 complete! 	Average Loss:  0.7319485376398247


 14%|█▍        | 7/50 [00:12<01:19,  1.85s/it]

	Epoch 7 complete! 	Average Loss:  0.7002001806914088


 16%|█▌        | 8/50 [00:13<01:16,  1.82s/it]

	Epoch 8 complete! 	Average Loss:  0.6738692827253456


 18%|█▊        | 9/50 [00:15<01:15,  1.83s/it]

	Epoch 9 complete! 	Average Loss:  0.6528577129524874


 20%|██        | 10/50 [00:18<01:21,  2.03s/it]

	Epoch 10 complete! 	Average Loss:  0.6360258607261152


 22%|██▏       | 11/50 [00:21<01:29,  2.30s/it]

	Epoch 11 complete! 	Average Loss:  0.6230306111904512


 24%|██▍       | 12/50 [00:22<01:12,  1.90s/it]

	Epoch 12 complete! 	Average Loss:  0.6092233955860138


 26%|██▌       | 13/50 [00:23<00:59,  1.61s/it]

	Epoch 13 complete! 	Average Loss:  0.5937920198383102


 28%|██▊       | 14/50 [00:24<00:51,  1.42s/it]

	Epoch 14 complete! 	Average Loss:  0.5843610253678747


 30%|███       | 15/50 [00:24<00:44,  1.28s/it]

	Epoch 15 complete! 	Average Loss:  0.5736222773431295


 32%|███▏      | 16/50 [00:25<00:40,  1.18s/it]

	Epoch 16 complete! 	Average Loss:  0.5617665323507355


 34%|███▍      | 17/50 [00:26<00:36,  1.11s/it]

	Epoch 17 complete! 	Average Loss:  0.559093943202352


 36%|███▌      | 18/50 [00:27<00:34,  1.07s/it]

	Epoch 18 complete! 	Average Loss:  0.5519211782389376


 38%|███▊      | 19/50 [00:28<00:31,  1.03s/it]

	Epoch 19 complete! 	Average Loss:  0.5412515024822878


 40%|████      | 20/50 [00:29<00:30,  1.00s/it]

	Epoch 20 complete! 	Average Loss:  0.5318228399538132


 42%|████▏     | 21/50 [00:30<00:28,  1.02it/s]

	Epoch 21 complete! 	Average Loss:  0.5267020695539842


 44%|████▍     | 22/50 [00:31<00:28,  1.03s/it]

	Epoch 22 complete! 	Average Loss:  0.5208648461175253


 46%|████▌     | 23/50 [00:33<00:30,  1.12s/it]

	Epoch 23 complete! 	Average Loss:  0.5182867183024624


 48%|████▊     | 24/50 [00:34<00:30,  1.18s/it]

	Epoch 24 complete! 	Average Loss:  0.509480078895408


 50%|█████     | 25/50 [00:35<00:28,  1.14s/it]

	Epoch 25 complete! 	Average Loss:  0.5077911004962692


 52%|█████▏    | 26/50 [00:36<00:26,  1.09s/it]

	Epoch 26 complete! 	Average Loss:  0.5026912127273628


 54%|█████▍    | 27/50 [00:37<00:24,  1.04s/it]

	Epoch 27 complete! 	Average Loss:  0.4946322949176811


 56%|█████▌    | 28/50 [00:38<00:22,  1.02s/it]

	Epoch 28 complete! 	Average Loss:  0.48988947182534687


 58%|█████▊    | 29/50 [00:39<00:21,  1.00s/it]

	Epoch 29 complete! 	Average Loss:  0.4849968209683177


 60%|██████    | 30/50 [00:40<00:19,  1.01it/s]

	Epoch 30 complete! 	Average Loss:  0.4814058368105486


 62%|██████▏   | 31/50 [00:41<00:18,  1.02it/s]

	Epoch 31 complete! 	Average Loss:  0.47584478844361133


 64%|██████▍   | 32/50 [00:42<00:17,  1.03it/s]

	Epoch 32 complete! 	Average Loss:  0.4724971738206335


 66%|██████▌   | 33/50 [00:43<00:16,  1.04it/s]

	Epoch 33 complete! 	Average Loss:  0.46975202625056345


 68%|██████▊   | 34/50 [00:44<00:15,  1.04it/s]

	Epoch 34 complete! 	Average Loss:  0.4655366918767791


 70%|███████   | 35/50 [00:45<00:14,  1.00it/s]

	Epoch 35 complete! 	Average Loss:  0.463374932666859


 72%|███████▏  | 36/50 [00:46<00:15,  1.10s/it]

	Epoch 36 complete! 	Average Loss:  0.45984529910317384


 74%|███████▍  | 37/50 [00:47<00:15,  1.16s/it]

	Epoch 37 complete! 	Average Loss:  0.45690958596855763


 76%|███████▌  | 38/50 [00:48<00:13,  1.14s/it]

	Epoch 38 complete! 	Average Loss:  0.45032110343496484


 78%|███████▊  | 39/50 [00:49<00:12,  1.09s/it]

	Epoch 39 complete! 	Average Loss:  0.4470314607921853


 80%|████████  | 40/50 [00:50<00:10,  1.05s/it]

	Epoch 40 complete! 	Average Loss:  0.44377884143088236


 82%|████████▏ | 41/50 [00:51<00:09,  1.02s/it]

	Epoch 41 complete! 	Average Loss:  0.44160049417650843


 84%|████████▍ | 42/50 [00:52<00:08,  1.00s/it]

	Epoch 42 complete! 	Average Loss:  0.4383501837770623


 86%|████████▌ | 43/50 [00:53<00:06,  1.01it/s]

	Epoch 43 complete! 	Average Loss:  0.43540679887834804


 88%|████████▊ | 44/50 [00:54<00:05,  1.03it/s]

	Epoch 44 complete! 	Average Loss:  0.4317268239087369


 90%|█████████ | 45/50 [00:55<00:04,  1.03it/s]

	Epoch 45 complete! 	Average Loss:  0.42967897061123905


 92%|█████████▏| 46/50 [00:56<00:03,  1.03it/s]

	Epoch 46 complete! 	Average Loss:  0.42896650318639823


 94%|█████████▍| 47/50 [00:57<00:02,  1.03it/s]

	Epoch 47 complete! 	Average Loss:  0.42747125496347266


 96%|█████████▌| 48/50 [00:58<00:02,  1.02s/it]

	Epoch 48 complete! 	Average Loss:  0.4229946301644107


 98%|█████████▊| 49/50 [01:00<00:01,  1.12s/it]

	Epoch 49 complete! 	Average Loss:  0.42074720364019097


100%|██████████| 50/50 [01:01<00:00,  1.23s/it]

	Epoch 50 complete! 	Average Loss:  0.41704668086695384





In [39]:
zs = model.Encoder(embeddings).detach().cpu()
km = KMeans(n_clusters = 7)
km.fit(zs)
data["labels"] = km.labels_



In [40]:
v_measure_score(data.id, data.labels)

0.2829570355375063

In [41]:
km.fit(styles.cpu().detach())
data["ll"] = km.labels_
v_measure_score(data.id, data.ll)



0.31213204182992954