In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP

Mounted at /content/drive
/content/drive/MyDrive/NLP


In [7]:
import torch
import torch.nn as nn

In [4]:
USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')

In [5]:
embeddings = torch.load("emb.pt").to(device = device)

In [44]:
from torch.utils.data import DataLoader

In [92]:
train_dataloader = DataLoader(embeddings, batch_size=64, shuffle=True)

In [62]:
def q_phi(z_dim=10, x_dim=384):
  model = torch.nn.Sequential(                           
    nn.Linear(x_dim, 192),
    nn.ReLU(),
    nn.Linear(192, 96),
    nn.ReLU(),
    nn.Linear(96, 48),
    nn.ReLU(),
    nn.Linear(48, z_dim)
  )
  return model

In [60]:
def p_theta(z_dim=10, x_dim=384):
  model = torch.nn.Sequential(
    
    nn.Linear(z_dim, 48),
    nn.ReLU(),
    nn.Linear(48, 96),
    nn.ReLU(),
    nn.Linear(96, 192),
    nn.ReLU(),
    nn.Linear(192, x_dim)
  )
  return model        

In [97]:
def vae_loss(x, x_logit, z_mu, z_logvar, beta):
  recon_loss = None
  kl_loss = None

  # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
  recon_loss = nn.functional.cross_entropy(x, x_logit)
  kl_loss = 0.5 * torch.mean(torch.square(z_mu) + torch.exp(z_logvar) - z_logvar - 1, axis = 1)
  # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
  vae_loss = recon_loss + torch.mean(beta * kl_loss)
  return vae_loss

In [89]:
class Model(nn.Module):
  def __init__(self, Encoder_Mu, Encoder_Var, Decoder, z_dim):
        super(Model, self).__init__()
        self.Encoder_Mu = Encoder_Mu
        self.Encoder_Var = Encoder_Var
        self.Decoder = Decoder
        self.z_dim = z_dim
  
  def sample_z(self, mu, log_var):
    eps = torch.randn(mu.size()).to(device)
    samples = mu + torch.exp(0.5 * log_var) * eps
    return samples
  
  def forward(self, x):
      mu = self.Encoder_Mu(x)
      var = self.Encoder_Var(x)
      z = self.sample_z(mu, var)
      x_hat = self.Decoder(z)
        
      return x_hat, mu, var

In [90]:
enc_mu = q_phi(z_dim = 10)
enc_var = q_phi(z_dim = 10)
dec = p_theta(z_dim = 10)
model = Model(enc_mu, enc_var, dec, 10)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [50]:
epochs = 5
beta = 10

In [98]:
for epoch in range(epochs):
    overall_loss = 0
    for batch_idx, x in enumerate(train_dataloader):
        x = x.view(64, 384)
        x = x.to(device)

        optimizer.zero_grad()

        x_hat, mean, log_var = model(x)
        loss = vae_loss(x, x_hat, mean, log_var, beta)
      
        overall_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
    print("\tEpoch", epoch + 1, "complete!", "\tAverage Loss: ", overall_loss / (batch_idx*64))

	Epoch 1 complete! 	Average Loss:  -2.1546967489708317e+20
	Epoch 2 complete! 	Average Loss:  -2.0765088019476156e+21
	Epoch 3 complete! 	Average Loss:  -6.259360711236075e+21
	Epoch 4 complete! 	Average Loss:  -1.2996859283398677e+22
	Epoch 5 complete! 	Average Loss:  -2.117523804032861e+22


In [109]:
zs = torch.zeros(1024, 64, 10)

with torch.no_grad():
  for i, x in enumerate(train_dataloader):
    z_mu = model.Encoder_Mu(x)
    z_var = model.Encoder_Var(x)
    z_sample = model.sample_z(z_mu, z_var)
    zs[i] = z_sample

In [111]:
zs = zs.reshape(65536, 10)

In [113]:
!pip install kmeans_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kmeans_pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans_pytorch
Successfully installed kmeans_pytorch-0.3


In [114]:
from kmeans_pytorch import kmeans

In [115]:
cluster_ids_x, cluster_centers = kmeans(
    X=zs, num_clusters=10, distance='euclidean', device=torch.device('cuda:0')
)

running k-means on cuda:0..


[running kmeans]: 227it [00:10, 21.30it/s, center_shift=0.000000, iteration=227, tol=0.000100]


In [118]:
import pandas as pd

In [120]:
sentences = pd.read_csv("sentences.csv")

In [121]:
data = sentences[0:65536]

In [123]:
data["cluster"] = cluster_ids_x.numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["cluster"] = cluster_ids_x.numpy()


In [128]:
clustered = data.groupby("post_id")['cluster'].agg(lambda x: pd.Series.mode(x)[0])

In [144]:
clustered[0:10]

post_id
0    2
1    1
2    6
3    6
4    6
5    5
6    9
7    1
8    6
9    7
Name: cluster, dtype: int64

In [130]:
apost = pd.read_csv("author_post.csv")

In [134]:
aaa = apost[0:6595]

In [136]:
aaa["cluster"] = clustered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aaa["cluster"] = clustered


In [138]:
from sklearn.metrics import v_measure_score

In [139]:
v_measure_score(aaa["author_id"], aaa["cluster"])

0.031596364080353496

In [142]:
v_measure_score(aaa["author_id"], aaa["ga"])

0.7070172339451275

In [157]:
v_measure_score(aaa["ga"], aaa["cluster"])

0.005019681309241231