In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/NLP

Mounted at /content/drive
/content/drive/MyDrive/NLP


In [None]:
!pip install kmeans_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kmeans_pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans_pytorch
Successfully installed kmeans_pytorch-0.3


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from kmeans_pytorch import kmeans
import pandas as pd

In [None]:
USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')

In [None]:
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import v_measure_score

In [None]:
embeddings = torch.load("emb.pt", map_location = torch.device("cpu"))

In [None]:
sentences = pd.read_csv("sentences.csv")
data = sentences[0:65536]

In [None]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence-transformers)
  Downloading huggingface_hub-0.14.1-py3-

In [None]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
model=SentenceTransformer('all-MiniLM-L6-v2')
top_n = 3

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
keys = torch.load("keys.pt")

for i in tqdm(range(len(data))):
    try:
      count = CountVectorizer(ngram_range=(3, 3),stop_words=None).fit([data.text[i]])
      candidates = count.get_feature_names_out()
    except ValueError as e:
      candidates = [""]
    candidate_embeddings = model.encode(candidates)
    distances=cosine_similarity([embeddings.cpu().numpy()[i]], candidate_embeddings)
    keywords_embeddings = np.array([candidate_embeddings[index] for index in distances.argsort()[0][-top_n:]])
    mean_keywords=np.mean(keywords_embeddings,axis=0)
    keys[i] = torch.tensor(mean_keywords)

torch.save(keys, "keys.pt")

100%|██████████| 27311/27311 [26:02<00:00, 17.48it/s]


In [None]:
keys = torch.load("keys.pt")

In [None]:
from torch.utils.data import TensorDataset
dataset = TensorDataset(embeddings, keys)

In [None]:
train_dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [None]:
def q_phi(z_dim=10, x_dim=384):
  model = torch.nn.Sequential(                           
    nn.Linear(x_dim, 192),
    nn.ReLU(),
    nn.Linear(192, 96),
    nn.ReLU(),
    nn.Linear(96, 48),
    nn.ReLU(),
    nn.Linear(48, z_dim)
  )
  return model

In [None]:
def p_theta(z_dim=10, x_dim=384):
  model = torch.nn.Sequential(
    
    nn.Linear(z_dim, 48),
    nn.ReLU(),
    nn.Linear(48, 96),
    nn.ReLU(),
    nn.Linear(96, 192),
    nn.ReLU(),
    nn.Linear(192, x_dim)
  )
  return model        

In [None]:
def vae_loss(x, x_logit, z_mu, z_logvar, beta, predict, anti_predict, label):
  recon_loss = nn.functional.cross_entropy(x, x_logit)
  kl_loss = 0.5 * torch.mean(torch.square(z_mu) + torch.exp(z_logvar) - z_logvar - 1, axis = 1)
  prediction_loss = torch.mean(nn.functional.cosine_similarity(predict, label))
  anti_prediction_loss = torch.mean(nn.functional.cosine_similarity(anti_predict, label))
  
  vae_loss = recon_loss + torch.mean(beta * kl_loss) + prediction_loss - anti_prediction_loss
  return vae_loss

In [None]:
class Model(nn.Module):
  def __init__(self, Encoder_Mu, Encoder_Var, Decoder, Predictor, AntiPredictor, z_dim):
        super(Model, self).__init__()
        self.Encoder_Mu = Encoder_Mu
        self.Encoder_Var = Encoder_Var
        self.Decoder = Decoder
        self.Predictor = Predictor
        self.AntiPredictor = AntiPredictor
        self.z_dim = z_dim
  
  def sample_z(self, mu, log_var):
    eps = torch.randn(mu.size())
    samples = mu + torch.exp(0.5 * log_var) * eps
    return samples
  
  def forward(self, x):
      mu = self.Encoder_Mu(x)
      var = self.Encoder_Var(x)
      z = self.sample_z(mu, var)
      x_hat = self.Decoder(z)
      predict = self.Predictor(torch.transpose(torch.transpose(z, 0, 1)[:self.z_dim-2], 0, 1))
      anti_predict = self.AntiPredictor(z)
        
      return x_hat, mu, var, predict, anti_predict

In [None]:
enc_mu = q_phi(z_dim = 10)
enc_var = q_phi(z_dim = 10)
dec = p_theta(z_dim = 10)
predictor = p_theta(z_dim = 8)
anti_predictor = p_theta(z_dim = 10)
model = Model(enc_mu, enc_var, dec, predictor, anti_predictor, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
epochs = 5
beta = 10

In [None]:
for epoch in tqdm(range(epochs)):
    overall_loss = 0
    for batch_idx, (x, label) in tqdm(enumerate(train_dataloader)):
        x = x.view(64, 384)
        #x = x.to(device)

        optimizer.zero_grad()

        x_hat, mean, log_var, predict, anti_predict = model(x)
        loss = vae_loss(x, x_hat, mean, log_var, beta, predict, anti_predict, label)
      
        overall_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        
    print("\tEpoch", epoch + 1, "complete!", "\tAverage Loss: ", overall_loss / (batch_idx*64))

  0%|          | 0/5 [00:00<?, ?it/s]
0it [00:00, ?it/s][A
4it [00:00, 34.62it/s][A
9it [00:00, 40.00it/s][A
14it [00:00, 43.23it/s][A
21it [00:00, 52.91it/s][A
27it [00:00, 54.77it/s][A
33it [00:00, 55.15it/s][A
39it [00:00, 56.01it/s][A
45it [00:00, 56.58it/s][A
52it [00:00, 58.27it/s][A
59it [00:01, 60.06it/s][A
66it [00:01, 60.21it/s][A
73it [00:01, 58.90it/s][A
79it [00:01, 57.84it/s][A
85it [00:01, 57.95it/s][A
92it [00:01, 59.36it/s][A
98it [00:01, 58.97it/s][A
104it [00:01, 57.66it/s][A
110it [00:01, 56.89it/s][A
116it [00:02, 56.40it/s][A
122it [00:02, 57.21it/s][A
129it [00:02, 58.36it/s][A
136it [00:02, 59.22it/s][A
143it [00:02, 59.75it/s][A
149it [00:02, 58.49it/s][A
155it [00:02, 58.79it/s][A
162it [00:02, 58.98it/s][A
168it [00:02, 58.84it/s][A
175it [00:03, 59.41it/s][A
181it [00:03, 58.45it/s][A
187it [00:03, 58.17it/s][A
193it [00:03, 57.67it/s][A
199it [00:03, 57.38it/s][A
205it [00:03, 56.52it/s][A
211it [00:03, 52.22it/s][A
217it 

	Epoch 1 complete! 	Average Loss:  -2.5515472775189178e+17



0it [00:00, ?it/s][A
7it [00:00, 60.99it/s][A
14it [00:00, 57.60it/s][A
20it [00:00, 57.37it/s][A
26it [00:00, 57.20it/s][A
32it [00:00, 56.87it/s][A
38it [00:00, 56.82it/s][A
44it [00:00, 57.03it/s][A
50it [00:00, 55.11it/s][A
56it [00:00, 55.43it/s][A
62it [00:01, 55.53it/s][A
68it [00:01, 55.03it/s][A
74it [00:01, 56.34it/s][A
80it [00:01, 56.93it/s][A
86it [00:01, 57.15it/s][A
92it [00:01, 56.79it/s][A
98it [00:01, 54.87it/s][A
104it [00:01, 55.22it/s][A
110it [00:01, 54.68it/s][A
116it [00:02, 53.33it/s][A
122it [00:02, 51.60it/s][A
128it [00:02, 51.87it/s][A
134it [00:02, 52.08it/s][A
140it [00:02, 51.03it/s][A
146it [00:02, 49.86it/s][A
152it [00:02, 49.56it/s][A
157it [00:02, 49.64it/s][A
162it [00:03, 48.74it/s][A
167it [00:03, 48.30it/s][A
172it [00:03, 46.21it/s][A
177it [00:03, 47.22it/s][A
184it [00:03, 53.40it/s][A
192it [00:03, 59.92it/s][A
200it [00:03, 63.90it/s][A
208it [00:03, 66.84it/s][A
216it [00:03, 68.79it/s][A
223it [00:03, 

	Epoch 2 complete! 	Average Loss:  -1.199394355274972e+20



0it [00:00, ?it/s][A
6it [00:00, 56.34it/s][A
12it [00:00, 56.79it/s][A
18it [00:00, 53.78it/s][A
24it [00:00, 51.92it/s][A
30it [00:00, 52.53it/s][A
36it [00:00, 52.42it/s][A
42it [00:00, 53.35it/s][A
48it [00:00, 52.20it/s][A
54it [00:01, 52.58it/s][A
60it [00:01, 52.78it/s][A
66it [00:01, 53.23it/s][A
72it [00:01, 52.96it/s][A
78it [00:01, 51.80it/s][A
84it [00:01, 27.54it/s][A
89it [00:02, 21.35it/s][A
93it [00:02, 21.91it/s][A
96it [00:02, 22.23it/s][A
103it [00:02, 30.02it/s][A
111it [00:02, 38.92it/s][A
119it [00:02, 46.67it/s][A
127it [00:03, 53.15it/s][A
134it [00:03, 56.68it/s][A
142it [00:03, 60.81it/s][A
150it [00:03, 63.80it/s][A
157it [00:03, 64.96it/s][A
164it [00:03, 65.25it/s][A
172it [00:03, 68.15it/s][A
180it [00:03, 69.36it/s][A
188it [00:03, 69.90it/s][A
196it [00:04, 71.49it/s][A
204it [00:04, 70.60it/s][A
212it [00:04, 69.55it/s][A
219it [00:04, 69.39it/s][A
226it [00:04, 69.43it/s][A
233it [00:04, 69.15it/s][A
240it [00:04, 6

	Epoch 3 complete! 	Average Loss:  -2.2445882715596556e+21



0it [00:00, ?it/s][A
7it [00:00, 64.84it/s][A
14it [00:00, 66.75it/s][A
21it [00:00, 67.16it/s][A
29it [00:00, 69.17it/s][A
37it [00:00, 71.13it/s][A
45it [00:00, 70.58it/s][A
53it [00:00, 71.83it/s][A
61it [00:00, 72.60it/s][A
69it [00:00, 71.21it/s][A
77it [00:01, 70.10it/s][A
85it [00:01, 70.92it/s][A
93it [00:01, 71.61it/s][A
101it [00:01, 71.32it/s][A
109it [00:01, 72.75it/s][A
117it [00:01, 71.46it/s][A
125it [00:01, 71.55it/s][A
133it [00:01, 72.20it/s][A
141it [00:01, 70.96it/s][A
149it [00:02, 69.00it/s][A
156it [00:02, 67.84it/s][A
163it [00:02, 67.47it/s][A
170it [00:02, 67.73it/s][A
178it [00:02, 68.73it/s][A
185it [00:02, 68.56it/s][A
193it [00:02, 69.54it/s][A
201it [00:02, 69.97it/s][A
208it [00:02, 68.49it/s][A
216it [00:03, 70.98it/s][A
224it [00:03, 71.54it/s][A
232it [00:03, 70.81it/s][A
240it [00:03, 69.24it/s][A
248it [00:03, 70.27it/s][A
256it [00:03, 71.45it/s][A
264it [00:03, 71.53it/s][A
272it [00:03, 70.84it/s][A
280it [00:

	Epoch 4 complete! 	Average Loss:  -8.4863371179566e+21



0it [00:00, ?it/s][A
7it [00:00, 66.49it/s][A
14it [00:00, 67.14it/s][A
21it [00:00, 67.60it/s][A
29it [00:00, 70.33it/s][A
37it [00:00, 67.42it/s][A
44it [00:00, 66.80it/s][A
51it [00:00, 67.07it/s][A
59it [00:00, 68.57it/s][A
66it [00:00, 68.90it/s][A
73it [00:01, 68.79it/s][A
80it [00:01, 67.84it/s][A
87it [00:01, 67.76it/s][A
94it [00:01, 67.79it/s][A
101it [00:01, 68.12it/s][A
108it [00:01, 68.14it/s][A
115it [00:01, 66.71it/s][A
122it [00:01, 65.13it/s][A
129it [00:01, 64.14it/s][A
137it [00:02, 66.60it/s][A
145it [00:02, 68.85it/s][A
153it [00:02, 70.10it/s][A
161it [00:02, 68.32it/s][A
169it [00:02, 69.58it/s][A
176it [00:02, 69.12it/s][A
183it [00:02, 69.15it/s][A
190it [00:02, 68.91it/s][A
198it [00:02, 69.44it/s][A
206it [00:03, 70.48it/s][A
214it [00:03, 71.69it/s][A
222it [00:03, 72.97it/s][A
230it [00:03, 71.17it/s][A
238it [00:03, 71.53it/s][A
246it [00:03, 71.36it/s][A
254it [00:03, 69.77it/s][A
261it [00:03, 69.72it/s][A
268it [00:0

	Epoch 5 complete! 	Average Loss:  -1.8520119277702097e+22





In [None]:
zs = torch.zeros(1024, 64, 10)

with torch.no_grad():
  for i, (x, label) in enumerate(train_dataloader):
    z_mu = model.Encoder_Mu(x)
    z_var = model.Encoder_Var(x)
    z_sample = model.sample_z(z_mu, z_var)
    zs[i] = z_sample

In [None]:
torch.save(zs, "zs.pt")

In [None]:
zs = torch.load("zs.pt")

In [None]:
zs = zs.reshape(65536, 10)
##zs = torch.transpose(zs, 0, 1)[:8]
##zs = torch.transpose(zs, 0, 1)

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters = 10)
kmeans.fit(zs)
cluster_ids_x = torch.tensor(kmeans.labels_)



In [None]:
data["cluster"] = cluster_ids_x.numpy()
clustered = data.groupby("post_id")['cluster'].agg(lambda x: pd.Series.mode(x)[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["cluster"] = cluster_ids_x.numpy()


In [None]:
apost = pd.read_csv("author_post.csv")
aaa = apost[0:6595]
aaa["cluster"] = clustered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aaa["cluster"] = clustered


In [None]:
v_measure_score(aaa["author_id"], aaa["cluster"])

0.03041823057273694