<a href="https://colab.research.google.com/github/cliffyster/transformers/blob/main/Transformers_Recipe_Trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install torch
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post7.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post7-py3-none-any.whl size=2950 sha256=fe1b335a1f6560097881df427b648b0f6031123b68baf4c26b9ad98b8cd8ac8f
  Stored in directory: /root/.cache/pip/wheels/c8/9c/85/72901eb50bc4bc6e3b2629378d172384ea3dfd19759c77fd2c
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post7


In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [3]:
df = pd.read_csv("/content/drive/MyDrive/full_dataset.csv")

In [4]:
# Select the first 50,000 rows
df = df.head(50000)

# Select only 'title' and 'NER' columns
df = df[['title', 'NER']]

In [7]:
# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42)  # use any number for random_state

# Select the first 50,000 rows and 'title' and 'NER' columns
df = df[['title', 'NER']].head(50000)

# Process data
df['text'] = df['title'] + ' ' + df['NER'].str.join(' ')

In [9]:
# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased')

# Check if a GPU is available and if not, use a CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

batch_size = 50  # Set your batch size to a value that your memory can handle

embeddings = []

for i in range(0, len(df['text']), batch_size):
    batch = df['text'][i:i+batch_size].tolist()
    inputs = tokenizer(batch, padding='longest', truncation=True, return_tensors='pt', max_length=128)

    # Compute the embeddings
    with torch.no_grad():
        batch_embeddings = model(**inputs.to(device)).last_hidden_state.mean(dim=1)

    embeddings.append(batch_embeddings)


In [14]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Concatenate all batch embeddings
embeddings = torch.cat(tuple(embeddings), dim=0)

# Convert tensor to numpy array
embeddings_np = embeddings.cpu().numpy()

tsne = TSNE(n_components=3)
embeddings_tsne = tsne.fit_transform(embeddings_np)

# Create a 3D plot
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], embeddings_tsne[:, 2])
plt.show()


ValueError: ignored