In [None]:
import pandas as pd
import torch
from datasets import load_dataset

# Load the "amazon_polarity" dataset through Hugging Face `datasets`.
dataset = load_dataset("amazon_polarity")
# Bare expression: shows the loaded object as the cell output.
dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from arize.pandas.embeddings import EmbeddingGenerator, UseCases
# Checkpoint identifier reused by the embedding generator, tokenizer and classifier.
model_name = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# Embedding generator configured for the sequence-classification use case.
embedding_settings = {
    "use_case": UseCases.NLP.SEQUENCE_CLASSIFICATION,
    "model_name": model_name,
}
generator = EmbeddingGenerator.from_use_case(**embedding_settings)

# Number of classes declared by the `label` feature of the training split.
label_feature = dataset['train'].features['label']
num_labels = label_feature.num_classes
print(f"Number of labels: {num_labels}")

In [None]:

# Load the tokenizer and the sequence-classification model for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# TODO: `num_labels` (derived from the dataset) is not passed here, so the model
# keeps whatever classification head the checkpoint ships with.
# NOTE(review): the earlier note claimed the model "only supports 3 labels" —
# confirm the checkpoint's label count actually matches `num_labels`.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


In [None]:
# Sizes of the training subsets to carve out of the train split.
dataset_sizes = [100, 500, 1000, 5000, 10000, 50000]

# Maps each requested size to its subset. Despite the name, the subsets are
# disjoint, back-to-back slices: each one starts where the previous one ended.
nested_datasets = {}
previous_end_index = 0
total_train_rows = len(dataset["train"])  # loop-invariant, so computed once

for size in dataset_sizes:
    # End of the current slice, clamped to the number of rows actually available.
    end_index = previous_end_index + size
    if end_index > total_train_rows:
        print(f"Requested size for {size} exceeds available data. Adjusting to maximum available.")
        end_index = total_train_rows

    # Select the slice [previous_end_index, end_index) from the train split.
    subset = dataset["train"].select(range(previous_end_index, end_index))
    nested_datasets[size] = subset

    # The next slice starts where this one ended.
    previous_end_index = end_index

train_ds = nested_datasets[1000]
test_ds = dataset["test"].select(range(100))

# Convert with `Dataset.to_pandas()` rather than `Dataset.data.to_pandas()`:
# `.data` is the backing Arrow table, which is not guaranteed to reflect the
# row selection made by `.select()`, so it could yield far more rows than the
# subset. `to_pandas()` always materialises exactly the selected rows.
train_df = train_ds.to_pandas()
test_df = test_ds.to_pandas()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Print `describe()` summary statistics for the training frame, then the test frame.
for heading, frame in (
    ("Training Dataset Summary Statistics:", train_df),
    ("\nTest Dataset Summary Statistics:", test_df),
):
    print(heading)
    print(frame.describe())


In [None]:
# Count plot of how many training rows carry each label value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='label', data=train_df, ax=ax)
ax.set_title('Label Distribution in Training Data')
ax.set_xlabel('Label')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Store `len()` of every review's content as a new column on the training frame.
train_df['review_length'] = train_df['content'].map(len)

# Histogram of those lengths (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['review_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Lengths in Training Data')
ax.set_xlabel('Review Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Most common words in positive and negative reviews
from sklearn.feature_extraction.text import CountVectorizer



# Adjust the function to fix the IndexError
def _ranked_word_counts(reviews):
    """Return ``(word, count)`` pairs for ``reviews``, most frequent first.

    A fresh ``CountVectorizer(stop_words='english')`` is fitted on ``reviews``
    and the per-word counts are summed over all documents.
    """
    vectorizer = CountVectorizer(stop_words='english')
    totals = vectorizer.fit_transform(reviews).sum(axis=0)
    # Feature names are listed in column order, so position == column index.
    pairs = [(word, totals[0, idx])
             for idx, word in enumerate(vectorizer.get_feature_names_out())]
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)


def _plot_top_words(ranked, n, title):
    """Draw a bar chart of the first ``n`` ``(word, count)`` pairs in ``ranked``."""
    words, freq = zip(*ranked[:n])
    plt.figure(figsize=(10, 5))
    plt.bar(words, freq)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()


def plot_most_common_words_corrected(df, n=20):
    """Plot the most frequent words in positive and in negative reviews.

    Args:
        df: DataFrame with a ``label`` column (1 selects positive rows,
            0 selects negative rows) and a ``content`` column of review text.
        n: Number of top words shown in each bar chart.

    Two figures are shown: positive reviews first, then negative reviews.
    """
    pos_reviews = df[df['label'] == 1]['content']
    neg_reviews = df[df['label'] == 0]['content']

    # Rank both classes before plotting so a vectorizer failure on either
    # class surfaces before any figure is drawn.
    pos_words_freq = _ranked_word_counts(pos_reviews)
    neg_words_freq = _ranked_word_counts(neg_reviews)

    _plot_top_words(pos_words_freq, n, 'Most common words in positive reviews')
    _plot_top_words(neg_words_freq, n, 'Most common words in negative reviews')


# Plot the top words per sentiment class for the training frame (default n=20).
plot_most_common_words_corrected(train_df)


In [None]:
# Import necessary libraries
from wordcloud import WordCloud, STOPWORDS


# Generate and display a word cloud for an arbitrary block of text
def generate_word_cloud(text, title):
    """Render a word cloud built from ``text`` and show it under ``title``.

    Args:
        text: String the word cloud is generated from.
        title: Title placed on the figure.
    """
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=set(STOPWORDS),
                          min_font_size=10).generate(text)

    # Plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout: the layout pass only accounts for
    # artists that already exist, so a title added afterwards gets no room
    # reserved when pad=0.
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()


# Join every review of each sentiment class into one space-separated string.
positive_mask = train_df['label'] == 1
negative_mask = train_df['label'] == 0
pos_reviews_text = " ".join(train_df.loc[positive_mask, 'content'])
neg_reviews_text = " ".join(train_df.loc[negative_mask, 'content'])

# One word cloud per sentiment class, positive first.
for cloud_text, cloud_title in (
    (pos_reviews_text, "Word Cloud for Positive Reviews"),
    (neg_reviews_text, "Word Cloud for Negative Reviews"),
):
    generate_word_cloud(cloud_text, cloud_title)


In [None]:
# Add a `content_vector` embedding column to both frames (train first, then test).
for frame in (train_df, test_df):
    frame["content_vector"] = generator.generate_embeddings(text_col=frame["content"])

In [None]:
from transformers import TrainingArguments

training_batch_size = 16
training_epochs = 3
# One logging step per `training_batch_size` rows of the training frame.
# Floored at 1 so a frame smaller than one batch cannot produce a step count of 0.
logging_steps = max(1, len(train_df) // training_batch_size)

# Write training output to a directory that is NOT named after the Hub repo id.
# A local directory whose path equals `model_name` can shadow the Hub checkpoint,
# making a later `from_pretrained(model_name)` resolve to that folder instead.
output_dir = f"{model_name.split('/')[-1]}-finetuned-amazon-polarity"

# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=training_epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=training_batch_size,
                                  per_device_eval_batch_size=training_batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  optim="adamw_torch"
                                  )


In [None]:
def tokenize(batch, max_length=512):
    """Tokenize the ``content`` entries of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping whose ``'content'`` key holds the text to tokenize.
        max_length: Upper bound passed to the tokenizer; truncation is enabled.

    Returns:
        Whatever the tokenizer call returns for the padded, truncated batch.
    """
    encoding_options = dict(padding=True, truncation=True, max_length=max_length)
    return tokenizer(batch['content'], **encoding_options)
   
process_batch_size = 100  # rows handed to the tokenizer per `map` call
max_size = 100  # max length of the tokenized sequence


def _tokenize_to_max_size(batch):
    """Tokenize ``batch`` with ``max_size`` as the maximum sequence length."""
    return tokenize(batch, max_size)


# Same mapping options for both splits: batched processing in fixed-size chunks.
map_options = {"batched": True, "batch_size": process_batch_size}
train_ds = train_ds.map(_tokenize_to_max_size, **map_options)
test_ds = test_ds.map(_tokenize_to_max_size, **map_options)

In [None]:
# Sanity check: show the type of `train_ds` after the tokenization `map` call.
type(train_ds)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer

# Wire the model, training arguments, metric function and tokenized splits
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

# Baseline evaluation, recorded as epoch 0. The result is stored as
# `baseline_metrics` rather than `eval` so the builtin `eval` is not shadowed
# for the rest of the kernel session.
baseline_metrics = trainer.evaluate()
eval_df = pd.DataFrame(
    {
        'Epoch': 0,
        'Validation Loss': baseline_metrics['eval_loss'],
        'Accuracy': baseline_metrics['eval_accuracy'],
        'F1': baseline_metrics['eval_f1'],
    },
    index=[0],
)

eval_df

In [None]:
# Ask PyTorch to release its unused cached CUDA memory before training starts.
torch.cuda.empty_cache()

print("\nTraining...")
# Run the training loop of the `trainer` object.
trainer.train()

In [None]:
# ## TODO (WIP, currently broken): ideally, extract the embeddings and predictions from the model and add them to the dataset so they can be inspected in Phoenix.
# def postprocess(batch):
#     inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
#     with torch.no_grad():
#         out = model(**inputs)                         # Extract prediction labels
#         pred_label = out.logits.argmax(dim=1)           # Extract embedding vectors
#         hidden_states = torch.stack(out.hidden_states)  # (layer_#, batch_size, seq_length/or/num_tokens, hidden_size)
#         embeddings = hidden_states[-1][:,0,:]           # Select last layer, then CLS token vector
#     return {"text_vector": embeddings.cpu().numpy(), "pred_label": pred_label.cpu().numpy()}
# 
# batch_size = 100
# 
# 
# train_ds.set_format("torch", columns=["input_ids", "attention_mask"])
# train_ds = train_ds.map(postprocess, batched=True, batch_size=batch_size)
# 
# test_ds.set_format("torch", columns=["input_ids", "attention_mask"])
# test_ds = test_ds.map(postprocess, batched=True, batch_size=batch_size)

In [None]:
import phoenix as px

# Schema shared by both Phoenix datasets: the ground-truth label column plus
# one text embedding (vector column + the raw text it was computed from).
schema = px.Schema(
    actual_label_column_name="label",
    embedding_feature_column_names={
        "text_embedding": px.EmbeddingColumnNames(
            vector_column_name="content_vector", raw_data_column_name="content"
        ),
    },
    ## not working yet
    # prediction_label_column_name="pred_label",
)

# Each dataset wraps the frame that matches its name: the test frame is
# "testing" and the training frame is "training". (They were previously swapped.)
test_ds_px = px.Dataset(dataframe=test_df, schema=schema, name="testing")
train_ds_px = px.Dataset(dataframe=train_df, schema=schema, name="training")

# Launch Phoenix with the test data as primary and the training data as reference.
session = px.launch_app(primary=test_ds_px, reference=train_ds_px)


In [None]:
# Display the currently active Phoenix session's view as the cell output.
px.active_session().view()