<a href="https://colab.research.google.com/github/elephanti/NLPProject2024/blob/main/Train_and_Test_with_augmented_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository into the current working directory; the later
# cells read their CSV inputs from, and write their plots under, NLPProject2024/.
!git clone https://github.com/elephanti/NLPProject2024.git

Cloning into 'NLPProject2024'...
remote: Enumerating objects: 1084, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 1084 (delta 4), reused 5 (delta 2), pack-reused 1072 (from 1)[K
Receiving objects: 100% (1084/1084), 37.16 MiB | 6.09 MiB/s, done.
Resolving deltas: 100% (422/422), done.
Updating files: 100% (629/629), done.


In [2]:
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: kaleido
Successfully installed kaleido-0.2.1


In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from transformers import DistilBertTokenizer, DistilBertModel

In [6]:
def get_distilbert_embeddings(text):
    """Return a mean-pooled DistilBERT embedding for ``text`` as a NumPy array.

    The text is tokenized (with truncation and padding enabled), run through the
    module-level ``distilbert_model`` without gradient tracking, and the last
    hidden states are averaged over the token axis.

    Args:
        text: The input passed straight to the module-level ``tokenizer``. In this
            notebook it is a single sentence per call.

    Returns:
        numpy.ndarray: the token-averaged hidden state with all size-1 axes
        squeezed out (a 1-D vector for a single sentence).

    Note:
        The average is taken over every token position, so if a batch of
        sentences is passed, padding positions contribute to the mean as well.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        token_states = distilbert_model(**encoded).last_hidden_state

    # Collapse the token axis (dim 1) into a single vector per input.
    pooled = token_states.mean(dim=1)
    return pooled.squeeze().detach().numpy()

In [7]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint so
# that token ids and model vocabulary agree. `get_distilbert_embeddings` looks up
# these two module-level names when it is called.
DISTILBERT_CHECKPOINT = 'distilbert-base-cased'

tokenizer = DistilBertTokenizer.from_pretrained(DISTILBERT_CHECKPOINT)
distilbert_model = DistilBertModel.from_pretrained(DISTILBERT_CHECKPOINT)

In [9]:
# Seed subset vs. generated data.
# For every (subset size, dataset, generator model, augmentation algorithm)
# combination: embed the sampled seed subset and the filtered generated examples
# with DistilBERT, reduce to 2-D (PCA followed by t-SNE), and save/show one
# scatter plot coloured by data source.

# File-name prefix used in the filtered CSV names for each generator model.
model_prefixes = {
    'Llama3': 'Llama3_8B_',
    'Mistral': 'Mistral_7B_',
    'GPT2': 'GPT2_',
}
# The path for this combination is built with "Llama_8B_" (no "3") instead of the
# usual Llama3 prefix. NOTE(review): presumably mirrors how the files are named
# in the repository - confirm.
prefix_overrides = {
    ('Lambada+Instruct', 'Llama3'): 'Llama_8B_',
}
# File-name infix used in the filtered CSV names for each augmentation algorithm.
algo_prefixes = {
    'Lambada+': 'lambada_plus_',
    'Lambada': '',
    'Lambada+Instruct': 'lambada_plus_instruct_',
}
# (algo, model) combinations that are not plotted.
skipped_pairs = {
    ('Lambada+Instruct', 'GPT2'),
    ('Lambada+Instruct', 'Mistral'),
    ('Lambada+', 'GPT2'),
}

for subset_size in ['5', '10']:
    for dataset in ['ATIS', 'TREC']:
        # The seed subset depends only on (dataset, subset_size), so it is read
        # and embedded once here instead of once per model/algorithm pair.
        sub_dataset_path = f'NLPProject2024/datasets/{dataset}/sampled_subsets/ver1/{dataset.lower()}_{subset_size}_subset.csv'
        df = pd.read_csv(sub_dataset_path)
        df['embedding'] = df['text'].apply(get_distilbert_embeddings)
        # 'label' is (re)used as the plot legend: it names the data source.
        df['label'] = f"{dataset} {subset_size} subset"

        for model in ['Llama3', 'Mistral', 'GPT2']:
            for algo in ['Lambada+', 'Lambada', 'Lambada+Instruct']:
                if (algo, model) in skipped_pairs:
                    continue

                # Resolve the prefixes per iteration so that the special case
                # cannot leak into any other (algo, model) combination.
                prefix = prefix_overrides.get((algo, model), model_prefixes[model])
                algo_prefix = algo_prefixes[algo]

                # Load and embed the filtered generated examples.
                filtered_dataset_path = f'NLPProject2024/filtered_datasets/{algo}/{model}/{dataset}/bert_ktrain/{prefix}{algo_prefix}{dataset}_{subset_size}_augmented_data_bert_ktrain_filtered.csv'
                df_generated = pd.read_csv(filtered_dataset_path)
                df_generated['embedding'] = df_generated['text'].apply(get_distilbert_embeddings)
                df_generated['label'] = f"{algo} {model} {dataset} {subset_size}"

                # ignore_index avoids duplicated row labels in the combined frame.
                combined_df = pd.concat([df, df_generated], ignore_index=True)
                embeddings = np.stack(combined_df['embedding'].tolist())

                # PCA first (at most 50 components), then t-SNE down to 2-D.
                pca = PCA(n_components=min(len(embeddings), 50)).fit(embeddings)
                pca_result = pca.transform(embeddings)

                tsne = TSNE(n_components=2, random_state=0)
                embeddings_2d = tsne.fit_transform(pca_result)

                combined_df['tsne_1'] = embeddings_2d[:, 0]
                combined_df['tsne_2'] = embeddings_2d[:, 1]

                fig = px.scatter(
                    combined_df,
                    x='tsne_1', y='tsne_2',
                    color='label',
                    title=f't-SNE of DistilBERT Embeddings for {dataset} {subset_size} and {model} {algo}',
                    labels={
                        'tsne_1': 't-SNE Component 1',
                        'tsne_2': 't-SNE Component 2'
                    },
                    hover_data={'label': True, 'tsne_1': False, 'tsne_2': False},
                    template='plotly_white',
                    width=900, height=600
                )

                fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))

                output_filename = f'NLPProject2024/embeddings_plots/{dataset}_{subset_size}_{model}_{algo}.png'
                # write_image does not create missing directories.
                Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
                fig.write_image(output_filename)

                fig.show()


In [None]:
# Full dataset vs. generated data.
# For every (subset size, dataset, generator model, augmentation algorithm)
# combination: embed the full dataset and the filtered generated examples with
# DistilBERT, reduce to 2-D (PCA followed by t-SNE), and save/show one scatter
# plot coloured by data source.

# File-name prefix used in the filtered CSV names for each generator model.
model_prefixes = {
    'Llama3': 'Llama3_8B_',
    'Mistral': 'Mistral_7B_',
    'GPT2': 'GPT2_',
}
# The path for this combination is built with "Llama_8B_" (no "3") instead of the
# usual Llama3 prefix. NOTE(review): presumably mirrors how the files are named
# in the repository - confirm.
prefix_overrides = {
    ('Lambada+Instruct', 'Llama3'): 'Llama_8B_',
}
# File-name infix used in the filtered CSV names for each augmentation algorithm.
algo_prefixes = {
    'Lambada+': 'lambada_plus_',
    'Lambada': '',
    'Lambada+Instruct': 'lambada_plus_instruct_',
}
# (algo, model) combinations that are not plotted.
skipped_pairs = {
    ('Lambada+Instruct', 'GPT2'),
    ('Lambada+Instruct', 'Mistral'),
    ('Lambada+', 'GPT2'),
}

# The full dataset and its embeddings depend only on the dataset name, so they
# are computed once per dataset and reused for every other loop variable.
full_dataset_frames = {}

for subset_size in ['5', '10']:
    for dataset in ['ATIS', 'TREC']:
        if dataset not in full_dataset_frames:
            # Relative path, consistent with where the clone cell put the repo
            # and with every other path used in this notebook.
            dataset_path = f'NLPProject2024/datasets/{dataset}/{dataset.lower()}.full.csv'
            full_df = pd.read_csv(dataset_path)
            full_df['embedding'] = full_df['text'].apply(get_distilbert_embeddings)
            # 'label' is (re)used as the plot legend. These rows are the full
            # dataset, so they must not be labelled as a sampled subset.
            full_df['label'] = f"{dataset} full dataset"
            full_dataset_frames[dataset] = full_df
        df = full_dataset_frames[dataset]

        for model in ['Llama3', 'Mistral', 'GPT2']:
            for algo in ['Lambada+', 'Lambada', 'Lambada+Instruct']:
                if (algo, model) in skipped_pairs:
                    continue

                # Resolve the prefixes per iteration so that the special case
                # cannot leak into any other (algo, model) combination.
                prefix = prefix_overrides.get((algo, model), model_prefixes[model])
                algo_prefix = algo_prefixes[algo]

                # Load and embed the filtered generated examples.
                filtered_dataset_path = f'NLPProject2024/filtered_datasets/{algo}/{model}/{dataset}/bert_ktrain/{prefix}{algo_prefix}{dataset}_{subset_size}_augmented_data_bert_ktrain_filtered.csv'
                df_generated = pd.read_csv(filtered_dataset_path)
                df_generated['embedding'] = df_generated['text'].apply(get_distilbert_embeddings)
                df_generated['label'] = f"{algo} {model} {dataset} {subset_size}"

                # concat builds a new frame, so the cached full-dataset frame is
                # never modified; ignore_index avoids duplicated row labels.
                combined_df = pd.concat([df, df_generated], ignore_index=True)
                embeddings = np.stack(combined_df['embedding'].tolist())

                # PCA first (at most 50 components), then t-SNE down to 2-D.
                pca = PCA(n_components=min(len(embeddings), 50)).fit(embeddings)
                pca_result = pca.transform(embeddings)

                tsne = TSNE(n_components=2, random_state=0)
                embeddings_2d = tsne.fit_transform(pca_result)

                combined_df['tsne_1'] = embeddings_2d[:, 0]
                combined_df['tsne_2'] = embeddings_2d[:, 1]

                fig = px.scatter(
                    combined_df,
                    x='tsne_1', y='tsne_2',
                    color='label',
                    title=f't-SNE of DistilBERT Embeddings for full {dataset} and {dataset} {subset_size} {model} {algo}',
                    labels={
                        'tsne_1': 't-SNE Component 1',
                        'tsne_2': 't-SNE Component 2'
                    },
                    hover_data={'label': True, 'tsne_1': False, 'tsne_2': False},
                    template='plotly_white',
                    width=900, height=600
                )

                fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey')))

                output_filename = f'NLPProject2024/embeddings_plots/full_{dataset}_{subset_size}_{model}_{algo}.png'
                # write_image does not create missing directories.
                Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
                fig.write_image(output_filename)

                fig.show()
