In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib widget

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

from core.utils.device import DEVICE
from core.utils.theme import set_theme

set_theme()

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B")
model.to(DEVICE)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B")

In [9]:
import numpy as np
from sklearn.linear_model import LogisticRegression

from core.steering.pca import PCASteering


def projections_to_Xy(projections, langs, num_of_pca_components):
    lang1_data = projections[langs[0]][-1].numpy()
    lang2_data = projections[langs[1]][-1].numpy()
    X = np.vstack([lang1_data, lang2_data])
    y = np.array([1] * lang1_data.shape[0] + [0] * lang2_data.shape[0])
    X = X[:, :num_of_pca_components]

    return X, y


def train_and_score(projections_train, projections_test, num_of_pca_components=1):
    """
    hidden_space_by_language: { [lang]: torch.Tensor([n_layers, n_tokens, d_model]) }
    """
    langs = list(projections_train.keys())
    X_trn, y_trn = projections_to_Xy(projections_train, langs, num_of_pca_components)
    X_tst, y_tst = projections_to_Xy(projections_test, langs, num_of_pca_components)

    classifier = LogisticRegression(random_state=42, penalty="l2")

    classifier.fit(X_trn, y_trn)
    train_accuracy = classifier.score(X_trn, y_trn)
    test_accuracy = classifier.score(X_tst, y_tst)

    print(
        f"# of PCA components: {num_of_pca_components}, train accuracy {train_accuracy:.4f}, test accuracy {test_accuracy:.4f}"
    )

# EN-RU


In [None]:
from core.gather_data.hidden_space import collect_hidden_space_by_language
from core.preprocess_data.flores_plus import load_flores_plus

train_df, test_df = load_flores_plus(["eng_Latn", "rus_Cyrl"], {"eng_Latn": "en", "rus_Cyrl": "ru"}, train_size=50)

test_df = test_df[:100]

hidden_space_by_language_train, token_map_for_language_train = collect_hidden_space_by_language(
    model, tokenizer, train_df, skip_first=True
)
hidden_space_by_language_test, token_map_for_language_test = collect_hidden_space_by_language(
    model, tokenizer, test_df, skip_first=True
)

pca_steering = PCASteering().fit(hidden_space_by_language_train)

projections_train = pca_steering.project(hidden_space_by_language_train)
projections_test = pca_steering.project(hidden_space_by_language_test)

train_and_score(projections_train, projections_test)

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Data len:  50


100%|██████████| 50/50 [00:13<00:00,  3.82it/s]


Data len:  100


100%|██████████| 100/100 [00:25<00:00,  3.96it/s]


# of PCA components: 1, train accuracy 0.9982, test accuracy 0.9914


# EN-CN


In [11]:
from core.gather_data.hidden_space import collect_hidden_space_by_language
from core.preprocess_data.flores_plus import load_flores_plus
from core.steering.pca import PCASteering

train_df, test_df = load_flores_plus(["eng_Latn", "cmn_Hans"], {"eng_Latn": "en", "cmn_Hans": "cn"}, train_size=50)

test_df = test_df[:100]

hidden_space_by_language_train, token_map_for_language_train = collect_hidden_space_by_language(
    model, tokenizer, train_df, skip_first=True
)
hidden_space_by_language_test, token_map_for_language_test = collect_hidden_space_by_language(
    model, tokenizer, test_df, skip_first=True
)

pca_steering = PCASteering().fit(hidden_space_by_language_train)

projections_train = pca_steering.project(hidden_space_by_language_train)
projections_test = pca_steering.project(hidden_space_by_language_test)

train_and_score(projections_train, projections_test)

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Data len:  50


100%|██████████| 50/50 [00:09<00:00,  5.10it/s]


Data len:  100


100%|██████████| 100/100 [00:21<00:00,  4.66it/s]


# of PCA components: 1, train accuracy 0.9808, test accuracy 0.9810


# EN-ES


In [13]:
from core.gather_data.hidden_space import collect_hidden_space_by_language
from core.preprocess_data.flores_plus import load_flores_plus
from core.steering.pca import PCASteering

train_df, test_df = load_flores_plus(["eng_Latn", "spa_Latn"], {"eng_Latn": "en", "spa_Latn": "es"}, train_size=50)

test_df = test_df[:100]

hidden_space_by_language_train, token_map_for_language_train = collect_hidden_space_by_language(
    model, tokenizer, train_df, skip_first=True
)
hidden_space_by_language_test, token_map_for_language_test = collect_hidden_space_by_language(
    model, tokenizer, test_df, skip_first=True
)

pca_steering = PCASteering().fit(hidden_space_by_language_train)

projections_train = pca_steering.project(hidden_space_by_language_train)
projections_test = pca_steering.project(hidden_space_by_language_test)

train_and_score(projections_train, projections_test)

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Data len:  50


100%|██████████| 50/50 [00:13<00:00,  3.76it/s]


Data len:  100


100%|██████████| 100/100 [00:24<00:00,  4.03it/s]


# of PCA components: 1, train accuracy 0.9541, test accuracy 0.9593


# EN-HIN

In [14]:
from core.gather_data.hidden_space import collect_hidden_space_by_language
from core.preprocess_data.flores_plus import load_flores_plus
from core.steering.pca import PCASteering

train_df, test_df = load_flores_plus(["eng_Latn", "cmn_Hans"], {"eng_Latn": "en", "cmn_Hans": "hin"}, train_size=50)

test_df = test_df[:100]

hidden_space_by_language_train, token_map_for_language_train = collect_hidden_space_by_language(
    model, tokenizer, train_df, skip_first=True
)
hidden_space_by_language_test, token_map_for_language_test = collect_hidden_space_by_language(
    model, tokenizer, test_df, skip_first=True
)

pca_steering = PCASteering().fit(hidden_space_by_language_train)

projections_train = pca_steering.project(hidden_space_by_language_train)
projections_test = pca_steering.project(hidden_space_by_language_test)

train_and_score(projections_train, projections_test)

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/220 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/214 [00:00<?, ?it/s]

Data len:  50


100%|██████████| 50/50 [00:11<00:00,  4.45it/s]


Data len:  100


100%|██████████| 100/100 [00:21<00:00,  4.68it/s]


# of PCA components: 1, train accuracy 0.9853, test accuracy 0.9799
