In [1]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz
!gunzip cc.de.300.bin.gz

--2025-02-06 11:44:01--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.167.99.69, 3.167.99.24, 3.167.99.38, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.167.99.69|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496980947 (4.2G) [application/octet-stream]
Saving to: ‘cc.de.300.bin.gz’


2025-02-06 11:46:00 (36.2 MB/s) - ‘cc.de.300.bin.gz’ saved [4496980947/4496980947]



In [2]:
!pip install fasttext --quiet

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 73.4/73.4 kB 4.6 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... done
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313465 sha256=2a051509d4d34d9902554794b472db16b6481edb0b94e0ecaa5495a32bbac1ae
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [3]:
import numpy as np
import pandas as pd
import spacy
import pickle
from tqdm import tqdm
from google.colab import drive
import fasttext
import fasttext.util
from google.colab import files

In [4]:
# Open the Colab upload dialog; the three PHOENIX-2014-T corpus CSVs read in the
# next cell are expected to be selected here (they are saved to the working directory).
uploaded = files.upload()

Saving PHOENIX-2014-T.dev.corpus.csv to PHOENIX-2014-T.dev.corpus.csv
Saving PHOENIX-2014-T.test.corpus.csv to PHOENIX-2014-T.test.corpus.csv
Saving PHOENIX-2014-T.train.corpus.csv to PHOENIX-2014-T.train.corpus.csv


In [5]:
# Load the three PHOENIX-2014-T splits; the corpus files are '|'-delimited.
# Order of the tuple below: TRAIN, VALIDATION (dev), TEST.
train_corpus, val_corpus, test_corpus = (
    pd.read_csv(csv_name, sep='|')
    for csv_name in (
        'PHOENIX-2014-T.train.corpus.csv',
        'PHOENIX-2014-T.dev.corpus.csv',
        'PHOENIX-2014-T.test.corpus.csv',
    )
)

In [6]:
# Order every split by its 'name' column so rows have a deterministic order.
sorted_train_corpus, sorted_val_corpus, sorted_test_corpus = (
    corpus.sort_values(by='name')
    for corpus in (train_corpus, val_corpus, test_corpus)
)

In [7]:
# After sorting, replace the shuffled original index with a fresh 0..n-1 index
# (the old index is dropped rather than kept as a column).
sorted_train_corpus, sorted_val_corpus, sorted_test_corpus = (
    frame.reset_index(drop=True)
    for frame in (sorted_train_corpus, sorted_val_corpus, sorted_test_corpus)
)

In [8]:
# Trim the validation and test splits to the largest multiple of the batch size
# that fits, so that every batch is full. The row counts are computed from the
# data instead of being hard-coded; for the original 519 / 642 rows this still
# yields 512 / 640. Re-running the cell is a no-op once the frames are trimmed.
BATCH_SIZE = 8
sorted_val_corpus = sorted_val_corpus.iloc[:len(sorted_val_corpus) - len(sorted_val_corpus) % BATCH_SIZE]
sorted_test_corpus = sorted_test_corpus.iloc[:len(sorted_test_corpus) - len(sorted_test_corpus) % BATCH_SIZE]

In [9]:
# Pull out the 'translation' column of each split; these sentences are the
# input to the pseudo-gloss extraction below.
train_translations, val_translations, test_translations = (
    split['translation']
    for split in (sorted_train_corpus, sorted_val_corpus, sorted_test_corpus)
)

In [10]:
# Download the small German spaCy pipeline that is loaded below with spacy.load.
!python -m spacy download de_core_news_sm

Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('de_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
# Load the German spaCy pipeline. generate_pseudo_glosses reads this global, so
# this cell has to be executed before that function is called.
nlp = spacy.load("de_core_news_sm")

In [11]:
def generate_pseudo_glosses(translation):
    """Turn one sentence into its list of pseudo-glosses.

    The sentence is run through the module-level spaCy pipeline ``nlp``. Only
    tokens tagged as noun, numeral, adverb, pronoun, proper noun, adjective or
    verb are kept (prepositions, conjunctions, articles, etc. are dropped), and
    each kept token is represented by its lemma.

    Args:
        translation: The sentence text to analyse.

    Returns:
        A list with the lemma of every kept token, in sentence order.
        Duplicates are preserved and no case normalisation is applied.
    """
    kept_pos_tags = ["NOUN", "NUM", "ADV", "PRON", "PROPN", "ADJ", "VERB"]
    return [tok.lemma_ for tok in nlp(translation) if tok.pos_ in kept_pos_tags]

In [12]:
def generate_pseudo_gloss_list(translations):
    """Build the pseudo-gloss vocabulary and the per-sentence gloss lists.

    Every sentence is converted with ``generate_pseudo_glosses`` and the
    resulting lemmas are lower-cased.

    Args:
        translations: Iterable of sentence strings.

    Returns:
        A tuple ``(pseudo_gloss_list, pseudo_gloss_lists_per_translation)``:

        * ``pseudo_gloss_list`` - the distinct lower-cased glosses in order of
          first appearance across all sentences.
        * ``pseudo_gloss_lists_per_translation`` - one list per input sentence
          with its lower-cased glosses (duplicates kept), in input order.
    """
    pseudo_gloss_list = []
    # Companion set for O(1) membership checks; the list alone keeps the
    # first-appearance order that downstream label indices depend on.
    seen_glosses = set()
    pseudo_gloss_lists_per_translation = []
    for translation in translations:
        pseudo_glosses = [gloss.lower() for gloss in generate_pseudo_glosses(translation)]
        pseudo_gloss_lists_per_translation.append(pseudo_glosses)
        for pseudo_gloss in pseudo_glosses:
            if pseudo_gloss not in seen_glosses:
                seen_glosses.add(pseudo_gloss)
                pseudo_gloss_list.append(pseudo_gloss)
    return pseudo_gloss_list, pseudo_gloss_lists_per_translation

In [13]:
# Load the unpacked German fastText model downloaded at the top of the notebook.
# generate_prototypes reads this global.
ft = fasttext.load_model('cc.de.300.bin') #dimensions: 300

In [14]:
def generate_prototypes(pseudo_gloss_list):
    """Look up one fastText vector per pseudo-gloss.

    The result is index-aligned with ``pseudo_gloss_list``: element ``i`` is
    the vector for ``pseudo_gloss_list[i]``. ``generate_labels`` uses the same
    positions as label indices, so the two must stay aligned.

    Args:
        pseudo_gloss_list: Sequence of gloss strings.

    Returns:
        A list with exactly ``len(pseudo_gloss_list)`` vectors obtained from
        the module-level fastText model ``ft``.
    """
    prototypes = []
    for pseudo_gloss in pseudo_gloss_list:
        try:
            pseudo_gloss_vector = ft.get_word_vector(pseudo_gloss)
        except KeyError:
            # NOTE(review): fastText normally builds vectors for unseen words
            # from subword n-grams, so this branch may never fire - confirm.
            # If it does, insert a zero placeholder instead of skipping the
            # word: dropping it would shift every later prototype by one and
            # break the alignment with pseudo_gloss_list / the label indices.
            print(f"Word '{pseudo_gloss}' not in FastText vocabulary.")
            pseudo_gloss_vector = np.zeros(ft.get_dimension())
        prototypes.append(pseudo_gloss_vector)
    return prototypes

In [17]:
# The vocabulary (pseudo_gloss_list) is taken from the training split only.
# For the validation and test splits the vocabulary returned by the call is
# discarded and only the per-sentence gloss lists are kept.
pseudo_gloss_list, pseudo_gloss_lists_per_train_translation = generate_pseudo_gloss_list(train_translations)
_, pseudo_gloss_lists_per_val_translation = generate_pseudo_gloss_list(val_translations)
_, pseudo_gloss_lists_per_test_translation = generate_pseudo_gloss_list(test_translations)

In [18]:
# One fastText vector per entry of the training vocabulary.
prototypes = generate_prototypes(pseudo_gloss_list)

In [19]:
# Extra all-zero prototype for sign transitions or non-sign-related components.
# It matches the additional final slot that generate_labels adds to each label.
# NOTE(review): this mutates `prototypes` in place, so executing the cell more
# than once appends more than one zero vector - re-run the previous cell first.
prototypes.append(np.zeros(300))

In [21]:
def generate_labels(pseudo_gloss_list, pseudo_gloss_lists_per_translation):
    """Build one multi-hot label vector per sentence.

    Args:
        pseudo_gloss_list: The vocabulary; position ``i`` in this list is the
            label index of that gloss.
        pseudo_gloss_lists_per_translation: One list of glosses per sentence.

    Returns:
        A list of 1-D NumPy arrays of length ``len(pseudo_gloss_list) + 1``.
        Entry ``i`` is 1 if the sentence contains ``pseudo_gloss_list[i]`` and
        0 otherwise. Glosses that are not in the vocabulary are ignored. The
        final extra slot is reserved for non-matching glosses and is left at 0
        by this function.
    """
    # Map each gloss to its first position once, instead of scanning the
    # vocabulary list (``in`` + ``.index``) for every gloss of every sentence.
    # setdefault keeps the first occurrence, matching list.index semantics.
    gloss_to_index = {}
    for position, gloss in enumerate(pseudo_gloss_list):
        gloss_to_index.setdefault(gloss, position)

    label_size = len(pseudo_gloss_list) + 1  # add one label for non-matching glosses
    labels = []
    for translation_pseudo_glosses in tqdm(pseudo_gloss_lists_per_translation):
        label = np.zeros(label_size)
        for pseudo_gloss in translation_pseudo_glosses:
            position = gloss_to_index.get(pseudo_gloss)
            if position is not None:
                label[position] = 1
        labels.append(label)
    return labels

In [23]:
# Multi-hot labels for the training split, indexed by the training vocabulary.
train_labels = generate_labels(pseudo_gloss_list, pseudo_gloss_lists_per_train_translation)

100%|██████████| 7096/7096 [00:00<00:00, 9278.26it/s]


In [24]:
# Validation labels use the training vocabulary; glosses outside it are ignored.
val_labels = generate_labels(pseudo_gloss_list, pseudo_gloss_lists_per_val_translation)

100%|██████████| 512/512 [00:00<00:00, 9087.80it/s]


In [25]:
# Test labels use the training vocabulary; glosses outside it are ignored.
test_labels = generate_labels(pseudo_gloss_list, pseudo_gloss_lists_per_test_translation)

100%|██████████| 640/640 [00:00<00:00, 9744.21it/s]


In [26]:
# Mount Google Drive so the pickles below can be written under /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# Persist the training vocabulary; its order defines the label/prototype indices.
# open() does not create directories, so the dataset folder must already exist.
with open('/content/drive/MyDrive/dataset/pseudo_gloss_list.pkl', 'wb') as file:
    pickle.dump(pseudo_gloss_list, file)

In [28]:
# Persist the training labels (a list of NumPy arrays) to Drive.
with open('/content/drive/MyDrive/dataset/train_labels.pkl', 'wb') as file:
    pickle.dump(train_labels, file)

In [29]:
# Persist the validation labels (a list of NumPy arrays) to Drive.
with open('/content/drive/MyDrive/dataset/val_labels.pkl', 'wb') as file:
    pickle.dump(val_labels, file)

In [30]:
# Persist the test labels (a list of NumPy arrays) to Drive.
with open('/content/drive/MyDrive/dataset/test_labels.pkl', 'wb') as file:
    pickle.dump(test_labels, file)

In [31]:
# Persist the prototype vectors, including the trailing all-zero prototype
# appended earlier in the notebook.
with open('/content/drive/MyDrive/dataset/prototypes.pkl', 'wb') as file:
    pickle.dump(prototypes, file)

In [32]:
# Flush pending writes to Google Drive and unmount it now that all pickles are saved.
drive.flush_and_unmount()