In [None]:
!pip install fse

In [None]:
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

In [12]:
# loading the data
from google.colab import drive
import json

# Mount Google Drive so the matched-recipes JSON under MyDrive is readable.
drive.mount('/content/drive')
matched_file_path = '/content/drive/MyDrive/bidirection_matched_recipes.json'

# Decode explicitly as UTF-8: the recipes include Chinese text, so the result
# should not depend on the runtime's locale-default encoding.
with open(matched_file_path, 'r', encoding='utf-8') as f:
  data = json.load(f)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# data preprocessing
import spacy

# Load spaCy's small Chinese and English pipelines; each one is later passed
# as the `nlp` argument of corpus_extraction to tokenize that language's steps.
nlp_cn = spacy.load("zh_core_web_sm")
nlp_en = spacy.load("en_core_web_sm")

def corpus_extraction(json_field, nlp, recipes=None):
  """Tokenize every text stored under ``json_field`` across all recipes.

  Args:
    json_field: Key looked up in each recipe (e.g. ``'cn_steps'``). The value
      is iterated and every item is treated as one text to tokenize.
    nlp: spaCy pipeline used for tokenization. Its ``pipe`` method is used so
      the texts are processed in batches rather than one call per text.
    recipes: Iterable of recipe dicts. When omitted, the notebook-level
      ``data`` variable is used, so existing two-argument calls keep working.

  Returns:
    A list with one entry per text, in recipe order. Each entry is the list
    of token strings (``token.text``) produced for that text.

  Raises:
    KeyError: If a recipe does not contain ``json_field``.
  """
  if recipes is None:
    # Backward-compatible fallback to the global loaded by the data cell.
    recipes = data

  texts = [text for recipe in recipes for text in recipe[json_field]]
  return [[token.text for token in doc] for doc in nlp.pipe(texts)]


In [27]:
# Tokenize the Chinese and English step texts with their matching spaCy
# pipelines; each result is a list of token-string lists (one per step text).
cn_tok_sent = corpus_extraction('cn_steps', nlp_cn)
en_tok_sent = corpus_extraction('en_steps', nlp_en)

In [28]:
# fasttext embeddings
from gensim.models import FastText
from fse import Average, IndexedList

def get_fasttext_embed(sentences, vector_size=10, min_count=1):
  """Train FastText word vectors on ``sentences`` and fit an fse Average model.

  Args:
    sentences: List of tokenized sentences (each a list of token strings).
    vector_size: Dimensionality passed to gensim's ``FastText``. Defaults to
      10, the value this notebook previously hard-coded.
    min_count: Minimum token frequency passed to ``FastText``. Defaults to 1,
      as before.

  Returns:
    The trained fse ``Average`` model. Later cells read sentence vectors via
    ``.sv`` and word vectors via ``.wv`` on the returned object.
  """
  ft = FastText(sentences, min_count=min_count, vector_size=vector_size)
  model = Average(ft)
  # fse expects an indexed sentence collection; wrap the token lists once.
  model.train(IndexedList(sentences))
  return model

# Fit the English model, then display the similarity fse reports between the
# sentence vectors at indices 0 and 1 as a quick sanity check.
en_model = get_fasttext_embed(en_tok_sent)
en_model.sv.similarity(0,1)



0.3191586

In [29]:
# Fit the Chinese model, then display the similarity fse reports between the
# sentence vectors at indices 0 and 1 as a quick sanity check.
cn_model = get_fasttext_embed(cn_tok_sent)
cn_model.sv.similarity(0,1)



0.61405194

In [30]:
# NOTE(review): this queries the *English* model with the Chinese token "锅".
# Presumably the token is out of vocabulary there and FastText falls back to
# character n-grams, so the neighbours are unlikely to be meaningful. Confirm
# whether an English word (e.g. "pot") was intended.
en_model.wv.most_similar("锅")

[('Checking', 0.8936383724212646),
 ('looking', 0.8662701845169067),
 ('doming', 0.8602674603462219),
 ('Resulting', 0.851852297782898),
 ('According', 0.8464218378067017),
 ('according', 0.8448274731636047),
 ('tilting', 0.842126727104187),
 ('decorating', 0.8418664336204529),
 ('Looking', 0.8418377041816711),
 ('rehydrating', 0.838645875453949)]

In [31]:
# Nearest neighbours of the token "锅" in the Chinese model's word vectors.
cn_model.wv.most_similar("锅")

[('锅里', 0.9457086324691772),
 ('锅里油', 0.9410570859909058),
 ('锅内盖', 0.9372973442077637),
 ('锅里满', 0.9358884692192078),
 ('锅内', 0.934575080871582),
 ('汤锅里', 0.9344434142112732),
 ('锅里煎', 0.9316495656967163),
 ('砂锅', 0.9304372668266296),
 ('置火', 0.9280111789703369),
 ('锅里家', 0.9277089834213257)]

In [19]:
# saving the embeddings in a txt file
from google.colab import files

def save_and_download(lang, sentences, model):
  """Write token embeddings to ``{lang}_token_embeds.txt`` and download it.

  One line is written per token occurrence, in corpus order, formatted as
  ``token v1 v2 ... vN`` (space-separated, one vector per line). Tokens are
  written verbatim, without escaping.

  Args:
    lang: Prefix used for the output file name.
    sentences: Iterable of token lists.
    model: Object whose ``wv[token]`` returns the token's vector.
  """
  out_path = f'{lang}_token_embeds.txt'
  # UTF-8 is set explicitly because the Chinese tokens must not depend on the
  # runtime's locale-default encoding.
  with open(out_path, "w", encoding="utf-8") as fw:
    for sent in sentences:
      for word in sent:
        # Format each component explicitly: an array repr has brackets and may
        # wrap onto several lines, which breaks a line-per-token file.
        values = " ".join(f"{float(v):.8g}" for v in model.wv[word])
        fw.write(f"{word} {values}\n")
  # Start the download only after the with-block has closed (and flushed) the
  # file; otherwise buffered lines can be missing from the downloaded copy.
  files.download(out_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [21]:
# Export the Chinese token embeddings to cn_token_embeds.txt and download it.
save_and_download('cn', cn_tok_sent, cn_model)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
# Export the English token embeddings to en_token_embeds.txt and download it.
save_and_download('en', en_tok_sent, en_model)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>