In [None]:
!pip install fse

In [None]:
# Download the small Chinese and English spaCy pipelines that are loaded
# later via spacy.load("zh_core_web_sm") / spacy.load("en_core_web_sm").
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

In [4]:
# loading the data
from google.colab import drive
import json

# Mount Google Drive so files under MyDrive are reachable from the Colab VM.
drive.mount('/content/drive')
matched_file_path = '/content/drive/MyDrive/bidirection_matched_recipes.json'

# Read the file as UTF-8 explicitly: the recipes contain Chinese text, and
# without `encoding` Python falls back to the platform locale, which is not
# guaranteed to be UTF-8 outside of Colab.
with open(matched_file_path, 'r', encoding='utf-8') as f:
  data = json.load(f)

Mounted at /content/drive


In [6]:
# data preprocessing
import spacy

nlp_cn = spacy.load("zh_core_web_sm")
nlp_en = spacy.load("en_core_web_sm")

# Flatten the per-recipe steps into one list of sentences per language.
# Every entry of `data` must provide 'cn_steps' and 'en_steps'
# (presumably lists of step strings; verify against the JSON schema).
cn_sentences, en_sentences = [], []
for recipe in data:
  cn_sentences.extend(recipe['cn_steps'])
  en_sentences.extend(recipe['en_steps'])

# Tokenize with nlp.pipe, which streams the texts through the pipeline in
# batches instead of invoking nlp() once per sentence. The result is one list
# of token strings per sentence, in the same order as the sentence lists.
en_tokens = [[token.text for token in doc] for doc in nlp_en.pipe(en_sentences)]
cn_tokens = [[token.text for token in doc] for doc in nlp_cn.pipe(cn_sentences)]

In [8]:
# fasttext embeddings
from gensim.models import FastText
from fse import Average, IndexedList

# Train 10-dimensional FastText word vectors on the tokenized English steps;
# min_count=1 keeps tokens that occur only once.
en_ft = FastText(sentences=en_tokens, vector_size=10, min_count=1)

# Build fse sentence vectors on top of the word vectors.
en_model = Average(en_ft)
en_model.train(IndexedList(en_tokens))

# Quick sanity check: similarity between the first two English sentences.
en_model.sv.similarity(0, 1)



0.29277593

In [9]:
# Same recipe as for English: 10-dimensional FastText word vectors trained on
# the tokenized Chinese steps, keeping tokens that occur only once.
cn_ft = FastText(sentences=cn_tokens, vector_size=10, min_count=1)

# Build fse sentence vectors on top of the word vectors.
cn_model = Average(cn_ft)
cn_model.train(IndexedList(cn_tokens))

# Quick sanity check: similarity between the first two Chinese sentences.
cn_model.sv.similarity(0, 1)



0.6123755

In [33]:
# Nearest neighbours of the Chinese token "锅" in the *English* model.
# NOTE(review): presumably a deliberate check that the two monolingual
# embedding spaces are unrelated; the query is a Chinese token, so the
# neighbours are not expected to be meaningful. Confirm intent.
en_ft.wv.most_similar("锅")

[('looking', 0.7993849515914917),
 ('Rotating', 0.7883195281028748),
 ('rotating', 0.787180483341217),
 ('tilting', 0.7763185501098633),
 ('bubbling', 0.7753626108169556),
 ('decorating', 0.7650951147079468),
 ('precooking', 0.7610251307487488),
 ('eating', 0.7606989145278931),
 ('Looking', 0.7566357851028442),
 ('Checking', 0.754292905330658)]

In [32]:
# Nearest neighbours of "锅" in the Chinese model, for comparison with the
# same query against the English model.
cn_ft.wv.most_similar("锅")

[('锅中', 0.9466560482978821),
 ('锅中焯', 0.9466060996055603),
 ('锅中添', 0.9442524313926697),
 ('锅里', 0.9430060386657715),
 ('锅内', 0.9427359700202942),
 ('锅中调', 0.9415774345397949),
 ('原炒锅', 0.9406287670135498),
 ('锅中水', 0.9404528141021729),
 ('锅里油', 0.9385949969291687),
 ('锅内盖', 0.9381129741668701)]

In [66]:
# saving the embeddings in a txt file

def _write_token_embeddings(path, token_lists, keyed_vectors):
  """Write one `token<TAB>v1 v2 ... vN` line per distinct token.

  Args:
    path: Output text file; it is overwritten and encoded as UTF-8.
    token_lists: Iterable of tokenized sentences (each a list of token strings).
    keyed_vectors: Mapping-like object returning the vector for a token,
      e.g. the `.wv` attribute of a trained FastText model.

  Each token is written once, in order of first appearance. The vector is
  emitted as space-separated numbers so that every record stays on a single
  line; formatting the array object directly adds brackets and may wrap long
  vectors over several lines.
  """
  seen = set()
  with open(path, "w", encoding="utf-8") as fw:
    for sent in token_lists:
      for word in sent:
        # Skip repeats, and whitespace-only tokens, which would corrupt the
        # line-oriented `token<TAB>vector` layout.
        if word in seen or not word.strip():
          continue
        seen.add(word)
        vector = " ".join(str(value) for value in keyed_vectors[word])
        fw.write("{}\t{}\n".format(word, vector))

_write_token_embeddings("en_token_embeds.txt", en_tokens, en_ft.wv)
_write_token_embeddings("cn_token_embeds.txt", cn_tokens, cn_ft.wv)

In [None]:
# Trigger a browser download of the English token embeddings from the Colab VM.
from google.colab import files
files.download('en_token_embeds.txt') 

In [None]:
# Trigger a browser download of the Chinese token embeddings from the Colab VM.
from google.colab import files
files.download('cn_token_embeds.txt') 