In [None]:
# fse supplies the Average / IndexedList helpers imported later in this notebook.
!pip install fse

In [None]:
# Download the small Chinese and English spaCy models that spacy.load() reads below.
!python -m spacy download zh_core_web_sm
!python -m spacy download en_core_web_sm

In [None]:
# loading the data
from google.colab import drive
import json

# Mount Google Drive so the matched-recipes file is reachable from the runtime.
drive.mount('/content/drive')
matched_file_path = '/content/drive/MyDrive/bidirection_matched_recipes.json'

# The recipes include Chinese text (see the 'cn_steps' field used later), so
# decode explicitly as UTF-8 instead of relying on the platform default.
with open(matched_file_path, 'r', encoding='utf-8') as f:
  data = json.load(f)

Mounted at /content/drive


In [None]:
# data preprocessing
import spacy

# Load the small Chinese and English spaCy pipelines used for tokenization.
nlp_cn = spacy.load("zh_core_web_sm")
nlp_en = spacy.load("en_core_web_sm")

def corpus_extraction(json_field, nlp=None, recipes=None):
  """Tokenize every sentence stored under ``json_field`` across all recipes.

  Args:
    json_field: Key of the per-recipe list of sentences, e.g. ``'cn_steps'``
      or ``'en_steps'``.
    nlp: spaCy pipeline used for tokenization. When omitted, ``nlp_cn`` is
      chosen for fields whose name starts with ``'cn'`` and ``nlp_en``
      otherwise, so Chinese text is no longer segmented by the English model.
    recipes: Iterable of recipe dicts. Defaults to the module-level ``data``.

  Returns:
    A list with one entry per sentence, each entry being the list of token
    strings for that sentence, in the order the sentences appear.

  Raises:
    KeyError: If a recipe has no ``json_field`` key.
  """
  if nlp is None:
    # Field names carry the language prefix ('cn_steps' / 'en_steps').
    nlp = nlp_cn if json_field.startswith('cn') else nlp_en
  if recipes is None:
    recipes = data

  # Flatten the per-recipe sentence lists into one corpus.
  corpus = [value for recipe in recipes for value in recipe[json_field]]

  # nlp.pipe processes the texts as a stream, which is faster than calling
  # the pipeline once per sentence.
  return [[token.text for token in doc] for doc in nlp.pipe(corpus)]


In [None]:
# Tokenize the Chinese and English recipe steps; each result is a list of token lists.
cn_tok_sent = corpus_extraction('cn_steps')
en_tok_sent = corpus_extraction('en_steps')

In [None]:
# fasttext embeddings
from gensim.models import FastText
from fse import Average, IndexedList

def get_fasttext_embed(sentences, vector_size=10, min_count=1):
  """Train FastText word vectors and an averaged sentence-embedding model.

  Args:
    sentences: List of tokenized sentences (each a list of token strings).
    vector_size: Dimensionality passed to ``FastText``.
    min_count: Minimum token frequency passed to ``FastText``.

  Returns:
    The trained fse ``Average`` model. The rest of the notebook reads word
    vectors through its ``.wv`` attribute and sentence similarities through
    ``.sv``. Previously nothing was returned, so callers received ``None``.
  """
  ft = FastText(sentences, min_count=min_count, vector_size=vector_size)
  model = Average(ft)
  model.train(IndexedList(sentences))
  return model

en_model = get_fasttext_embed(en_tok_sent)
# Sanity check: similarity between the first two English sentences, taken from
# the model trained above rather than training a second model.
en_model.sv.similarity(0, 1)



0.29277593

In [None]:
# Train the embedding model on the tokenized Chinese sentences.
cn_model = get_fasttext_embed(cn_tok_sent)
# NOTE(review): this second call trains again on the same input and its result
# is not bound to a name — confirm whether it is still needed.
get_fasttext_embed(cn_tok_sent)



0.6123755

In [None]:
# NOTE(review): this queries the model trained on English sentences with a
# Chinese token; presumably an out-of-vocabulary probe — confirm it is intended.
en_model.wv.most_similar("锅")

[('looking', 0.7993849515914917),
 ('Rotating', 0.7883195281028748),
 ('rotating', 0.787180483341217),
 ('tilting', 0.7763185501098633),
 ('bubbling', 0.7753626108169556),
 ('decorating', 0.7650951147079468),
 ('precooking', 0.7610251307487488),
 ('eating', 0.7606989145278931),
 ('Looking', 0.7566357851028442),
 ('Checking', 0.754292905330658)]

In [None]:
# Nearest neighbours of "锅" according to the model trained on the Chinese sentences.
cn_model.wv.most_similar("锅")

[('锅中', 0.9466560482978821),
 ('锅中焯', 0.9466060996055603),
 ('锅中添', 0.9442524313926697),
 ('锅里', 0.9430060386657715),
 ('锅内', 0.9427359700202942),
 ('锅中调', 0.9415774345397949),
 ('原炒锅', 0.9406287670135498),
 ('锅中水', 0.9404528141021729),
 ('锅里油', 0.9385949969291687),
 ('锅内盖', 0.9381129741668701)]

In [None]:
# saving the embeddings in a txt file
from google.colab import files

def save_and_download(lang, sentences, model):
  """Write token embeddings to ``{lang}_token_embeds.txt`` and download it.

  Each line has the form ``token v1 v2 ... vN``: the token, a space, then the
  vector components separated by spaces. The previous output glued the token
  to the array's printed form, which has brackets and may wrap over lines.

  Args:
    lang: Language tag used as the file-name prefix (e.g. ``'en'``).
    sentences: List of tokenized sentences (each a list of token strings).
    model: Trained model whose ``.wv[token]`` lookup yields the token vector.
  """
  out_path = f'{lang}_token_embeds.txt'
  # UTF-8 so non-ASCII (e.g. Chinese) tokens are written on any platform.
  with open(out_path, "w", encoding="utf-8") as fw:
    # NOTE(review): one line is written per token occurrence, so repeated
    # tokens appear repeatedly — confirm whether de-duplication is wanted.
    # NOTE(review): a token containing whitespace would break the
    # space-separated layout — verify against the tokenizer output.
    for sent in sentences:
      for word in sent:
        embed = model.wv[word]
        values = " ".join(f"{value:.6f}" for value in embed)
        fw.write(f"{word} {values}\n")
  # Download only after the ``with`` block has flushed and closed the file;
  # doing it inside the block could ship a partially written file.
  files.download(out_path)


save_and_download('en', en_tok_sent, en_model)
# The 'cn' export must use the Chinese tokens and model; it previously
# re-exported the English data under the 'cn' file name.
save_and_download('cn', cn_tok_sent, cn_model)