In [None]:
!pip install tensorflow \
tensorflow-hub \
tensorflow_text \
spacy sudachipy sudachidict_core

In [None]:
from spacy.lang.en import English
import tensorflow_hub as hub
import numpy as np
import tensorflow_text

In [None]:
# Read both corpora as lists of lines (trailing newlines are kept by readlines()).
# Each line is later passed to sentencize() and the two lists are zipped
# together, so line k of en.txt is paired with line k of zh.txt.
# The context managers close the file handles; the encoding is explicit so the
# result does not depend on the platform default.
# NOTE(review): assumes both files are UTF-8 encoded — confirm.
with open('en.txt', 'r', encoding='utf-8') as en_file:
    en_texts = en_file.readlines()

with open('zh.txt', 'r', encoding='utf-8') as zh_file:
    zh_texts = zh_file.readlines()

In [None]:
# Lazily-built spaCy pipeline shared by every sentencize() call, so the
# pipeline is constructed once instead of once per input text.
_SENTENCIZER_NLP = None


def sentencize(text):
    """Split ``text`` into a list of sentence strings.

    A space is inserted after every '。', '？' and '」' before the text is run
    through a blank spaCy ``English`` pipeline that contains only the
    rule-based ``sentencizer`` component. Newlines inside a sentence are
    replaced by spaces and each sentence is stripped.

    Args:
        text: The raw text to split (a single string).

    Returns:
        A list of str, one entry per sentence reported by the sentencizer.
    """
    global _SENTENCIZER_NLP
    if _SENTENCIZER_NLP is None:
        nlp = English()
        nlp.add_pipe("sentencizer")
        _SENTENCIZER_NLP = nlp

    # NOTE(review): the last replace targets '」' preceded by what looks like a
    # full-width space, whereas the earlier replaces insert ASCII spaces, so it
    # does not undo them — confirm the intent.
    text = (
        text.replace('。', '。 ')
        .replace('？', '？ ')
        .replace('」', '」 ')
        .replace('　」', '」')
    )

    doc = _SENTENCIZER_NLP(text)
    return [sent.text.replace('\n', ' ').strip() for sent in doc.sents]

# Sentence-split every input line; the result is one list of sentences per
# line of the corresponding source file.
en_sents = list(map(sentencize, en_texts))
zh_sents = list(map(sentencize, zh_texts))

In [None]:
_ALIGN_ENCODER_URL = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
_align_encoder_cache = {}

# Back-pointer codes stored in the dynamic-programming table.
_PAIR = 1      # en_sents[i-1] is paired with zh_sents[j-1]
_EN_ONLY = 2   # en_sents[i-1] is left unpaired
_ZH_ONLY = 3   # zh_sents[j-1] is left unpaired


def _load_align_encoder():
    """Load the sentence encoder once and reuse it for every align() call."""
    if "embed" not in _align_encoder_cache:
        _align_encoder_cache["embed"] = hub.load(_ALIGN_ENCODER_URL)
    return _align_encoder_cache["embed"]


def align(en_sents, zh_sents, embed=None):
    """Align two sentence sequences monotonically with dynamic programming.

    Pairing ``en_sents[i]`` with ``zh_sents[j]`` costs ``1 - sims[i, j]``,
    where ``sims`` is the inner product of the two sentences' embeddings.
    Leaving a sentence unpaired costs 1. The cheapest path through the cost
    table is traced back to produce the alignment.

    Args:
        en_sents: Sequence of sentences for the first side.
        zh_sents: Sequence of sentences for the second side.
        embed: Optional callable mapping a list of sentences to a 2-D array of
            embeddings (one row per sentence). When omitted, the multilingual
            encoder at ``_ALIGN_ENCODER_URL`` is loaded once and cached.

    Returns:
        A list of ``(en_sentence, zh_sentence)`` tuples in original order. An
        unpaired sentence appears with ``''`` on the other side.
    """
    n_en, n_zh = len(en_sents), len(zh_sents)

    # Similarities are only needed (and the encoder only called) when both
    # sides are non-empty; otherwise every sentence is simply left unpaired.
    sims = None
    if n_en and n_zh:
        if embed is None:
            embed = _load_align_encoder()
        sims = np.inner(embed(en_sents), embed(zh_sents))

    costs = np.zeros((n_en + 1, n_zh + 1))
    pointers = np.zeros((n_en + 1, n_zh + 1), dtype=int)

    # Boundary row/column: only "leave unpaired" moves are possible there.
    # The back-pointers must be recorded too, otherwise the traceback below
    # never advances once it reaches row 0 or column 0 away from the origin.
    for i in range(1, n_en + 1):
        costs[i, 0] = costs[i - 1, 0] + 1.
        pointers[i, 0] = _EN_ONLY

    for j in range(1, n_zh + 1):
        costs[0, j] = costs[0, j - 1] + 1.
        pointers[0, j] = _ZH_ONLY

    for i in range(1, n_en + 1):
        for j in range(1, n_zh + 1):
            choices = [
                (costs[i - 1, j - 1] + (1. - sims[i - 1, j - 1]), _PAIR),
                (costs[i - 1, j] + 1., _EN_ONLY),
                (costs[i, j - 1] + 1., _ZH_ONLY),
            ]
            # min() returns the first of equally cheap choices, so ties are
            # resolved in the listed order (pair, en-only, zh-only).
            costs[i, j], pointers[i, j] = min(choices, key=lambda c: c[0])

    # Walk the back-pointers from the bottom-right corner to the origin.
    aligned = []
    i, j = n_en, n_zh
    while i > 0 or j > 0:
        move = pointers[i, j]
        if move == _PAIR:
            i -= 1
            j -= 1
            aligned.append((en_sents[i], zh_sents[j]))
        elif move == _EN_ONLY:
            i -= 1
            aligned.append((en_sents[i], ''))
        else:
            j -= 1
            aligned.append(('', zh_sents[j]))

    aligned.reverse()

    return aligned

In [None]:
# Flat list of aligned sentence pairs across all chapters. Each entry is a
# dict with the first-side sentence under 'en' and the second-side sentence
# under 'jp'; an unpaired sentence has '' on the other side.
alignment = []

for chapter_en, chapter_jp in zip(en_sents, zh_sents):
    alignment.extend(
        {'en': en_sent, 'jp': zh_sent}
        for en_sent, zh_sent in align(chapter_en, chapter_jp)
    )

In [None]:
# Preview only the first few pairs instead of dumping the whole list.
alignment[:10]

In [None]:
import tensorflow_hub as hub

# Build the Japanese sentence list directly from `alignment` (pairs with an
# empty side are dropped, whitespace is normalised) so this cell runs on a
# fresh kernel. Previously it relied on `sentences_japanese`, which is only
# defined by a later cell. That later cell applies the same filter, so the
# rows of `embeddings` line up with its sentence lists.
sentences_japanese = [
    " ".join(pair['jp'].split())
    for pair in alignment
    if pair['jp'].strip() and pair['en'].strip()
]

# Use the multilingual encoder (the same model align() uses) because the text
# being embedded is Japanese; universal-sentence-encoder/4 is the
# English-language model. The multilingual model needs the `tensorflow_text`
# import at the top of the notebook.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")
embeddings = embed(sentences_japanese)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
import tensorflow as tf

# Number of nearest sentences to report.
TOP_K = 3

# Keep only pairs where both sides are non-empty.
sentences = [
    {'en': sentence['en'], 'jp': sentence['jp']}
    for sentence in alignment
    if sentence['jp'].strip() and sentence['en'].strip()
]

# Query sentence typed by the user.
input_sentence = input()

# Not used further in this cell; kept so the name stays available.
input_japanese = " ".join(input_sentence.split('。'))

# Whitespace-normalised views of both sides, index-aligned with `sentences`.
sentences_english = [" ".join(pair["en"].split()) for pair in sentences]
sentences_japanese = [" ".join(pair["jp"].split()) for pair in sentences]

# Not used further in this cell; kept so the name stays available.
tfidf_vectorizer = TfidfVectorizer()

# `embeddings` comes from an earlier cell; indexing below is only meaningful
# if it has exactly one row per entry of `sentences_japanese`.
assert embeddings.shape[0] == len(sentences_japanese), (
    "`embeddings` does not match `sentences_japanese`; re-run the cell that computes it"
)

input_embedding = embed([input_sentence])

# Cosine similarity between the query and every corpus sentence: both sides
# are L2-normalised, then the row-wise dot product is taken (the single query
# row broadcasts against all corpus rows).
cosine_similarities = tf.reduce_sum(
    tf.multiply(
        tf.nn.l2_normalize(embeddings, axis=1),
        tf.nn.l2_normalize(input_embedding, axis=1),
    ),
    axis=1,
).numpy()

# Indices of the best matches, most similar first. Clamping to the corpus
# size avoids the error argpartition raised when fewer than TOP_K sentences
# were available.
top_k = min(TOP_K, len(cosine_similarities))
top_indices = np.argsort(cosine_similarities)[::-1][:top_k]
most_similar_sentences = [sentences_japanese[i] for i in top_indices]

for i in top_indices:
  print(f' {cosine_similarities[i]}. {sentences_english[i]}\n{sentences_japanese[i]}')

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import fugashi
from sudachipy import Dictionary

import tensorflow_hub as hub

# Sudachi tokenizer used to split Japanese text into tokens.
tokenizer = Dictionary().create()


def _surface_tokens(text):
    """Return the surface string of every Sudachi token in ``text``."""
    return [m.surface() for m in tokenizer.tokenize(text)]


input_sentence = "と、語り始める。"
input_tokens = _surface_tokens(input_sentence)

# Drop pairs where either side is empty.
print('before pruning', len(alignment))
pruned = [
    {'en': sentence['en'], 'jp': sentence['jp']}
    for sentence in alignment
    if sentence['jp'].strip() and sentence['en'].strip()
]
print('after pruning', len(pruned))

# Token lists for every retained Japanese sentence, index-aligned with `pruned`.
pruned_tokens = [_surface_tokens(sentence['jp']) for sentence in pruned]

# Word2Vec expects an iterable of token lists. Passing the flat token list of
# a single sentence makes it treat every token string as a "sentence" of
# characters. Train on the tokenised corpus plus the query so that tokens
# from both have vectors.
model = Word2Vec(pruned_tokens + [input_tokens], min_count=1, vector_size=100, window=5, sg=0)


def _sentence_vector(tokens):
    """Sum the word vectors of ``tokens``; zero vector if none are in the vocabulary."""
    known = [model.wv[token] for token in tokens if token in model.wv]
    if not known:
        # sum() over nothing would give the int 0, which cosine_similarity rejects.
        return np.zeros(model.wv.vector_size)
    return np.sum(known, axis=0)


input_vector = _sentence_vector(input_tokens)

# Stand-alone demo of the English sentence encoder. It uses its own names so
# it does not overwrite the `embed` / `embeddings` variables that the
# similarity-search cell above depends on.
demo_embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
demo_embeddings = demo_embed([
    "the person wear red T-shirt",
    "this person is walking",
    "the boy wear red T-shirt"
    ])

print(demo_embeddings)

# Cosine similarity between the query vector and every corpus sentence vector.
similarities = [
    cosine_similarity([input_vector], [_sentence_vector(tokens)])[0][0]
    for tokens in pruned_tokens
]

most_similar_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:5]

print("The 5 most similar English sentences:")
for idx in most_similar_indices:
    # Indices refer to `pruned` (the list the similarities were computed
    # over), not to the unfiltered `alignment`.
    english_sentence = pruned[idx]['en']
    print(english_sentence)



before pruning 1006
after pruning 994
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'int'>
<class 'in

ValueError: ignored

In [None]:
# NOTE(review): nothing is appended to `data` in this cell, so the
# "rows after clean" count printed at the end is always 0 — confirm whether
# the chat-message records (user/assistant dicts) were meant to be collected.
data = []
print("rows:", len(alignment))

# Few-shot prompt assembled from aligned pairs, one line per pair. Collection
# stops as soon as the accumulated text is longer than 2000 characters.
prompt_lines = []
prompt_length = 0

for sent in alignment:
  en_sent, zh_sent = sent['en'], sent['jp']

  # Skip pairs with a very short side and pairs whose two sides are identical.
  too_short = len(en_sent) < 5 or len(zh_sent) < 5
  if too_short or en_sent == zh_sent:
    continue

  line = f'"{en_sent}" with improved prose is "{zh_sent}"\n'
  prompt_lines.append(line)
  prompt_length += len(line)

  if prompt_length > 2000:
    break

prompt = ''.join(prompt_lines)

print(prompt)

print("rows after clean:", len(data))

In [None]:
import json
# Write `data` to data.json; the context manager closes the file even if
# serialisation raises.
with open('data.json', 'w', encoding='utf-8') as fout:
    json.dump(data, fout, indent=2)

In [None]:
import nltk
# Fetch the NLTK resources. The former nltk.download('corpus') call was
# removed: 'corpus' is not a package id, and the recorded output shows it
# failing with "Package 'corpus' not found in index".
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Error loading corpus: Package 'corpus' not found in index


False