In [1]:
from google.colab import drive

drive.mount('/content/drive/')

path = './drive/MyDrive/Data/'

data_path = path + 'tur.txt'
glove_path = path + 'glove.6B.100d.txt'
tr_vec_path = path + 'cc.tr.300.vec/cc.tr.300.vec'

Mounted at /content/drive/


In [2]:
!pip install translate

Collecting translate
  Downloading translate-3.6.1-py2.py3-none-any.whl (12 kB)
Collecting libretranslatepy==2.1.1 (from translate)
  Downloading libretranslatepy-2.1.1-py3-none-any.whl (3.2 kB)
Installing collected packages: libretranslatepy, translate
Successfully installed libretranslatepy-2.1.1 translate-3.6.1


In [3]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=ad6558078113acc649ba4daf736fd2f993ceaaa5e4c93a450c6b89904686e8c7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [4]:
import numpy as np
import pandas as pd
import re
import io

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding

from tensorflow import nn
from tensorflow import reduce_mean
from tensorflow import int32

from tensorflow.keras import optimizers

from translate import Translator
from rouge_score import rouge_scorer

In [5]:
contractions = {
    r"'m": " am",
    r"isn't": "isnot",
    r"Isn't": "Isnot",
    r"'re": " are",
    r"aren't": "arenot",
    r"Aren't": "Arenot",
    r"'ve": " have",
    r"haven't": "havenot",
    r"hasn't": "hasnot",
    r"Haven't": "Havenot",
    r"Hasn't": "Hasnot",
    r"don't": "donot",
    r"doesn't": "doesnot",
    r"didn't": "didnot",
    r"Don't": "Donot",
    r"Doesn't": "Doesnot",
    r"Didn't": "Didnot",
    r"can't": "cannot",
    r"Can't": "Cannot",
    r"won't": "willnot",
    r"Won't": "Willnot",
    r"'ll": " will",
    r"shouldn't": "shouldnot",
    r"Shouldn't": "Should not",
    r"wouldn't": "wouldnot",
    r"Wouldn't": "Wouldnot",
}

def remove_contractions(text):
  for pattern, replacement in contractions.items():
    text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
  return text

In [18]:
starter_mark = 'startstart '
end_mark = ' endend'

tr_data = []
en_data = []

with open(data_path, 'r', encoding='utf-8') as f:
  lines = f.readlines()

for line in lines:
  line, _ = line.strip().split('\t')
  line = remove_contractions(line)
  line = re.sub(r'[^a-zA-Z]', ' ', line)
  line = ' '.join(word.lower() for word in line.split())
  en_data.append(line)

for line in lines:
  _, line = line.strip().split('\t')
  line = re.compile(r'[^a-zA-ZğüşıöçĞÜŞİÖÇ]').sub(' ', line)
  line = ' '.join(word.lower() for word in line.split())
  line = starter_mark + line + end_mark
  tr_data.append(line)


In [19]:
def create_tokenizer(texts, padding, reverse=False):
    tokenizer = Tokenizer(num_words=None)
    tokenizer.fit_on_texts(texts)

    index_to_word = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))

    tokens = tokenizer.texts_to_sequences(texts)

    if reverse:
        tokens = [list(reversed(x)) for x in tokens]
        truncating = 'pre'
    else:
        truncating = 'post'

    num_tokens = [len(x) for x in tokens]
    max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
    max_tokens = int(max_tokens)

    tokens_padded = pad_sequences(
        tokens,
        maxlen=max_tokens,
        padding=padding,
        truncating=truncating
      )

    return tokenizer, index_to_word, tokens_padded, max_tokens

In [20]:
def token_to_word(token, index_to_word):
    word = ' ' if token == 0 else index_to_word[token]
    return word

In [21]:
def tokens_to_string(tokens, index_to_word):
    words = [index_to_word[token] for token in tokens if token != 0]
    text = ' '.join(words)
    return text

In [22]:
def text_to_tokens(text, tokenizer, padding, reverse=False, max_tokens=None):
    tokens = tokenizer.texts_to_sequences([text])
    tokens = np.array(tokens)

    if reverse:
        tokens = np.flip(tokens, axis=1)
        truncating = 'pre'
    else:
        truncating = 'post'

    if max_tokens is not None:
        tokens = pad_sequences(
            tokens,
            maxlen=max_tokens,
            padding=padding,
            truncating=truncating
          )

    return tokens

In [23]:
tokenizer_en, index_to_word_en, tokens_padded_en, max_tokens_en = create_tokenizer(
    texts=en_data,
    padding='pre',
    reverse=True,
)

tokenizer_tr, index_to_word_tr, tokens_padded_tr, max_tokens_tr = create_tokenizer(
    texts=tr_data,
    padding='post',
    reverse=False,
)

In [24]:
token_start = tokenizer_tr.word_index[starter_mark.strip()]
token_end = tokenizer_tr.word_index[end_mark.strip()]

encoder_input_data = tokens_padded_en

decoder_input_data = tokens_padded_tr[:, :-1]
decoder_output_data = tokens_padded_tr[:, 1:]

num_encoder_words = len(tokenizer_en.word_index)
num_decoder_words = len(tokenizer_tr.word_index)

In [25]:
fasttext_tr_vec = {}

fin = io.open(tr_vec_path, 'r', encoding='utf-8').readlines()

for line in fin:
  values = line.split()
  word = values[0]
  vec = np.asarray(values[1:], dtype='float32')
  fasttext_tr_vec[word] = vec

fasttext_embedding_matrix = np.random.uniform(-1, 1, (num_decoder_words, 300))

for word, index in tokenizer_tr.word_index.items():
  if index < num_decoder_words:
    ft_embedding_vector = fasttext_tr_vec.get(word)
    if ft_embedding_vector is not None:
      fasttext_embedding_matrix[index] = ft_embedding_vector

word2vec = {}

with open(glove_path, encoding='utf-8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec

glove_embedding_matrix = np.random.uniform(-1, 1, (num_encoder_words, 100))

for word, index in tokenizer_en.word_index.items():
  if index < num_encoder_words:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      glove_embedding_matrix[index] = embedding_vector

In [26]:
num_units = 512

encoder_input = Input(shape=(None,), name='encoder_input')

encoder_embedding = Embedding(
    input_dim=num_encoder_words,
    output_dim=100,
    weights=[glove_embedding_matrix],
    trainable=True,
    name='encoder_embedding')(encoder_input)

encoder_gru1 = GRU(num_units, return_sequences=True)(encoder_embedding)
encoder_gru2 = GRU(num_units, return_sequences=True)(encoder_gru1)
encoder_gru3 = GRU(num_units, return_sequences=True)(encoder_gru2)
encoder_gru4 = GRU(num_units, return_sequences=True)(encoder_gru3)
encoder_gru5 = GRU(num_units, return_sequences=True)(encoder_gru4)
encoder_final_state = GRU(num_units, return_sequences=False)(encoder_gru5)

encoder_output = encoder_final_state

In [27]:
decoder_initial_state = Input(shape=(num_units,), name='decoder_initial_state')
decoder_input = Input(shape=(None,), name='decoder_input')

decoder_embedding = Embedding(
    input_dim=num_decoder_words,
    output_dim=300,
    weights=[fasttext_embedding_matrix],
    trainable=True,
    name='decoder_embedding'
  )(decoder_input)

decoder_layer1 = GRU(num_units, return_sequences=True)
decoder_layer2 = GRU(num_units, return_sequences=True)
decoder_layer3 = GRU(num_units, return_sequences=True)
decoder_layer4 = GRU(num_units, return_sequences=True)
decoder_layer5 = GRU(num_units, return_sequences=True)
decoder_final_layer = GRU(num_units, return_sequences=True)


decoder_dense = Dense(
    num_decoder_words,
    activation='linear',
    name='decoder_output'
)

def connect_decoder(initial_state):
    con_dec = decoder_layer1(decoder_embedding, initial_state=initial_state)
    con_dec = decoder_layer2(con_dec, initial_state=initial_state)
    con_dec = decoder_layer3(con_dec, initial_state=initial_state)
    con_dec = decoder_layer4(con_dec, initial_state=initial_state)
    con_dec = decoder_layer5(con_dec, initial_state=initial_state)
    con_dec = decoder_final_layer(con_dec, initial_state=initial_state)


    decoder_output = decoder_dense(con_dec)

    return decoder_output

model_train = Model(inputs=[encoder_input, decoder_input], outputs=[connect_decoder(initial_state=encoder_output)])
model_encoder = Model(inputs=[encoder_input], outputs=[encoder_output])
decoder_output = connect_decoder(initial_state=decoder_initial_state)

model_decoder = Model(inputs=[decoder_input, decoder_initial_state], outputs=[decoder_output])

In [28]:
def sparse_cross_entropy(y_true, y_pred):
    loss = nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
    loss_mean = reduce_mean(loss)
    return loss_mean

decoder_target = Input(shape=(None,), dtype=int32)

model_train.compile(optimizer=optimizers.RMSprop(learning_rate=1e-3), loss=sparse_cross_entropy)

model_train.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 encoder_embedding (Embeddi  (None, None, 100)            2014300   ['encoder_input[0][0]']       
 ng)                                                                                              
                                                                                                  
 gru_12 (GRU)                (None, None, 512)            943104    ['encoder_embedding[0][0]']   
                                                                                                  
 gru_13 (GRU)                (None, None, 512)            1575936   ['gru_12[0][0]']        

In [29]:
X_data = {
    'encoder_input': encoder_input_data,
    'decoder_input': decoder_input_data
}

y_data = {'decoder_output': decoder_output_data}

model_train.fit(
    x=X_data, y=y_data,
    batch_size=256,
    epochs=44
)

Epoch 1/44
Epoch 2/44
Epoch 3/44
Epoch 4/44
Epoch 5/44
Epoch 6/44
Epoch 7/44
Epoch 8/44
Epoch 9/44
Epoch 10/44
Epoch 11/44
Epoch 12/44
Epoch 13/44
Epoch 14/44
Epoch 15/44
Epoch 16/44
Epoch 17/44
Epoch 18/44
Epoch 19/44
Epoch 20/44
Epoch 21/44
Epoch 22/44
Epoch 23/44
Epoch 24/44
Epoch 25/44
Epoch 26/44
Epoch 27/44
Epoch 28/44
Epoch 29/44
Epoch 30/44
Epoch 31/44
Epoch 32/44
Epoch 33/44
Epoch 34/44
Epoch 35/44
Epoch 36/44
Epoch 37/44
Epoch 38/44
Epoch 39/44
Epoch 40/44
Epoch 41/44
Epoch 42/44
Epoch 43/44
Epoch 44/44


<keras.src.callbacks.History at 0x7f68cd889420>

In [30]:
def translate(input_text):
    input_tokens = text_to_tokens(
        text=input_text,
        reverse=True,
        padding='pre',
        tokenizer=tokenizer_en
    )

    initial_state = model_encoder.predict(input_tokens)

    max_tokens = max_tokens_tr

    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=np.int)

    token_int = token_start
    output_text = ''
    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:
        decoder_input_data[0, count_tokens] = token_int
        x_data = {'decoder_initial_state': initial_state, 'decoder_input': decoder_input_data}

        decoder_output = model_decoder.predict(x_data)

        token_onehot = decoder_output[0, count_tokens, :]
        token_int = np.argmax(token_onehot)

        sampled_word = token_to_word(token_int, index_to_word_tr)

        output_text += ' ' + sampled_word
        count_tokens += 1

    return output_text

In [32]:
en_texts = [
    'How are you today?', 'Which road leads to the airport?', "Where did tom go", "Drinking water is important", "I like to do sports",
    'How was your weekend?','They take the bus to work','His dog barks loudly','We always eat dinner together',"I like to read books.",
    "She is a good student.","The sun rises in the east.", "My favorite color is blue.","Do you want to go for a walk?",
    "I enjoy listening to music.","He plays the guitar very well.","They went to the park yesterday.", "It is a beautiful day outside.",
    "Can you pass the salt, please?","I need to buy some groceries.","The cat is sitting on the windowsill.","We are going on vacation next week.",
    "I love to eat pizza and pasta.", "The movie starts at 7:00 PM.", "She always helps her friends.","He has a big family.",
    "I want to learn how to cook.", "The dog is chasing its tail.", "I am going to the store to buy some milk.", "The sky is clear today.",
    "I have a red bicycle.", "She enjoys playing the piano.", "They visited the museum yesterday.", "My favorite fruit is strawberries.",
    "We are planning a picnic.", "He can speak three languages.", "The book is on the table.", "Do you like to swim in the sea?",
    "I need to call my friend.", "The birds are singing in the trees.", "She is going to a concert tonight.", "I want to learn how to dance.",
    "The car is parked in the garage.", "They have a cute puppy.", "I am cooking dinner right now.", "The computer is on the desk.",
    "He plays soccer on weekends.", "I like to watch movies at home.", "She has a green umbrella."
]

tranlated_texts_list = []

for text in en_texts:
    text = remove_contractions(text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = ' '.join(word.lower() for word in text.split())
    tranlated_texts_list.append(translate(input_text=text))



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=np.int)




In [33]:
translator = Translator(to_lang="tr")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

text_rouge_df = pd.DataFrame(columns=['text', 'real_translation', 'model_translation', 'rouge_socore'])

for en_text, trans_text in zip(en_texts, tranlated_texts_list):

  real_translate = translator.translate(en_text)

  translated_text_by_model = ' '.join(trans_text.split()[:-1])

  scores = scorer.score(real_translate, translated_text_by_model)

  text_rouge_df.loc[len(text_rouge_df)] = [en_text, real_translate, translated_text_by_model, scores['rougeL'][1]]


In [34]:
text_rouge_df

Unnamed: 0,text,real_translation,model_translation,rouge_socore
0,How are you today?,Bugün nasılsın?,nasıl bir günün değilsin,0.4
1,Which road leads to the airport?,Hangi yol havaalanına gider?,yol hangi yol yolu ile yol alanına gider,0.8
2,Where did tom go,Tom nereye gitti?,nereye tom gitti,0.666667
3,Drinking water is important,İçme suyu önemlidir,su içmek su içmek önemlidir,0.5
4,I like to do sports,Spor yapmayı severim,spor yapmayı severim keşke,1.0
5,How was your weekend?,Hafta sonun nasıl geçti?,hafta sonun nasıl bir hafta geçti,1.0
6,They take the bus to work,İşe otobüsle giderler,otobüse otobüsle giden otobüse giderler,0.6
7,His dog barks loudly,Köpeği yüksek sesle havlıyor,köpeği yüksek sesle müzik kullanarak havladı,0.75
8,We always eat dinner together,Akşam yemeğini hep birlikte yeriz,birlikte her zaman akşam yemeği yemek yiyoruz,0.428571
9,I like to read books.,Kitap okumayı seviyorum.,kitaplar okumayı kitaplar severim,0.333333


In [35]:
print('mean', np.mean(text_rouge_df['rouge_socore']))
print('std', np.std(text_rouge_df['rouge_socore']))
print('max', np.max(text_rouge_df['rouge_socore']))
print('min', np.mean(text_rouge_df['rouge_socore']))

mean 0.6459912536443149
std 0.25526077506545636
max 1.0
min 0.6459912536443149
