In [None]:
from google.colab import drive

# Mount Google Drive (Colab-only helper) at /content/drive; the CSV files read
# later in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the three CSV files from the mounted Drive
train = pd.read_csv('/content/drive/MyDrive/first_10k_rows.csv')
train_ascii = pd.read_csv('/content/drive/MyDrive/first_10k_rows_ascii.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

# Lower-case the 'Sentence' column of the ASCII training frame and the test frame
train_ascii = train_ascii.assign(Sentence=train_ascii['Sentence'].str.lower())
test_data = test_data.assign(Sentence=test_data['Sentence'].str.lower())

# Join both training frames on 'ID'. The clashing 'Sentence' columns come out
# as Sentence_x (from train_ascii) and Sentence_y (from train).
merged_data = train_ascii.merge(train, on='ID')

In [None]:
# Character-level tokenizer
def letter_tokenization(sentence):
    """Split ``sentence`` into a list of its individual characters.

    Every character, including spaces and punctuation, becomes its own token.
    """
    return [symbol for symbol in sentence]

# Character-tokenize the ASCII sentences (Sentence_x), the target sentences
# (Sentence_y) and the test sentences, storing each result in its own column
merged_data['Tokenized_x'] = merged_data['Sentence_x'].map(letter_tokenization)
merged_data['Tokenized_y'] = merged_data['Sentence_y'].map(letter_tokenization)
test_data['Tokenized'] = test_data['Sentence'].map(letter_tokenization)

In [None]:
import numpy as np

# Create vocabulary: two mirrored lookup tables, seeded with the special tokens.
# Indices 0, 1 and 2 are taken by padding, unknown and end-of-sequence.
char_to_index = {token: position for position, token in enumerate(('<PAD>', '<UNK>', '<EOS>'))}
index_to_char = {position: token for token, position in char_to_index.items()}

# Register unseen characters in both lookup tables
def add_to_vocab(chars):
    """Add every token in ``chars`` that is not yet known to the vocabulary.

    Each new token receives the next free index; ``char_to_index`` and
    ``index_to_char`` are updated together so they stay mirror images.
    """
    for char in chars:
        if char in char_to_index:
            continue
        next_index = len(char_to_index)
        char_to_index[char] = next_index
        index_to_char[next_index] = char

# Create vocabulary from tokenized input and labels
# .apply is used here only for its side effect on char_to_index / index_to_char;
# the Series it returns is discarded.
# Order matters: add_to_vocab gives each unseen character the next free index,
# so running these calls in a different order changes the index assignment.
# The test characters are registered too, so indexing the test set finds every key.
merged_data['Tokenized_x'].apply(add_to_vocab)
merged_data['Tokenized_y'].apply(add_to_vocab)
test_data['Tokenized'].apply(add_to_vocab)

# Add <EOS> token to the end of each sentence
def _append_eos(tokens):
    """Return ``tokens`` terminated by exactly one '<EOS>' marker.

    A list that already ends with '<EOS>' is returned unchanged, so running
    this cell more than once does not stack extra end-of-sequence tokens.
    """
    if tokens[-1:] == ['<EOS>']:
        return tokens
    return tokens + ['<EOS>']

merged_data['Tokenized_x'] = merged_data['Tokenized_x'].apply(_append_eos)
merged_data['Tokenized_y'] = merged_data['Tokenized_y'].apply(_append_eos)
test_data['Tokenized'] = test_data['Tokenized'].apply(_append_eos)

In [None]:
# Convert tokens to indices
def tokens_to_indices(tokens, vocab=None):
    """Map each token to its integer vocabulary index.

    Args:
        tokens: iterable of tokens (single characters or special tokens
            such as '<EOS>').
        vocab: optional token -> index mapping. Defaults to the notebook-level
            ``char_to_index``. It must contain the '<UNK>' entry.

    Returns:
        A list with one index per token. Tokens missing from the vocabulary
        map to the '<UNK>' index instead of raising ``KeyError``, so text with
        characters never seen while building the vocabulary can still be encoded.
    """
    if vocab is None:
        vocab = char_to_index
    unk_index = vocab['<UNK>']
    return [vocab.get(token, unk_index) for token in tokens]

# Turn the token lists of the training inputs and labels into index lists
merged_data['Indexed_x'] = merged_data['Tokenized_x'].map(tokens_to_indices)
merged_data['Indexed_y'] = merged_data['Tokenized_y'].map(tokens_to_indices)

In [None]:
# Preview the first rows: raw sentences, token lists and index lists side by side
merged_data.head()

Unnamed: 0,ID,Sentence_x,Sentence_y,Tokenized_x,Tokenized_y,Indexed_x,Indexed_y
0,0,"sinif , havuz ve acik deniz calismalariyla , t...","sınıf , havuz ve açık deniz çalışmalarıyla , t...","[s, i, n, i, f, , ,, , h, a, v, u, z, , v, ...","[s, ı, n, ı, f, , ,, , h, a, v, u, z, , v, ...","[3, 4, 5, 4, 6, 7, 8, 7, 9, 10, 11, 12, 13, 7,...","[3, 103, 5, 103, 6, 7, 8, 7, 9, 10, 11, 12, 13..."
1,1,"bu standart , sualtinda kendini rahat hisseden...","bu standart , sualtında kendini rahat hisseden...","[b, u, , s, t, a, n, d, a, r, t, , ,, , s, ...","[b, u, , s, t, a, n, d, a, r, t, , ,, , s, ...","[24, 12, 7, 3, 22, 10, 5, 17, 10, 20, 22, 7, 8...","[24, 12, 7, 3, 22, 10, 5, 17, 10, 20, 22, 7, 8..."
2,2,"yapilan arastirmalar , ogrencilerin mevcut dal...","yapılan araştırmalar , öğrencilerin mevcut dal...","[y, a, p, i, l, a, n, , a, r, a, s, t, i, r, ...","[y, a, p, ı, l, a, n, , a, r, a, ş, t, ı, r, ...","[21, 10, 27, 4, 18, 10, 5, 7, 10, 20, 10, 3, 2...","[21, 10, 27, 103, 18, 10, 5, 7, 10, 20, 10, 10..."
3,3,"pdic ogrencilerinde , psikolojik egitim ve yet...","pdıc öğrencilerinde , psikolojik eğitim ve yet...","[p, d, i, c, , o, g, r, e, n, c, i, l, e, r, ...","[p, d, ı, c, , ö, ğ, r, e, n, c, i, l, e, r, ...","[27, 17, 4, 15, 7, 25, 23, 20, 14, 5, 15, 4, 1...","[27, 17, 103, 15, 7, 108, 107, 20, 14, 5, 15, ..."
4,4,"pdic egitiminin sagladigi guven ve rahatlik , ...","pdıc eğitiminin sağladığı güven ve rahatlık , ...","[p, d, i, c, , e, g, i, t, i, m, i, n, i, n, ...","[p, d, ı, c, , e, ğ, i, t, i, m, i, n, i, n, ...","[27, 17, 4, 15, 7, 14, 23, 4, 22, 4, 19, 4, 5,...","[27, 17, 103, 15, 7, 14, 107, 4, 22, 4, 19, 4,..."


# MODEL
Trained on the first 10k rows of the training data.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, GRU, Dropout, Activation

# Hyperparameters
vocab_size = len(char_to_index)  # also the number of output classes
embedding_dim = 100
hidden_units = 128
dropout_rate = 0.2

# Build the network one layer at a time:
# embedding (index 0 masked as padding) -> bidirectional GRU returning one
# output per position -> dropout -> per-position softmax over the vocabulary
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
model.add(Bidirectional(GRU(units=hidden_units, return_sequences=True)))
model.add(Dropout(dropout_rate))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every index sequence to one fixed length (zeros appended at the end)
# and keep the results as numpy arrays
max_length = 1200  # train ~ 1800, test ~ 1100
padded_input = np.array(
    pad_sequences(merged_data['Indexed_x'], maxlen=max_length, padding='post')
)
padded_label = np.array(
    pad_sequences(merged_data['Indexed_y'], maxlen=max_length, padding='post')
)

In [None]:
# Compile model
# The labels are integer class indices (padded_label), not one-hot vectors,
# hence the sparse categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
# validation_split=0.2 holds out 20% of the samples for validation;
# `history` keeps the object returned by fit.
history = model.fit(padded_input, padded_label, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [None]:
# Sanity check: run the model on the first 4 training examples
train_predictions = model.predict(padded_input[:4])

# Print input, expected label and model output for each of those examples
for idx in range(4):
    print("Example", idx + 1)

    # Padding positions (index 0) are dropped before mapping indices back to text
    input_chars = [index_to_char[token_id] for token_id in padded_input[idx] if token_id != 0]
    print("Input:", ''.join(input_chars))
    label_chars = [index_to_char[token_id] for token_id in padded_label[idx] if token_id != 0]
    print("Label:", ''.join(label_chars))

    # Take the most probable index per position and stop decoding at the
    # first padding index or <EOS> token
    decoded_chars = []
    for token_id in train_predictions[idx].argmax(axis=1):
        if token_id == 0 or index_to_char[token_id] == '<EOS>':
            break
        decoded_chars.append(index_to_char[token_id])
    print("Prediction:", ''.join(decoded_chars))
    print()

Example 1
Input: sinif , havuz ve acik deniz calismalariyla , tum dunyada gecerli , basarili bir standart olusturmustur . <EOS>
Label: sınıf , havuz ve açık deniz çalışmalarıyla , tüm dünyada geçerli , başarılı bir standart oluşturmuştur . <EOS>
Prediction: sınif , havuz ve açık deniz çalışmalarıyla , tüm dünyada geçerli , başarılı bir standart oluşturmuştur . 

Example 2
Input: bu standart , sualtinda kendini rahat hisseden herkesin , sportif dalici olabilecegini , bunun icin fiziki guc ve cok iyi yuzme bilmenin sart olmadigini savunur . <EOS>
Label: bu standart , sualtında kendini rahat hisseden herkesin , sportif dalıcı olabileceğini , bunun için fiziki güç ve cok iyi yüzme bilmenin şart olmadığını savunur . <EOS>
Prediction: bu standart , şualtında kendini rahat hisseden herkesin , sportif dalicı olabileceğini , bunun için fiziki güç ve çok iyi yüzme bilmenin sart olmadığını savunur . 

Example 3
Input: yapilan arastirmalar , ogrencilerin mevcut dalis kurslarini tamamladiktan sonra

In [None]:
# Reload the test set and lower-case its sentences
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')
test_data['Sentence'] = test_data['Sentence'].str.lower()

# Character-tokenize every sentence and terminate it with <EOS>
test_data['tokenized_input'] = test_data['Sentence'].apply(
    lambda text: letter_tokenization(text) + ['<EOS>']
)

# Map tokens to vocabulary indices
test_data['indexed_input'] = test_data['tokenized_input'].apply(tokens_to_indices)

# Pad to the same fixed length that was used for training
padded_test_input = pad_sequences(test_data['indexed_input'], maxlen=max_length, padding='post')

# Run the model over the whole test set
test_predictions = model.predict(padded_test_input)

# Decode each prediction: most probable index per position, stopping at the
# first padding index or <EOS> token
predicted_sentences = []
for probabilities in test_predictions:
    decoded_chars = []
    for token_id in probabilities.argmax(axis=1):
        if token_id == 0 or index_to_char[token_id] == '<EOS>':
            break
        decoded_chars.append(index_to_char[token_id])
    predicted_sentences.append(''.join(decoded_chars))

# Store the decoded sentences next to their inputs
test_data['Predicted_Sentence'] = predicted_sentences



In [None]:
# Preview up to the first 35 test rows together with their predicted sentences
test_data.head(35)

Unnamed: 0,ID,Sentence,tokenized_input,indexed_input,Predicted_Sentence
0,0,tr ekonomi ve politika haberleri turkiye nin ...,"[ , t, r, , e, k, o, n, o, m, i, , v, e, , ...","[7, 22, 20, 7, 14, 16, 25, 5, 25, 19, 4, 7, 11...",tr ekonomi ve politika haberleri türkiye nin ...
1,1,uye girisi,"[ , u, y, e, , g, i, r, i, s, i, , <EOS>]","[7, 12, 21, 14, 7, 23, 4, 20, 4, 3, 4, 7, 2]",üye girişi
2,2,son guncelleme 12:12,"[ , s, o, n, , g, u, n, c, e, l, l, e, m, e, ...","[7, 3, 25, 5, 7, 23, 12, 5, 15, 14, 18, 18, 14...",son güncelleme 12:12
3,3,imrali mit gorusmesi ihtiyac duyuldukca oluyor,"[ , i, m, r, a, l, i, , m, i, t, , g, o, r, ...","[7, 4, 19, 20, 10, 18, 4, 7, 19, 4, 22, 7, 23,...",imrali mit görüşmesi ihtiyaç duyuldukça oluyor
4,4,suriye deki silahli selefi muhalifler yeni ku...,"[ , s, u, r, i, y, e, , d, e, k, i, , s, i, ...","[7, 3, 12, 20, 4, 21, 14, 7, 17, 14, 16, 4, 7,...",süriye deki silahli selefi muhalifler yeni ku...
5,5,ancak olum haberleri savastan cok tek tarafli...,"[ , a, n, c, a, k, , o, l, u, m, , h, a, b, ...","[7, 10, 5, 15, 10, 16, 7, 25, 18, 12, 19, 7, 9...",ancak ölum haberleri savastan çok tek taraflı...
6,6,israil in 4 uncu gunune giren gazze saldirila...,"[ , i, s, r, a, i, l, , i, n, , 4, , u, n, ...","[7, 4, 3, 20, 10, 4, 18, 7, 4, 5, 7, 31, 7, 12...",israil in 4 üncü gününe giren gazze saldırıla...
7,7,serbes: memecan sen mizahci misin,"[ , s, e, r, b, e, s, :, , m, e, m, e, c, a, ...","[7, 3, 14, 20, 24, 14, 3, 49, 7, 19, 14, 19, 1...",serbes: memecan sen mizahci misin
8,8,muslum gurses yogun bakimda,"[ , m, u, s, l, u, m, , g, u, r, s, e, s, , ...","[7, 19, 12, 3, 18, 12, 19, 7, 23, 12, 20, 3, 1...",muşlüm gürses yoğun bakımda
9,9,takip et: wwwradikalcomtr,"[ , t, a, k, i, p, , e, t, :, , w, w, w, r, ...","[7, 22, 10, 16, 4, 27, 7, 14, 22, 49, 7, 54, 5...",takip et: wwwradikalcomtr


In [None]:
import numpy as np

# Define the sentence
sentence = 'cekoslavakyalilastiramadiklarimizdansiniz'

# Character-tokenize the lower-cased sentence and terminate it with <EOS>
tokenized_input = letter_tokenization(sentence.lower())
tokenized_input.append('<EOS>')

# Convert tokens to indices
indexed_input = tokens_to_indices(tokenized_input)

# Pad to the fixed length used for training.
# Kept under its own name on purpose: binding this one-row array to
# `padded_input` would overwrite the training inputs and break any re-run of
# the training or training-sample prediction cells.
padded_sentence = pad_sequences([indexed_input], maxlen=max_length, padding='post')

# Predict label
prediction = model.predict(padded_sentence)

# Convert predicted indices to sentence: most probable index per position,
# stopping at the first padding index or <EOS> token
predicted_sentence = []
for i in prediction[0].argmax(axis=1):
    if i == 0:  # Stop at padding
        break
    if index_to_char[i] == '<EOS>':  # Stop at <EOS>
        break
    predicted_sentence.append(index_to_char[i])

predicted_sentence = ''.join(predicted_sentence)
print("Predicted Sentence:", predicted_sentence)

Predicted Sentence: çekoslavakyalılaştıramadıklarımızdansınız


In [None]:
import numpy as np

# Define the sentence
sentence = 'butun ınsanlar hur , haysiyet ve haklar bakimindan esit dogarlar , akil ve vicdana sahiptirler ve birbirlerine karsi kardeslik zihniyeti ile hareket etmelidirler .'

# Character-tokenize the lower-cased sentence and terminate it with <EOS>
tokenized_input = letter_tokenization(sentence.lower())
tokenized_input.append('<EOS>')

# Convert tokens to indices
indexed_input = tokens_to_indices(tokenized_input)

# Pad to the fixed length used for training.
# Kept under its own name on purpose: binding this one-row array to
# `padded_input` would overwrite the training inputs and break any re-run of
# the training or training-sample prediction cells.
padded_sentence = pad_sequences([indexed_input], maxlen=max_length, padding='post')

# Predict label
prediction = model.predict(padded_sentence)

# Convert predicted indices to sentence: most probable index per position,
# stopping at the first padding index or <EOS> token
predicted_sentence = []
for i in prediction[0].argmax(axis=1):
    if i == 0:  # Stop at padding
        break
    if index_to_char[i] == '<EOS>':  # Stop at <EOS>
        break
    predicted_sentence.append(index_to_char[i])

predicted_sentence = ''.join(predicted_sentence)
print("Predicted Sentence:", predicted_sentence)

Predicted Sentence: bütün insanlar hur , haysiyet ve haklar bakımından esit doğarlar . akil ve vicdana sahiptirler ve birbirlerine karşı kardeşlik zihniyeti ile hareket etmelidirler .
