In [2]:
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV files under
# /content/drive/MyDrive can be read by path in the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import csv

# Read the three CSV files from the mounted Drive folder
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')
train_ascii = pd.read_csv('/content/drive/MyDrive/final_deascii.csv')
train = pd.read_csv('/content/drive/MyDrive/final_train.csv')

# Lower-case the sentences that are fed to the model as input.
# NOTE(review): train['Sentence'] (Sentence_y after the merge) is NOT
# lower-cased here — confirm that keeping the original case is intended.
test_data['Sentence'] = test_data['Sentence'].str.lower()
train_ascii['Sentence'] = train_ascii['Sentence'].str.lower()

# Join both training frames on ID; columns present in both frames get the
# _x suffix (from train_ascii) and the _y suffix (from train).
merged_data = train_ascii.merge(train, on='ID')

# Processing

In [4]:
# Character-level tokenizer
def letter_tokenization(sentence):
    """Return the characters of `sentence` as a list, one element per character.

    Whitespace and punctuation are kept as tokens of their own; an empty
    string yields an empty list.
    """
    return [symbol for symbol in sentence]

# Split every sentence into characters: Sentence_x and Sentence_y of the
# merged training frame, and Sentence of the test frame.
merged_data['Tokenized_x'] = merged_data['Sentence_x'].map(letter_tokenization)
merged_data['Tokenized_y'] = merged_data['Sentence_y'].map(letter_tokenization)
test_data['Tokenized'] = test_data['Sentence'].map(letter_tokenization)

In [5]:
import numpy as np

# Vocabulary lookup tables, seeded with the special tokens.
# Index 0 is the padding token, 1 the unknown token, 2 the end-of-sequence token.
char_to_index = {'<PAD>': 0, '<UNK>': 1, '<EOS>': 2}

# Reverse mapping (index -> token), derived from the forward table
index_to_char = {index: token for token, index in char_to_index.items()}

# Vocabulary growth helper
def add_to_vocab(chars):
    """Register every not-yet-seen token of `chars` in both lookup tables.

    New tokens receive the next free index (the current vocabulary size).
    Mutates the module-level `char_to_index` and `index_to_char`; returns None.
    """
    for symbol in chars:
        if symbol in char_to_index:
            continue  # already known, keep its existing index
        next_index = len(char_to_index)
        char_to_index[symbol] = next_index
        index_to_char[next_index] = symbol

# Grow the vocabulary from every tokenized sentence. The columns are visited
# in a fixed order (inputs, labels, test) so index assignment stays the same.
for token_column in (merged_data['Tokenized_x'],
                     merged_data['Tokenized_y'],
                     test_data['Tokenized']):
    for token_list in token_column:
        add_to_vocab(token_list)

# Terminate every token sequence with the <EOS> marker
merged_data['Tokenized_x'] = merged_data['Tokenized_x'].apply(lambda seq: [*seq, '<EOS>'])
merged_data['Tokenized_y'] = merged_data['Tokenized_y'].apply(lambda seq: [*seq, '<EOS>'])
test_data['Tokenized'] = test_data['Tokenized'].apply(lambda seq: [*seq, '<EOS>'])

In [6]:
# Convert tokens to indices
def tokens_to_indices(tokens, vocab=None):
    """Map a sequence of tokens to their integer vocabulary indices.

    Tokens that are missing from the vocabulary are mapped to the index of
    the reserved '<UNK>' token instead of raising ``KeyError``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to convert (single characters and special tokens).
    vocab : dict, optional
        Token -> index mapping to use. When omitted, the module-level
        ``char_to_index`` table is used.

    Returns
    -------
    list of int
        One index per input token, in the same order.

    Raises
    ------
    KeyError
        If the vocabulary itself has no '<UNK>' entry.
    """
    if vocab is None:
        vocab = char_to_index
    # Looked up once so the fallback index is not re-resolved per token
    unknown_index = vocab['<UNK>']
    return [vocab.get(token, unknown_index) for token in tokens]

# Turn the token lists of the training inputs (x) and labels (y) into
# lists of vocabulary indices.
merged_data['Indexed_x'] = merged_data['Tokenized_x'].map(tokens_to_indices)
merged_data['Indexed_y'] = merged_data['Tokenized_y'].map(tokens_to_indices)

# Training

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense

# Hyper-parameters
vocab_size = len(char_to_index)   # one output class per vocabulary entry
embedding_dim = 100
hidden_units = 64
dropout_rate = 0.2

# Build the network layer by layer: embedding -> bidirectional LSTM ->
# dropout -> per-time-step softmax over the vocabulary.
model = Sequential()
# mask_zero=True: index 0 is the <PAD> token of the vocabulary
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# return_sequences=True keeps one output per input position
model.add(Bidirectional(LSTM(units=hidden_units, return_sequences=True)))
model.add(Dropout(dropout_rate))
model.add(Dense(units=vocab_size, activation='softmax'))

In [10]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length used for padding.
# Original author's note: train ~ 1800, test ~ 1100 (presumably the longest
# sequence lengths — TODO confirm).
# NOTE(review): with pad_sequences' default truncating='pre', sequences
# longer than max_length lose their leading tokens — confirm this is acceptable.
max_length = 1200

# Post-pad inputs and labels to max_length and make sure both are numpy arrays
padded_input, padded_label = (
    np.array(pad_sequences(merged_data[column], maxlen=max_length, padding='post'))
    for column in ('Indexed_x', 'Indexed_y')
)

In [11]:
# Configure the optimizer, loss and reported metric.
# The sparse loss is used because the labels are integer indices, not one-hot.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the padded training data, holding out 20% for validation.
# NOTE(review): validation_split presumably takes the held-out part from the
# end of the arrays without shuffling — confirm the data order makes that a
# representative sample.
history = model.fit(
    x=padded_input,
    y=padded_label,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [12]:
# Make predictions on a subset of the training set (first 4 examples)
n_examples = 4
train_predictions = model.predict(padded_input[0:n_examples])

# Show input, label and decoded prediction for each of those examples
for idx in range(n_examples):
    print("Example", idx+1)
    # Index 0 is padding and is skipped when rendering input and label
    print("Input:", ' '.join(index_to_char[i] for i in padded_input[idx] if i != 0))
    print("Label:", ' '.join(index_to_char[i] for i in padded_label[idx] if i != 0))

    # Most probable vocabulary index at every time step
    predicted_indices = train_predictions[idx].argmax(axis=1)

    # Collect characters until the first padding index or <EOS> token
    predicted_sentence = []
    for i in predicted_indices:
        if i == 0 or index_to_char[i] == '<EOS>':
            break
        predicted_sentence.append(index_to_char[i])

    print("Prediction:", ''.join(predicted_sentence))
    print()

Example 1
Input: e k o n o m i <EOS>
Label: e k o n o m i <EOS>
Prediction: Ekonomi

Example 2
Input: g i r i s i <EOS>
Label: g i r i ş i <EOS>
Prediction: Girisı

Example 3
Input: g u n c e l l e m e <EOS>
Label: g ü n c e l l e m e <EOS>
Prediction: güncelleme

Example 4
Input: m i t <EOS>
Label: m i t <EOS>
Prediction: Mit



In [15]:
# Reload the test set and lower-case its sentences
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')
test_data['Sentence'] = test_data['Sentence'].str.lower()

# Character-tokenize each sentence and terminate it with <EOS>
test_data['tokenized_input'] = test_data['Sentence'].apply(
    lambda sentence: letter_tokenization(sentence) + ['<EOS>']
)

# Map tokens to vocabulary indices
test_data['indexed_input'] = test_data['tokenized_input'].apply(tokens_to_indices)

# Post-pad to the same fixed length that was used for training
padded_test_input = pad_sequences(test_data['indexed_input'], maxlen=max_length, padding='post')

# Run the model on the padded test sequences
test_predictions = model.predict(padded_test_input)


def _decode_prediction(probabilities):
    """Turn one per-time-step probability matrix into a string.

    Takes the argmax index at each step and stops at the first padding
    index (0) or <EOS> token.
    """
    decoded_chars = []
    for index in probabilities.argmax(axis=1):
        if index == 0 or index_to_char[index] == '<EOS>':
            break
        decoded_chars.append(index_to_char[index])
    return ''.join(decoded_chars)


# Decode every prediction and store the result next to its input
predicted_sentences = [_decode_prediction(prediction) for prediction in test_predictions]
test_data['Predicted_Sentence'] = predicted_sentences



In [16]:
# Display the last 30 rows of test_data to inspect each input sentence
# next to its Predicted_Sentence.
test_data.tail(30)

Unnamed: 0,ID,Sentence,tokenized_input,indexed_input,Predicted_Sentence
1127,1127,tecavuz sucundan hapse giren suclular genellik...,"[t, e, c, a, v, u, z, , s, u, c, u, n, d, a, ...","[15, 3, 13, 16, 18, 12, 24, 31, 11, 12, 13, 12...",tecavuz sucundan hapse giren suçlular genellik...
1128,1128,tekirdagin burgulusu cok meshurdur,"[t, e, k, i, r, d, a, g, i, n, , b, u, r, g, ...","[15, 3, 4, 8, 10, 22, 16, 9, 8, 6, 31, 19, 12,...",tekirdağın burgulüşü çok meshurdur
1129,1129,tencereleri bulasik makinesine yerlestirirken ...,"[t, e, n, c, e, r, e, l, e, r, i, , b, u, l, ...","[15, 3, 6, 13, 3, 10, 3, 14, 3, 10, 8, 31, 19,...",tencereleri bulaşık makinesine yerleştirirken ...
1130,1130,toplanilan isle murekkep yapardik,"[t, o, p, l, a, n, i, l, a, n, , i, s, l, e, ...","[15, 5, 23, 14, 16, 6, 8, 14, 16, 6, 31, 8, 11...",toplanılan işle mürekkep yapardık
1131,1131,trendeki kiza asilmasi cok buyuk ayipti,"[t, r, e, n, d, e, k, i, , k, i, z, a, , a, ...","[15, 10, 3, 6, 22, 3, 4, 8, 31, 4, 8, 24, 16, ...",trendeki kıza aşılması çok büyük ayıptı
1132,1132,tulbent uzerine suzeni isinde buyuk bir ustali...,"[t, u, l, b, e, n, t, , u, z, e, r, i, n, e, ...","[15, 12, 14, 19, 3, 6, 15, 31, 12, 24, 3, 10, ...",tülbent üzerine süzeni isinde büyük bir ustalı...
1133,1133,turkiye ibisin anavatanindan biridir,"[t, u, r, k, i, y, e, , i, b, i, s, i, n, , ...","[15, 12, 10, 4, 8, 21, 3, 31, 8, 19, 8, 11, 8,...",türkiye ibisin anavatanından biridir
1134,1134,uykusunda bile fosurdatirdi,"[u, y, k, u, s, u, n, d, a, , b, i, l, e, , ...","[12, 21, 4, 12, 11, 12, 6, 22, 16, 31, 19, 8, ...",uykusunda bile fosurdatırdı
1135,1135,uzakdogudaki buruk gelenegi ulkeden ulkeye deg...,"[u, z, a, k, d, o, g, u, d, a, k, i, , b, u, ...","[12, 24, 16, 4, 22, 5, 9, 12, 22, 16, 4, 8, 31...",uzakdoğudaki büruk geleneği ülkeden ülkeye değ...
1136,1136,uzun zaman sonra bir araya gelip iki lafin bel...,"[u, z, u, n, , z, a, m, a, n, , s, o, n, r, ...","[12, 24, 12, 6, 31, 24, 16, 7, 16, 6, 31, 11, ...",uzun zaman sonra bir araya gelip iki lafin bel...
