In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive so the CSV files read later in this
# notebook (under /content/drive/MyDrive) are reachable from the runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import csv

# Read the three CSV files from the mounted Drive folder
train_data = pd.read_csv('/content/drive/MyDrive/train.csv')
asciified_data = pd.read_csv('/content/drive/MyDrive/train_ascii.csv')
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

# Join the asciified rows (left) with the original rows (right) on ID.
# Columns present in both frames receive pandas' default suffixes:
# `_x` for the asciified frame and `_y` for the original one.
merged_data = asciified_data.merge(train_data, on='ID')

# Processing

In [3]:
# Split a sentence into its individual characters
def letter_tokenization(sentence):
    """Return the characters of `sentence` as a list, one item per character."""
    return [character for character in sentence]

# Character-tokenize every sentence column.
# Each entry is (frame, source column, column that receives the token lists).
for frame, source_col, target_col in (
    (merged_data, 'Sentence_x', 'Tokenized_x'),
    (merged_data, 'Sentence_y', 'Tokenized_y'),
    (test_data, 'Sentence', 'Tokenized'),
):
    frame[target_col] = frame[source_col].apply(letter_tokenization)

In [4]:
import numpy as np

# Character vocabulary: the three reserved tokens take the first three ids
char_to_index = {'<PAD>': 0, '<UNK>': 1, '<EOS>': 2}
index_to_char = {token_id: token for token, token_id in char_to_index.items()}

# Register previously unseen characters in both lookup tables
def add_to_vocab(chars):
    """Give every character in `chars` that is not yet known the next free id.

    Updates the module-level `char_to_index` and `index_to_char` tables in
    place; ids are handed out in first-seen order. Returns None.
    """
    for symbol in chars:
        if symbol in char_to_index:
            continue
        next_id = len(char_to_index)
        char_to_index[symbol] = next_id
        index_to_char[next_id] = symbol

# Grow the vocabulary from every tokenized column, in this order:
# asciified sentences, original sentences, then the test sentences.
for token_column in (
    merged_data['Tokenized_x'],
    merged_data['Tokenized_y'],
    test_data['Tokenized'],
):
    token_column.apply(add_to_vocab)

# Terminate every token list with the <EOS> marker
for frame, column in (
    (merged_data, 'Tokenized_x'),
    (merged_data, 'Tokenized_y'),
    (test_data, 'Tokenized'),
):
    frame[column] = frame[column].apply(lambda tokens: tokens + ['<EOS>'])

In [5]:
# Convert tokens to indices
def tokens_to_indices(tokens):
    """Map each token in `tokens` to its id in `char_to_index`.

    Tokens that are missing from the vocabulary are mapped to the id of the
    reserved '<UNK>' token instead of raising KeyError. This matters for text
    that is transformed (e.g. lowercased) after the vocabulary was built,
    which can introduce characters that were never registered.

    Parameters:
        tokens: iterable of single-character strings and/or special tokens.

    Returns:
        list of int ids, one per token, in the same order.
    """
    unk_index = char_to_index['<UNK>']
    return [char_to_index.get(char, unk_index) for char in tokens]

# Turn the token lists of both training columns into lists of vocabulary ids
for source_col, target_col in (
    ('Tokenized_x', 'Indexed_x'),
    ('Tokenized_y', 'Indexed_y'),
):
    merged_data[target_col] = merged_data[source_col].apply(tokens_to_indices)

# Training

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout

# Hyperparameters
vocab_size = len(char_to_index)  # one output class per vocabulary entry
embedding_dim = 100
hidden_units = 64

# Build the network layer by layer. Every recurrent layer returns the full
# sequence, so the final softmax produces one distribution over the
# vocabulary for each input position.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
model.add(Bidirectional(LSTM(units=hidden_units, return_sequences=True)))
model.add(Dropout(0.2))
model.add(LSTM(units=hidden_units, return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(units=vocab_size, activation='softmax'))

In [9]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length shared by every padded sequence
max_length = 1200  # train ~ 1800, test ~ 1100

# Pad at the end ('post') up to max_length and store the results as numpy
# arrays; input and label go through the exact same call so they stay aligned.
padded_input = np.array(
    pad_sequences(merged_data['Indexed_x'], maxlen=max_length, padding='post')
)
padded_label = np.array(
    pad_sequences(merged_data['Indexed_y'], maxlen=max_length, padding='post')
)

In [11]:
# Configure optimiser, loss and the metric reported during training
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit on the padded pairs; validation_split=0.2 holds out a fifth of the
# samples for validation
history = model.fit(
    padded_input,
    padded_label,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Predictions

In [12]:
# Make predictions on a small subset of the training set.
# The subset size lives in one place so the slice and the display loop
# cannot drift apart.
num_examples = 4
train_predictions = model.predict(padded_input[:num_examples])

# Display examples. Iterating over the returned predictions (instead of a
# fixed range) keeps this working when fewer than num_examples rows exist.
for idx, example_probabilities in enumerate(train_predictions):
    print("Example", idx+1)
    print("Input:", ' '.join([index_to_char[i] for i in padded_input[idx] if i != 0]))  # Remove padding
    print("Label:", ' '.join([index_to_char[i] for i in padded_label[idx] if i != 0]))  # Remove padding
    # Most likely vocabulary id at every position of the current example
    predicted_indices = example_probabilities.argmax(axis=1)
    # Collect characters until the first padding id or <EOS> token
    predicted_sentence = []
    for i in predicted_indices:
        if i == 0 or index_to_char[i] == '<EOS>':
            break
        predicted_sentence.append(index_to_char[i])
    print("Prediction:", ''.join(predicted_sentence))
    print()

Example 1
Input: s i n i f   ,   h a v u z   v e   a c i k   d e n i z   c a l i s m a l a r i y l a   ,   t u m   d u n y a d a   g e c e r l i   ,   b a s a r i l i   b i r   s t a n d a r t   o l u s t u r m u s t u r   .   <EOS>
Label: s ı n ı f   ,   h a v u z   v e   a ç ı k   d e n i z   ç a l ı ş m a l a r ı y l a   ,   t ü m   d ü n y a d a   g e ç e r l i   ,   b a ş a r ı l ı   b i r   s t a n d a r t   o l u ş t u r m u ş t u r   .   <EOS>
Prediction: sınif , havuz ve açık deniz çalışmalarıyla , tüm dünyada geçerli , başarılı bir standart oluşturmuştur . 

Example 2
Input: b u   s t a n d a r t   ,   s u a l t i n d a   k e n d i n i   r a h a t   h i s s e d e n   h e r k e s i n   ,   s p o r t i f   d a l i c i   o l a b i l e c e g i n i   ,   b u n u n   i c i n   f i z i k i   g u c   v e   c o k   i y i   y u z m e   b i l m e n i n   s a r t   o l m a d i g i n i   s a v u n u r   .   <EOS>
Label: b u   s t a n d a r t   ,   s u a l t ı n d a   k e n d i n i   r a h

In [14]:
# Load the test dataset. The path uses the same 'MyDrive' folder spelling as
# the other CSV reads in this notebook.
test_data = pd.read_csv('/content/drive/MyDrive/test.csv')

# Lowercase the sentences before tokenization
test_data['Sentence'] = test_data['Sentence'].str.lower()

# Tokenize test sentences into characters
test_data['tokenized_input'] = test_data['Sentence'].apply(letter_tokenization)

# Terminate every token list with the <EOS> marker
test_data['tokenized_input'] = test_data['tokenized_input'].apply(lambda x: x + ['<EOS>'])

# Convert test sentences to indices
test_data['indexed_input'] = test_data['tokenized_input'].apply(tokens_to_indices)

# Pad test sequences to the same fixed length used for training
padded_test_input = pad_sequences(test_data['indexed_input'], maxlen=max_length, padding='post')

# Predict labels for test data
test_predictions = model.predict(padded_test_input)

# Convert predicted indices to sentences. The argmax over the last axis is
# taken once for the whole batch, giving one row of ids per sentence.
predicted_sentences = []
for predicted_indices in test_predictions.argmax(axis=-1):
    predicted_sentence = []
    for i in predicted_indices:
        # Stop at the first padding id or <EOS> token
        if i == 0 or index_to_char[i] == '<EOS>':
            break
        predicted_sentence.append(index_to_char[i])
    predicted_sentences.append(''.join(predicted_sentence))

# Add predicted sentences to test_data
test_data['Predicted_Sentence'] = predicted_sentences



In [16]:
# Preview the first 15 test rows, including the Predicted_Sentence column
test_data.head(15)

Unnamed: 0,ID,Sentence,tokenized_input,indexed_input,Predicted_Sentence
0,0,tr ekonomi ve politika haberleri turkiye nin ...,"[ , t, r, , e, k, o, n, o, m, i, , v, e, , ...","[7, 22, 20, 7, 14, 16, 25, 5, 25, 19, 4, 7, 11...",tr ekonomi ve politika haberleri türkiye nin ...
1,1,uye girisi,"[ , u, y, e, , g, i, r, i, s, i, , <EOS>]","[7, 12, 21, 14, 7, 23, 4, 20, 4, 3, 4, 7, 2]",üye girişi
2,2,son guncelleme 12:12,"[ , s, o, n, , g, u, n, c, e, l, l, e, m, e, ...","[7, 3, 25, 5, 7, 23, 12, 5, 15, 14, 18, 18, 14...",son güncelleme 12:12
3,3,imrali mit gorusmesi ihtiyac duyuldukca oluyor,"[ , i, m, r, a, l, i, , m, i, t, , g, o, r, ...","[7, 4, 19, 20, 10, 18, 4, 7, 19, 4, 22, 7, 23,...",imrali mit görüşmesi ihtiyaç duyuldukça oluyor
4,4,suriye deki silahli selefi muhalifler yeni ku...,"[ , s, u, r, i, y, e, , d, e, k, i, , s, i, ...","[7, 3, 12, 20, 4, 21, 14, 7, 17, 14, 16, 4, 7,...",suriye deki silahlı selefi muhalifler yeni ku...
5,5,ancak olum haberleri savastan cok tek tarafli...,"[ , a, n, c, a, k, , o, l, u, m, , h, a, b, ...","[7, 10, 5, 15, 10, 16, 7, 25, 18, 12, 19, 7, 9...",ancak ölüm haberleri savaştan çok tek taraflı...
6,6,israil in 4 uncu gunune giren gazze saldirila...,"[ , i, s, r, a, i, l, , i, n, , 4, , u, n, ...","[7, 4, 3, 20, 10, 4, 18, 7, 4, 5, 7, 31, 7, 12...",israil in 4 üncü gününe giren gazze saldırıla...
7,7,serbes: memecan sen mizahci misin,"[ , s, e, r, b, e, s, :, , m, e, m, e, c, a, ...","[7, 3, 14, 20, 24, 14, 3, 49, 7, 19, 14, 19, 1...",serbes: memecan sen mizahçı misın
8,8,muslum gurses yogun bakimda,"[ , m, u, s, l, u, m, , g, u, r, s, e, s, , ...","[7, 19, 12, 3, 18, 12, 19, 7, 23, 12, 20, 3, 1...",müslüm gürses yoğun bakımda
9,9,takip et: wwwradikalcomtr,"[ , t, a, k, i, p, , e, t, :, , w, w, w, r, ...","[7, 22, 10, 16, 4, 27, 7, 14, 22, 49, 7, 54, 5...",takip et: wwwradikalcomtr
