<a href="https://colab.research.google.com/github/gittymarina/merogit/blob/master/LANGUAGE_TRANSLATION_USING_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LOADING OUR DATASET

In [None]:
import pandas as pd
# Load the English/French sentence pairs from a CSV file located in the
# current working directory.
df=pd.read_csv("eng_-french.csv")
# Display the loaded frame as the cell output.
df

In [None]:
# Preview the first rows of the raw data.
df.head()

DATA PRE-PROCESSING

In [None]:
import re
import string
from unicodedata import normalize

In [None]:
# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_NON_PRINTABLE_RE = re.compile('[^%s]' % re.escape(string.printable))
_DIGITS_RE = re.compile(r'[\d]+')


def preprocess_text(text):
    """Normalise one sentence for tokenisation.

    Steps, in order: strip accents (NFD decomposition, then drop every
    non-ASCII code point), lowercase, delete ASCII punctuation, delete
    non-printable characters, delete digits, and collapse runs of
    whitespace into single spaces.

    Args:
        text: The sentence to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as an empty sentence.

    Returns:
        str: The cleaned sentence, possibly empty.
    """
    # Missing values: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Coerce to str *before* normalising; unicodedata.normalize raises
    # TypeError on anything that is not a str.
    text = str(text)
    # Decompose accented characters, then drop the combining marks (and any
    # other non-ASCII code point) by round-tripping through ASCII.
    text = normalize('NFD', text).encode("ascii", "ignore").decode("UTF-8")
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "don't" -> "dont".
    text = text.translate(_PUNCTUATION_TABLE)
    text = _NON_PRINTABLE_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # Collapse any whitespace run and trim both ends.
    return ' '.join(text.split())


CHECKING OUR DATASET AFTER PRE-PROCESSING THE TEXT

In [None]:
# Clean both language columns on a copy so the raw frame `df` stays untouched.
df_copy = df.copy()
for column in ('English words/sentences', 'French words/sentences'):
    df_copy[column] = df_copy.loc[:, column].apply(preprocess_text)
df_copy.head()

TOKENISING OUR DATA

In [None]:
!pip install tensorflow

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
def tokenize(text):
    """Create a Keras ``Tokenizer`` and fit it on ``text``.

    Args:
        text: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(text)
    return fitted

In [None]:
# Fit an independent tokenizer for each language
eng_tokenizer = tokenize(df_copy['English words/sentences'])
fr_tokenizer = tokenize(df_copy['French words/sentences'])

# Vocabulary sizes: one more than the number of entries in each word index
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
fr_vocab_size = 1 + len(fr_tokenizer.word_index)

# Encode every sentence as a list of word indices
seq_eng = eng_tokenizer.texts_to_sequences(df_copy['English words/sentences'])
seq_fr = fr_tokenizer.texts_to_sequences(df_copy['French words/sentences'])

# Post-pad both languages to the longest sentence found in either of them
max_length = max(len(seq) for seqs in (seq_eng, seq_fr) for seq in seqs)
seq_eng_final, seq_fr_final = (
    pad_sequences(seqs, maxlen=max_length, padding="post")
    for seqs in (seq_eng, seq_fr)
)

print("English Vocab Size:", eng_vocab_size)
print("French Vocab Size:", fr_vocab_size)
print("Maximum length of sequences:", max_length)

TRAIN AND TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the sentence pairs; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    seq_eng_final,
    seq_fr_final,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

MODEL BUILDING - SEQUENTIAL

In [None]:
from keras.models import Model, Sequential
from keras.layers import TimeDistributed, RepeatVector,Bidirectional
from keras.layers import Input,LSTM, Dense, Embedding, Attention
# Size of the learned embedding vector for each English word.
vector_length = 100

# Encoder-decoder stack: embed the English indices, summarise the sentence
# with a bidirectional LSTM, repeat that summary once per output position,
# decode with a second LSTM and predict a French word at every position.
model = Sequential()
# An explicit Input layer fixes the sequence length (replacing the deprecated
# `input_length` argument of Embedding) and builds the model immediately.
model.add(Input(shape=(max_length,)))
# mask_zero=True treats index 0 as padding.
model.add(Embedding(input_dim=eng_vocab_size,output_dim=vector_length,mask_zero=True))
model.add(Bidirectional(LSTM(256)))
model.add(RepeatVector(max_length))
model.add(LSTM(256,return_sequences=True))
model.add(TimeDistributed(Dense(fr_vocab_size,activation="softmax")))
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a trailing "None").
model.summary()

MODEL TRAINING

In [None]:
# Targets are integer word indices, hence the sparse cross-entropy loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
# The held-out split is used as validation data during the single epoch.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=1,
)

PREDICTIONS

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
def translation(input_sentence):
    """Translate a sentence with the trained model.

    The input is cleaned with ``preprocess_text``, encoded with
    ``eng_tokenizer``, post-padded to ``max_length`` and passed to ``model``.
    For every output position the highest-scoring index is looked up in
    ``fr_tokenizer.index_word``.

    Args:
        input_sentence: The sentence to translate; converted with ``str()``.

    Returns:
        str: The predicted words joined by single spaces. Predicted indices
        that have no entry in ``fr_tokenizer.index_word`` (such as the
        padding index 0) are skipped rather than rendered as blanks.
    """
    cleaned = preprocess_text(str(input_sentence))
    input_seq = eng_tokenizer.texts_to_sequences([cleaned])
    input_seq_final = pad_sequences(input_seq,maxlen=max_length,padding="post")
    # The model has a single input, so the array is passed directly.
    prediction = model.predict(input_seq_final)
    # Most probable vocabulary index at each position of the only sample.
    predicted_indices = np.argmax(prediction,axis=-1)[0]

    index_word = fr_tokenizer.index_word
    output_words = [index_word[i] for i in predicted_indices if i in index_word]
    return ' '.join(output_words)
# Try the translator on one sample sentence and show input and output.
input_sentence = "Where are you going?"
translated = translation(input_sentence)
print(f"Input: {input_sentence}", f"Translated: {translated}", sep="\n")