<a href="https://colab.research.google.com/github/chriss006/2024_Advanced_DeepLearning/blob/main/RNN_based_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RNN-based Seq2Seq for French–English (FRA-ENG) Translation**

In [2]:
import os
import re
import shutil
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf
import unicodedata
import urllib3
import requests
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [3]:
# Request headers passed to requests.get() in download_zip. The User-Agent
# mimics a desktop Chrome browser.
# NOTE(review): presumably needed because the host rejects the default
# client User-Agent -- confirm before removing.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

def download_zip(url, output_path):
  """Download the file at `url` and save it to `output_path`.

  The request is sent with the module-level `headers` and streamed to disk
  in 1 KiB chunks, so the whole archive is never held in memory. A message
  is printed on success and on a non-200 response; nothing is returned.

  Args:
    url: Address of the file to fetch.
    output_path: Path of the local file to write. It is only created when
      the server answers with HTTP 200.

  Raises:
    requests.exceptions.RequestException: on connection errors, or when the
      server does not respond within the timeout.
  """
  # The timeout keeps a stalled server from hanging the notebook forever.
  # The `with` block releases the streamed connection even when the status
  # is not 200 (body never read) or writing the file fails.
  with requests.get(url, headers=headers, stream=True, timeout=30) as response:
    if response.status_code == 200:
      with open(output_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
          file.write(chunk)
      print(f'Zip file download to {output_path}')
    else:
      print(f"Failed to download the file. Status code: {response.status_code}")

# Fetch the fra-eng archive and save it under a relative name, i.e. in the
# current working directory.
url = 'http://www.manythings.org/anki/fra-eng.zip'
output_path = 'fra-eng.zip'
download_zip(url, output_path)

# Build the archive's absolute path from the working directory.
path = os.getcwd()
zipfilename = os.path.join(path, output_path)

# Unpack every member of the archive into the working directory.
# download_zip only prints a message on a non-200 response, so if the
# download failed and no archive exists, this is the statement that raises.
with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
  zip_ref.extractall(path)

Zip file download to fra-eng.zip


In [7]:
# NOTE(review): not used in this part of the notebook; presumably the number
# of sentence pairs to keep from the dataset -- confirm against later cells.
num_samples = 33000

def to_ascii(s):
  """Strip accents (combining marks) from `s`.

  The text is decomposed with NFD, which splits an accented letter into its
  base letter followed by combining marks; every character whose Unicode
  category is 'Mn' is then dropped. All other characters are kept as they
  are, so the result is not guaranteed to be pure ASCII.

  Example: 'déjà diné' -> 'deja dine'
  """
  decomposed = unicodedata.normalize('NFD', s)
  kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
  return ''.join(kept)

def preprocess_sentence(sent):
  """Normalize a sentence for tokenization.

  Steps, in order:
    1. Lowercase the text and strip accents with `to_ascii`.
    2. Insert a space before each of ? . ! , ¿ so punctuation is separated
       from the preceding word ("i am a student." -> "i am a student .").
    3. Replace every run of characters other than a-z, A-Z, "!", "." and
       "?" with a single space. Commas and "¿" are therefore removed here,
       along with digits, hyphens and apostrophes.
    4. Collapse consecutive whitespace into one space.

  Leading or trailing whitespace is not stripped.

  Example: 'Avez-vous déjà diné?' -> 'avez vous deja dine ?'
  """
  cleaned = to_ascii(sent.lower())

  # (pattern, replacement) pairs; the order matters and matches the steps
  # listed in the docstring.
  substitutions = (
      (r"([?.!,¿])", r" \1"),
      (r"[^a-zA-Z!.?]+", r" "),
      (r"\s+", " "),
  )
  for pattern, replacement in substitutions:
    cleaned = re.sub(pattern, replacement, cleaned)
  return cleaned

# Quick check of preprocess_sentence on one English and one French sentence:
# each sentence is printed before and after preprocessing.
en_sent = u'Have you had dinner?'
fr_sent = u'Avez-vous déjà diné?'

print('전처리 전 영어 문장: ', en_sent)
print('전처리 후 영어 문장: ', preprocess_sentence(en_sent))
print('전처리 전 프랑스어 문장: ', fr_sent)
print('전처리 후 프랑스어 문장: ', preprocess_sentence(fr_sent))

전처리 전 영어 문장:  Have you had dinner?
전처리 후 영어 문장:  have you had dinner ?
전처리 전 프랑스어 문장:  Avez-vous déjà diné?
전처리 후 프랑스어 문장:  avez vous deja dine ?
