# Machine Translation From Arabic to English Using Spark NLP

This notebook implements Arabic-to-English machine translation with Spark NLP and PySpark. The source (input) language is Arabic and the target (output) language is English.

## Colab Setup

In [1]:
# Install the pinned PySpark and Spark NLP releases used by this notebook
# (-q keeps pip's output quiet).
! pip install -q pyspark==3.3.0 spark-nlp==4.2.8

# Install or upgrade spark-nlp-display.
# NOTE(review): this package is not version-pinned, so reruns may pull a different release.
! pip install --upgrade -q spark-nlp-display

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m453.8/453.8 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h

## Start the Spark session

Import dependencies and relevant modules

In [2]:
import json
import pandas as pd
import numpy as np

import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [5]:
# Start the Spark session through Spark NLP's helper.
spark_instance = sparknlp.start()

# Report the library versions in use.
print(f"Spark NLP  {sparknlp.version()}")
print(f"Apache Spark : {spark_instance.version}")

# Leave the session as the last expression so the notebook displays it.
spark_instance

Spark NLP  4.2.8
Apache Spark : 3.3.0


## A sample text in Arabic for demo - This sentence will be translated to English

In [6]:
# Arabic sample sentence used to test the translation pipeline.
# Rough English meaning: "Today we will discuss the weather in Dubai."
text = """اليوم سنناقش الطقس في دبي"""

## Define NLP pipeline

In [7]:
# Stage 1: turn the raw "text" column into "document" annotations.
Assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Stage 2: pretrained deep-learning sentence detector; it finds sentence
# boundaries so each sentence can be translated on its own.
DL_model = (
    SentenceDetectorDLModel()
    .pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

# Stage 3: pretrained Marian transformer "opus_mt_ar_en" that produces the
# "translation" column from the detected sentences.
marian_t = (
    MarianTransformer.pretrained("opus_mt_ar_en", "xx")
    .setInputCols(["sentences"])
    .setOutputCol("translation")
)

# Chain the three stages, in order, into a single pipeline.
pipeline_init = Pipeline(stages=[Assembler, DL_model, marian_t])

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
opus_mt_ar_en download started this may take some time.
Approximate size to download 390.7 MB
[OK!]


## Run the pipeline to start translation

In [8]:
# One-row DataFrame holding an empty string in a "text" column; it is only used
# as the input to fit() below.
df = spark_instance.createDataFrame([[""]]).toDF('text')
# Fit the pipeline to obtain the fitted model.
model = pipeline_init.fit(df)
# Wrap the fitted model in a LightPipeline so the plain Python string `text`
# can be annotated directly (the original author notes it also runs more
# efficiently this way).
lmodel = LightPipeline(model)
# Annotate the sample sentence; the translation is read from res[0]['translation'].
res = lmodel.fullAnnotate(text)


Before _validateStagesInputCols


## Results for Terminal

In [9]:
# Echo the source sentence, then print every translated sentence on its own line.
print(f"Original: {text} \n\n")

print("Translated:\n")
for annotation in res[0]['translation']:
    print(annotation.result)

Original: اليوم سنناقش الطقس في دبي 


Translated:

Today we're discussing the weather in Dubai.


# Deploying the Spark Based Model using Flask and ngrok

In [10]:
# Install Flask, which serves the translation web form defined below.
!pip install Flask



In [11]:
# Install pyngrok, used below to open an ngrok tunnel to the Flask server.
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.1.6-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.1.6


In [12]:
# NOTE(review): these commands repeat installs already performed in earlier
# cells (Flask, pyngrok, pyspark, spark-nlp, spark-nlp-display).
!pip install Flask pyngrok
!pip install -q pyspark==3.3.0 spark-nlp==4.2.8
!pip install --upgrade -q spark-nlp-display



In [13]:
import json
import pandas as pd
import numpy as np
import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import DocumentAssembler, SentenceDetectorDLModel, MarianTransformer
from sparknlp.base import LightPipeline
from flask import Flask, request, render_template_string

In [14]:
# Spark session (started through Spark NLP) used by the pipeline builder below.
spark = sparknlp.start()


def get_pipeline():
    """Build the Arabic-to-English translation pipeline.

    Chains a DocumentAssembler, the pretrained "sentence_detector_dl" sentence
    detector and the pretrained "opus_mt_ar_en" Marian translator, fits them on
    a one-row DataFrame containing an empty string, and wraps the fitted model
    in a LightPipeline.

    Returns:
        LightPipeline: the fitted pipeline; translated sentences are produced
        in its "translation" output column.
    """
    assembler = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    sentence_splitter = (
        SentenceDetectorDLModel()
        .pretrained("sentence_detector_dl", "xx")
        .setInputCols(["document"])
        .setOutputCol("sentences")
    )
    translator = (
        MarianTransformer.pretrained("opus_mt_ar_en", "xx")
        .setInputCols(["sentences"])
        .setOutputCol("translation")
    )

    stages = [assembler, sentence_splitter, translator]
    placeholder_df = spark.createDataFrame([[""]]).toDF('text')
    fitted = Pipeline(stages=stages).fit(placeholder_df)
    return LightPipeline(fitted)

# Build the Arabic -> English translation pipeline once, at cell execution time;
# the /translate handler reuses this module-level object for every request.
pipeline = get_pipeline()

# Create the Flask application.
app = Flask(__name__)

# The page is kept inline as a template string (rendered with
# render_template_string) rather than as a separate HTML file, which the
# original author found impractical in Colab. The result section is only
# rendered when `original_text` is passed to the template.
template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Machine Translation</title>
</head>
<body>
    <h1>Machine Translation</h1>
    <form method="POST" action="/translate">
        <label for="text">Text to translate:</label><br>
        <textarea id="text" name="text" rows="4" cols="50"></textarea><br><br>

        <input type="submit" value="Translate">
    </form>
    {% if original_text %}
        <h2>Original Text:</h2>
        <p>{{ original_text }}</p>
        <h2>Translated Text:</h2>
        <p>{{ translated_text }}</p>
    {% endif %}
</body>
</html>
"""

@app.route('/')
def index():
    """Serve the landing page: the translation form without a result section."""
    page = render_template_string(template)
    return page

@app.route('/translate', methods=['POST'])
def translate():
    """Translate the submitted text and render the page with the result.

    Reads the ``text`` form field, runs it through the module-level
    ``pipeline`` and joins the translated sentences with single spaces.

    Returns:
        str: the rendered HTML page. When the field is missing or contains only
        whitespace, the bare form is returned and the model is not invoked.
    """
    # .get() avoids an automatic 400 error page when the field is absent.
    text = request.form.get('text', '')
    if not text.strip():
        # Nothing to translate: show the form again instead of calling the model.
        return render_template_string(template)

    res = pipeline.fullAnnotate(text)
    # Guard against an empty annotation list before indexing the first document.
    annotations = res[0]['translation'] if res else []
    translated_text = " ".join(sentence.result for sentence in annotations)
    return render_template_string(template, original_text=text, translated_text=translated_text)

# Run Flask app
import os
from getpass import getpass

from pyngrok import ngrok

# Port shared by the Flask server and the ngrok tunnel.
FLASK_PORT = 5000

# Set ngrok authtoken.
# SECURITY: the authtoken is a credential and must never be stored in the
# notebook. It is read from the NGROK_AUTH_TOKEN environment variable, falling
# back to a hidden interactive prompt. Any token that was ever committed in
# this notebook should be revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open the public tunnel first so the URL is printed before app.run() blocks.
public_url = ngrok.connect(FLASK_PORT)
print(f'Public URL: {public_url}')
try:
    # Blocks until the server is stopped (e.g. the cell is interrupted).
    app.run(port=FLASK_PORT)
finally:
    # Close the tunnel so re-running the cell does not leave stale tunnels open.
    ngrok.disconnect(public_url.public_url)


sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
opus_mt_ar_en download started this may take some time.
Approximate size to download 390.7 MB
[OK!]




Public URL: NgrokTunnel: "https://2d72-34-80-14-48.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


In [18]:
# Second version of the web app: same pipeline and routes as the previous
# Flask cell, served with a restyled HTML page.
# Spark session (started through Spark NLP) used by the pipeline builder below.
spark = sparknlp.start()


def get_pipeline():
    """Build the Arabic-to-English translation pipeline.

    Combines a DocumentAssembler, the pretrained "sentence_detector_dl"
    sentence detector and the pretrained "opus_mt_ar_en" Marian translator,
    fits them on a one-row DataFrame containing an empty string, and wraps the
    fitted model in a LightPipeline.

    Returns:
        LightPipeline: the fitted pipeline; translated sentences are produced
        in its "translation" output column.
    """
    doc_stage = (
        DocumentAssembler()
        .setInputCol("text")
        .setOutputCol("document")
    )
    sentence_stage = (
        SentenceDetectorDLModel()
        .pretrained("sentence_detector_dl", "xx")
        .setInputCols(["document"])
        .setOutputCol("sentences")
    )
    translation_stage = (
        MarianTransformer.pretrained("opus_mt_ar_en", "xx")
        .setInputCols(["sentences"])
        .setOutputCol("translation")
    )

    stage_list = [doc_stage, sentence_stage, translation_stage]
    seed_df = spark.createDataFrame([[""]]).toDF('text')
    fitted_model = Pipeline(stages=stage_list).fit(seed_df)
    return LightPipeline(fitted_model)

# Build the Arabic -> English translation pipeline once, at cell execution time;
# the /translate handler reuses this module-level object for every request.
pipeline = get_pipeline()

# Create the Flask application.
app = Flask(__name__)

# The page is kept inline as a template string (rendered with
# render_template_string) rather than as a separate HTML file, which the
# original author found impractical in Colab. This version adds an embedded
# <style> sheet; the result section is only rendered when `original_text` is
# passed to the template.
template = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Machine Translation</title>
    <style>
        body {
            background-color: #f3e5f5;
            color: #4a148c;
            font-family: Arial, sans-serif;
            text-align: center;
            margin: 0;
            padding: 0;
        }

        h1 {
            font-size: 2.5em;
            margin: 20px 0;
            color: #4a148c;
        }

        p {
            font-size: 1.2em;
            margin: 10px 0;
            color: #6a1b9a;
            text-align: center;
        }

        form {
            display: inline-block;
            margin-top: 20px;
        }

        label {
            font-size: 1.2em;
        }

        textarea {
            width: 50%;
            height: 100px;
            border: 2px solid #4a148c;
            border-radius: 10px;
            padding: 10px;
            font-size: 1em;
            margin-top: 10px;
            margin-bottom: 20px;
        }

        input[type="submit"] {
            background-color: #7b1fa2;
            color: white;
            padding: 10px 20px;
            border: none;
            border-radius: 20px;
            font-size: 1.2em;
            cursor: pointer;
        }

        input[type="submit"]:hover {
            background-color: #6a1b9a;
        }

        h2 {
            font-size: 1.8em;
            margin: 20px 0 10px;
            color: #4a148c;
        }

        textarea, p, input[type="submit"], h2 {
            text-align: left;
            display: block;
            margin-left: auto;
            margin-right: auto;
        }
    </style>
</head>
<body>
    <h1>Machine Translation</h1>
    <p>Enter the text you want to translate from Arabic to English in the box below and press the translate button.</p>
    <form method="POST" action="/translate">
        <label for="text">Text to translate:</label><br>
        <textarea id="text" name="text" rows="4" cols="50"></textarea><br><br>
        <input type="submit" value="Translate">
    </form>
    {% if original_text %}
        <h2>Original Text:</h2>
        <p>{{ original_text }}</p>
        <h2>Translated Text:</h2>
        <p>{{ translated_text }}</p>
    {% endif %}
</body>
</html>




"""

@app.route('/')
def index():
    """Serve the landing page: the translation form without a result section."""
    rendered_page = render_template_string(template)
    return rendered_page

@app.route('/translate', methods=['POST'])
def translate():
    """Translate the submitted text and render the page with the result.

    Reads the ``text`` form field, runs it through the module-level
    ``pipeline`` and joins the translated sentences with single spaces.

    Returns:
        str: the rendered HTML page. When the field is missing or contains only
        whitespace, the bare form is returned and the model is not invoked.
    """
    # .get() avoids an automatic 400 error page when the field is absent.
    text = request.form.get('text', '')
    if not text.strip():
        # Nothing to translate: show the form again instead of calling the model.
        return render_template_string(template)

    res = pipeline.fullAnnotate(text)
    # Guard against an empty annotation list before indexing the first document.
    annotations = res[0]['translation'] if res else []
    translated_text = " ".join(sentence.result for sentence in annotations)
    return render_template_string(template, original_text=text, translated_text=translated_text)

# Run Flask app
import os
from getpass import getpass

from pyngrok import ngrok

# Port shared by the Flask server and the ngrok tunnel.
FLASK_PORT = 5000

# Set ngrok authtoken.
# SECURITY: the authtoken is a credential and must never be stored in the
# notebook. It is read from the NGROK_AUTH_TOKEN environment variable, falling
# back to a hidden interactive prompt. Any token that was ever committed in
# this notebook should be revoked in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open the public tunnel first so the URL is printed before app.run() blocks.
public_url = ngrok.connect(FLASK_PORT)
print(f'Public URL: {public_url}')
try:
    # Blocks until the server is stopped (e.g. the cell is interrupted).
    app.run(port=FLASK_PORT)
finally:
    # Close the tunnel so re-running the cell does not leave stale tunnels open.
    ngrok.disconnect(public_url.public_url)


sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
opus_mt_ar_en download started this may take some time.
Approximate size to download 390.7 MB
[OK!]




Public URL: NgrokTunnel: "https://e15d-34-80-14-48.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [11/Jun/2024 17:47:55] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/Jun/2024 17:47:56] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


Before _validateStagesInputCols


INFO:werkzeug:127.0.0.1 - - [11/Jun/2024 17:48:25] "POST /translate HTTP/1.1" 200 -


# BLEU Scores

In [13]:
# Install NLTK, used below for tokenization and BLEU scoring.
!pip install nltk



In [15]:
import nltk

In [16]:
# Download NLTK's "punkt" tokenizer data (needed by word_tokenize in the BLEU cell below).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [17]:
import json
import pandas as pd
import numpy as np
import sparknlp
import pyspark.sql.functions as F

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import DocumentAssembler, SentenceDetectorDLModel, MarianTransformer
from sparknlp.base import LightPipeline
from flask import Flask, request, render_template_string

In [19]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
import string

# Download the "punkt" tokenizer data that word_tokenize (used in
# normalize_and_tokenize below) relies on.
nltk.download('punkt')

def normalize_and_tokenize(text):
    """Lowercase ``text``, strip ASCII punctuation and split it into word tokens.

    Args:
        text: the sentence to normalize.

    Returns:
        The list of tokens produced by NLTK's ``word_tokenize`` on the
        lowercased, punctuation-free text.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    return word_tokenize(cleaned)

# Test on the same Arabic sample sentence as in the first section.
text = "اليوم سنناقش الطقس في دبي"

# Make sure a Spark session is bound to `spark` in this cell. Elsewhere in the
# notebook that name is only assigned inside the Flask deployment cells, so
# relying on it would raise NameError whenever those cells are skipped.
spark = sparknlp.start()

# Same three stages as in the first section: document assembler, pretrained
# sentence detector and pretrained Marian Arabic -> English translator.
Assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

DL_model = (
    SentenceDetectorDLModel()
    .pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentences")
)

marian_t = (
    MarianTransformer.pretrained("opus_mt_ar_en", "xx")
    .setInputCols(["sentences"])
    .setOutputCol("translation")
)

# Fit on a one-row placeholder DataFrame and wrap the fitted model in a
# LightPipeline so plain Python strings can be annotated directly.
nlp_pipeline = Pipeline(stages=[Assembler, DL_model, marian_t])
empty_df = spark.createDataFrame([[""]]).toDF('text')
pipeline_model = nlp_pipeline.fit(empty_df)
lmodel = LightPipeline(pipeline_model)

# Translate the text and join the translated sentences with single spaces.
res = lmodel.fullAnnotate(text)
translated_text = " ".join([sentence.result for sentence in res[0]['translation']])
print('Original:', text, '\n\n')
print('Translated:', translated_text, '\n')

# Reference translation(s) for the sample sentence (for demonstration purposes).
# Add more strings to this list to score against several references.
reference_translation = ["Today we will discuss the weather in Dubai"]

# Normalize and tokenize every reference and the model output the same way.
references_tokenized = [normalize_and_tokenize(ref) for ref in reference_translation]
# Kept for backward compatibility: the token list of the first reference.
reference_tokenized = references_tokenized[0]
translated_tokenized = normalize_and_tokenize(translated_text)

# Calculate BLEU scores against ALL references (previously only the first one
# was used). Both calls use NLTK's default weights, i.e. cumulative 1- to
# 4-gram BLEU; they differ only in being sentence-level vs corpus-level, so
# with a single hypothesis the two values coincide.
individual_bleu_score = sentence_bleu(references_tokenized, translated_tokenized)
cumulative_bleu_score = corpus_bleu([references_tokenized], [translated_tokenized])

# Print BLEU scores with labels that describe what was actually computed.
print(f"Sentence-level BLEU score: {individual_bleu_score}")
print(f"Corpus-level BLEU score: {cumulative_bleu_score}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
opus_mt_ar_en download started this may take some time.
Approximate size to download 390.7 MB
[OK!]
Before _validateStagesInputCols
Original: اليوم سنناقش الطقس في دبي 


Translated: Today we're discussing the weather in Dubai. 

Individual BLEU score: 0.3768499164492419
Cumulative BLEU score: 0.3768499164492419


BLEU scores range from 0 to 1, and values closer to 1 are better. Our score is about 0.38 (≈ 0.4).