In [12]:
from googletrans import Translator
from bs4 import BeautifulSoup
import time
import os
import json

# Paths used by this notebook.
_TRANSLATED_TEXT = "./translated.json"  # JSON file read/written by load_/save_translated_text
_PROGRESS_FILE = "./progress.json"  # JSON file read/written by load_/save_progress
_WEBSITE_PATH = "./classcentral"  # directory scanned for .html files to translate
_OUTPUT_PATH = "./translated-classcentral"  # directory that receives the translated copies

# Module-level state shared by the functions below.
translated = {}  # source string -> translated string
progress = {}  # input file path -> True once that file has been written

def reset():
    """Delete both on-disk state files: the progress file and the translation cache.

    A file that is already absent is skipped, so a missing progress file no
    longer prevents the translation cache from being removed. Only the files
    are deleted; the in-memory ``translated`` and ``progress`` dicts are not
    touched.
    """
    for state_file in (_PROGRESS_FILE, _TRANSLATED_TEXT):
        try:
            os.remove(state_file)
        except FileNotFoundError:
            # Nothing to delete for this file; carry on with the next one.
            pass
    
def reset_progress():
    """Delete the on-disk progress file, keeping the translation cache.

    Does nothing when the progress file does not exist. The in-memory
    ``progress`` dict is not modified.
    """
    try:
        os.remove(_PROGRESS_FILE)
    except FileNotFoundError:
        # No progress has been recorded yet, so there is nothing to reset.
        pass


def load_translated_text():
    """Load the translation cache stored at ``_TRANSLATED_TEXT``.

    Returns:
        The decoded JSON content of the file, or an empty dict (after
        printing a notice) when the file does not exist.
    """
    if not os.path.exists(_TRANSLATED_TEXT):
        print(f"{_TRANSLATED_TEXT} does not exist")
        return {}

    with open(_TRANSLATED_TEXT, 'r') as cache_fp:
        return json.load(cache_fp)

def save_translated_text():
    """Write the global ``translated`` dict to ``_TRANSLATED_TEXT`` as JSON, replacing the file."""
    with open(_TRANSLATED_TEXT, 'w') as cache_fp:
        json.dump(translated, fp=cache_fp)

def load_progress():
    """Load the per-file progress record stored at ``_PROGRESS_FILE``.

    Returns:
        The decoded JSON content of the file, or an empty dict (after
        printing a notice) when the file does not exist.
    """
    if not os.path.exists(_PROGRESS_FILE):
        print(f"{_PROGRESS_FILE} does not exist")
        return {}

    with open(_PROGRESS_FILE, 'r') as progress_fp:
        return json.load(progress_fp)
    
def save_progress():
    """Write the global ``progress`` dict to ``_PROGRESS_FILE`` as JSON, replacing the file."""
    with open(_PROGRESS_FILE, 'w') as progress_fp:
        json.dump(progress, fp=progress_fp)
    

In [13]:
def translate_to_hindi(text):
    """Translate ``text`` to Hindi with googletrans and return the translated string.

    A new ``Translator`` is created for every call, and the call sleeps for
    0.1 s before issuing the request.
    """
    client = Translator()
    # Small pause before each request to avoid being rate-limited by the translate API.
    time.sleep(0.1)
    return client.translate(text, dest='hi').text

def translate_text(soup):
    """Replace text nodes of ``soup`` with their Hindi translation, in place.

    Only non-blank text nodes whose direct parent is one of the listed tags
    are handled. Translations are looked up in, and added to, the global
    ``translated`` dict so each distinct string is sent to the translator at
    most once.

    Returns:
        The same ``soup`` object that was passed in.
    """
    translatable_parents = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a', 'li', 'span')

    for node in soup.find_all(text=True):
        if node.parent.name not in translatable_parents:
            continue

        source_text = node.string
        if not source_text.strip():
            # Whitespace-only node: nothing to translate.
            continue

        if source_text not in translated:
            translated[source_text] = translate_to_hindi(source_text)
        source_text.replace_with(translated[source_text])

    return soup
    
def translate_placeholder(soup):
    """Translate the ``placeholder`` attribute of elements in ``soup``, in place.

    Every element that carries a ``placeholder`` attribute is visited. The
    attribute is replaced when it is non-blank and the element's *parent* is
    one of the listed tags. Translations are looked up in, and added to, the
    global ``translated`` dict.

    The ``return`` used to sit inside the loop, so only the first element was
    processed and ``None`` was returned for pages without any placeholder.
    The whole result set is now processed and ``soup`` is always returned.

    NOTE(review): the filter tests the parent's tag name, not the element's
    own; confirm that this is the intended condition.

    Returns:
        The same ``soup`` object that was passed in.
    """
    allowed_parents = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a', 'li', 'span', 'input', 'form')

    for tag in soup.find_all(placeholder=True):
        if tag.parent.name not in allowed_parents:
            continue

        placeholder = tag.attrs['placeholder']
        if not placeholder.strip():
            # Blank placeholder: nothing to translate.
            continue

        if placeholder not in translated:
            translated[placeholder] = translate_to_hindi(placeholder)
        tag.attrs['placeholder'] = translated[placeholder]

    return soup

def translate_html_data(soup):
    """Run both translation passes over ``soup``: text nodes first, then placeholder attributes.

    Returns:
        Whatever ``translate_placeholder`` returns for the text-translated soup.
    """
    return translate_placeholder(translate_text(soup))

def translate_html_file(html_file, output_file):
    """Translate one HTML file and write the result to ``output_file``.

    The input is parsed with BeautifulSoup's ``html.parser``, passed through
    ``translate_html_data`` and written back prettified. Afterwards the file
    is recorded in the global ``progress`` dict and both the progress record
    and the translation cache are saved to disk.

    Both files are opened explicitly as UTF-8. The translated pages contain
    Devanagari text, which the platform default encoding cannot always
    represent.

    Args:
        html_file: Path of the HTML file to translate; also used as the key
            in ``progress``.
        output_file: Path the translated HTML is written to (overwritten).
    """
    with open(html_file, encoding="utf-8") as fp:
        soup = BeautifulSoup(fp, 'html.parser')
    soup = translate_html_data(soup)

    # Save the translated file
    with open(output_file, 'w', encoding="utf-8") as fp:
        # If the translation step returned nothing, the output file is still
        # created (empty) and the input is still marked as processed below.
        if soup:
            fp.write(soup.prettify())

    progress[html_file] = True
    save_progress()
    save_translated_text()


In [14]:
def list_html_files(directory):
    """Recursively collect the paths of all ``.html`` files below ``directory``.

    Returns:
        A list of paths, each built by joining the walked directory with the
        file name, in ``os.walk`` order. Empty when nothing matches.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith('.html')
    ]

def list_files_to_translate(html_files, progress):
    """Return the entries of ``html_files`` that are not keys of ``progress``.

    The order of ``html_files`` is preserved.
    """
    return [path for path in html_files if path not in progress]

def get_output_file_path(input_file_path, output_directory, input_directory):
    """Map a file under ``input_directory`` to the mirrored path under ``output_directory``.

    The file's path relative to ``input_directory`` is re-rooted at
    ``output_directory``, and the parent directory of the result is created
    if needed.

    The mapping used to be an unanchored ``str.replace``, which rewrote every
    occurrence of the input directory text in the path. It also returned the
    input path itself for files outside ``input_directory``.

    Args:
        input_file_path: Path of the source file.
        output_directory: Root of the output tree.
        input_directory: Root of the input tree containing ``input_file_path``.

    Returns:
        The absolute output path for the file.

    Raises:
        ValueError: If ``input_file_path`` is not located inside
            ``input_directory``.
    """
    input_abs_path = os.path.abspath(input_file_path)
    input_dir_path = os.path.abspath(input_directory)
    output_dir_path = os.path.abspath(output_directory)

    relative_path = os.path.relpath(input_abs_path, input_dir_path)
    if relative_path == os.pardir or relative_path.startswith(os.pardir + os.sep):
        # Refuse to map files from outside the input tree; the old code would
        # have handed back the input path itself as the "output" location.
        raise ValueError(
            f"{input_file_path!r} is not inside input directory {input_directory!r}"
        )

    output_path = os.path.normpath(os.path.join(output_dir_path, relative_path))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    return output_path
    
    
def translate_files(html_files):
    """Translate each file in ``html_files`` into the mirrored location under ``_OUTPUT_PATH``.

    Prints one progress line per file: the path when work starts and
    ``-> Done`` once the translated file has been written.
    """
    for source_path in html_files:
        print(f"Translating {source_path}...", end= " ")
        target_path = get_output_file_path(source_path, _OUTPUT_PATH, _WEBSITE_PATH)
        translate_html_file(source_path, target_path)
        print("-> Done")

In [15]:
# reset_progress()

In [16]:
# Restore the state persisted by earlier runs so finished work is not repeated.
translated = load_translated_text()
progress = load_progress()

placeholders = []

# All .html files under the site directory that are not yet recorded in `progress`.
html_files = list_files_to_translate(list_html_files(_WEBSITE_PATH), progress)

print(f"{len(html_files)} HTML file(s) needs to be translated")
translate_files(html_files)


368 HTML file(s) needs to be translated
Translating ./classcentral/classcentral_clone/www.classcentral.com/institutions.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/starting-this-month4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/providers.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subjects4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/collections.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/starting-this-month.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/signup.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/universities.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/move-item/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/collection/sustain

Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/personal-development.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/biology.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/machine-learning.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/entrepreneurship.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/red-team.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/combinatorics.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/network-security4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/esl.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/mechanical-engineering.html... -> Done
Translating ./classcentral/classcentral_clone/w

Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/algorithms-and-data-structures4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/risk-management.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/reading.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/blockchain4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/reverse-engineering.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/data-mining.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/self-improvement.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/project-management.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/subject/blockchain.html... -> Done
Translating ./classcentral/c

Translating ./classcentral/classcentral_clone/www.classcentral.com/university/mit.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/mit4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/columbia4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/iit-kharagpur.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/edinburgh.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/stanford.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/iitm4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/penn.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/university/duke4658.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/univer

Translating ./classcentral/classcentral_clone/www.classcentral.com/report/online-learning-deals/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/class-central-ddos-attack/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/most-popular-courses-2022/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/best-digital-art-courses/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/free-google-certifications/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/edx-top-courses/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/most-popular-march-2023/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/india-online-degrees/index.html... -> Done
Translating ./classcentral/classcentral_clone



-> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/mooc-based-masters-degree/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/most-cited-mooc-research/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/100-most-popular-online-courses-2021/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/futurelearn-expands-paywall/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/udemy-by-the-numbers/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/cs50-free-certificate/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/best-free-online-courses-2022/index.html... -> Done
Translating ./classcentral/classcentral_clone/www.classcentral.com/report/best-ocaml-courses/index.html... -> Done
Translating ./cl

In [29]:
import shutil
def copy_file(src_path, dst_path):
    """Copy the file at ``src_path`` to ``dst_path`` using ``shutil.copy2``."""
    shutil.copy2(src=src_path, dst=dst_path)

In [30]:
def migrate_translated_files(source_dir, destination_dir):
    """Copy every ``.html`` file under ``source_dir`` to the same relative path under ``destination_dir``.

    Existing destination files are overwritten by the copy. Missing
    destination sub-directories are created first.

    The destination used to be computed with an unanchored ``str.replace``,
    which rewrote every occurrence of the source directory text in the path.
    It is now derived from the file's path relative to ``source_dir``.

    Args:
        source_dir: Root of the tree to copy from.
        destination_dir: Root of the tree to copy into.
    """
    source_dir = os.path.abspath(source_dir)
    destination_dir = os.path.abspath(destination_dir)

    for file in list_html_files(source_dir):
        source_path = os.path.abspath(file)
        relative_path = os.path.relpath(source_path, source_dir)
        dest_path = os.path.join(destination_dir, relative_path)
        # The copy itself does not create directories, so make sure the target one exists.
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        copy_file(source_path, dest_path)

In [31]:
# Copy the translated pages from the output tree back over the files in the website tree.
migrate_translated_files(source_dir=_OUTPUT_PATH, destination_dir=_WEBSITE_PATH)