In [None]:
# Install required packages
!pip install requests beautifulsoup4 pandas pillow pytesseract googletrans==4.0.0-rc1 easyocr
!apt install tesseract-ocr tesseract-ocr-hin

# Imports
import requests, io, time
from bs4 import BeautifulSoup
from googletrans import Translator
import pandas as pd
import pytesseract
from PIL import Image
import easyocr

# Initialize translator & OCR reader
# Shared googletrans client; translate_to_languages() calls
# translator.translate() on this instance.
translator = Translator()
# EasyOCR reader created with the Hindi ('hi') language list; used by
# extract_hindi_from_online_image() through reader.readtext().
reader = easyocr.Reader(['hi'])

# Target languages: translation language code -> display name.
# The codes are passed as the translation destination; the names are
# used to label the output columns.
lang_code_to_name = dict(
    en='English',
    kn='Kannada',
    ta='Tamil',
    te='Telugu',
)
# Codes in the same order as the mapping above.
target_languages = [*lang_code_to_name]

# Extract Hindi from website
def extract_hindi_text_from_website(url):
    """Fetch a web page and return the text fragments that contain Hindi.

    Every ``<p>``, ``<div>`` and ``<span>`` element is inspected; an
    element's text is kept when it contains at least one character from
    the Devanagari Unicode block (U+0900-U+097F). Whitespace inside each
    kept fragment is collapsed to single spaces.

    Note: nested elements are visited individually, so the text of a
    ``<p>`` inside a ``<div>`` can appear both on its own and as part of
    the enclosing ``<div>``'s text.

    Args:
        url: Address of the page to download.

    Returns:
        A list of strings in document order. Best-effort: on any failure
        (network error, timeout, HTTP error status, parse error) the
        error is printed and an empty list is returned.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the run.
        response = requests.get(url, timeout=30)
        # Do not scrape 4xx/5xx error pages as if they were content.
        response.raise_for_status()
        # Pass raw bytes so BeautifulSoup detects the charset itself
        # (including <meta charset>). response.text falls back to
        # ISO-8859-1 when the server sends no charset, which turns
        # Devanagari into mojibake that the range check below misses.
        soup = BeautifulSoup(response.content, 'html.parser')
        hindi_lines = []
        for tag in soup.find_all(['p', 'div', 'span']):
            text = tag.get_text(strip=True)
            if any('\u0900' <= c <= '\u097f' for c in text):
                # split() with no arguments already drops leading and
                # trailing whitespace, so no separate strip() is needed.
                hindi_lines.append(' '.join(text.split()))
        return hindi_lines
    except Exception as e:
        print(f"Website error: {e}")
        return []

# Extract Hindi from online image (EasyOCR + Tesseract)
def extract_hindi_from_online_image(img_url):
    """Download an image and OCR it for Hindi text with two engines.

    The image is processed entirely in memory. The original version wrote
    it to a fixed ``/tmp/temp_img.jpg`` that was shared between calls and
    never removed.

    Args:
        img_url: Address of the image to download.

    Returns:
        On success, a two-element list ``[easy_text, tess_text]``:
        ``easy_text`` is the space-joined EasyOCR paragraphs that contain
        at least one Devanagari character (U+0900-U+097F), and
        ``tess_text`` is the stripped Tesseract output for language
        ``'hin'``. Either element may be an empty string.
        Best-effort: on any failure (network error, timeout, HTTP error
        status, undecodable image, OCR error) the error is printed and an
        empty list is returned.
    """
    try:
        # A timeout keeps an unresponsive server from hanging the run.
        response = requests.get(img_url, timeout=30)
        # Do not feed an HTML error page to the OCR engines.
        response.raise_for_status()
        img_data = response.content

        # EasyOCR accepts the encoded image bytes directly.
        paragraphs = reader.readtext(img_data, detail=0, paragraph=True)
        easy_text = ' '.join(
            p.strip() for p in paragraphs
            if any('\u0900' <= c <= '\u097f' for c in p)
        )

        # The context manager releases the decoded image after OCR.
        with Image.open(io.BytesIO(img_data)) as img:
            tess_text = pytesseract.image_to_string(img, lang='hin').strip()
        return [easy_text, tess_text]
    except Exception as e:
        print(f"Image error: {e}")
        return []

# Translate Hindi to multiple languages
def translate_to_languages(text, target_langs):
    """Translate Hindi ``text`` into every language code in ``target_langs``.

    Args:
        text: Source text; it is always submitted with source language 'hi'.
        target_langs: Iterable of destination language codes.

    Returns:
        A dict keyed by language code, in iteration order. Each value is
        the translated text, or the string ``"Error: <exception>"`` when
        translating into that language raised.
    """
    def _translate_or_error(dest_code):
        # One failing language must not abort the others, so the error is
        # captured as the value for that language.
        try:
            return translator.translate(text, src='hi', dest=dest_code).text
        except Exception as err:
            return f"Error: {err}"

    return {code: _translate_or_error(code) for code in target_langs}

# === INPUTS ===
website_url = "https://safetyculture.com/topics/sop/"  # Your Hindi text website URL
# URL of an image containing Hindi text. This name was used below without
# ever being assigned, which raised NameError. Leave it empty to skip OCR.
image_url = ""

# === Collect Hindi Text ===
all_hindi_texts = extract_hindi_text_from_website(website_url)
if image_url:
    for line in extract_hindi_from_online_image(image_url):
        # Keep only OCR results that contain Devanagari (U+0900-U+097F).
        if line and any('\u0900' <= c <= '\u097f' for c in line):
            all_hindi_texts.append(line)

# Remove duplicates & short entries (dict.fromkeys keeps first-seen order)
all_hindi_texts = list(dict.fromkeys(t for t in all_hindi_texts if len(t) > 3))

# === Translation ===
# One row per Hindi string: the source text plus one column per target language.
translation_data = []
start_time = time.time()

for text in all_hindi_texts:
    row = {'Hindi': text}
    translations = translate_to_languages(text, target_languages)
    for lang_code, translated in translations.items():
        row[f'Translation ({lang_code_to_name[lang_code]})'] = translated
    translation_data.append(row)

print(f"✅ Translation done in {time.time() - start_time:.2f} seconds")

# Save & Download CSV
df = pd.DataFrame(translation_data)
# utf-8-sig writes a BOM so spreadsheet tools detect UTF-8 for the Indic text.
df.to_csv("hindi_translations.csv", index=False, encoding='utf-8-sig')
# The browser download helper exists only on Google Colab; elsewhere the
# CSV is already on disk, so skip the download instead of crashing.
try:
    from google.colab import files
except ImportError:
    print("Saved hindi_translations.csv (google.colab not available; skipping download)")
else:
    files.download("hindi_translations.csv")