In [280]:
import requests
import urllib.parse
from bs4 import BeautifulSoup



def tahlilgaran_translate(word, timeout=5):
    """Look up ``word`` in the Tahlilgaran online dictionary.

    Args:
        word: Text to look up; sent as the ``q`` query parameter.
        timeout: Seconds ``requests`` waits for the server before raising.

    Returns:
        The whitespace-stripped text of the first ``<div class="p-fa">``
        element of the response page, or ``''`` when the HTTP status is not
        200 or the page contains no such element.

    Raises:
        requests.RequestException: on connection errors or when ``timeout``
            expires.
    """
    # Percent-encode the word so characters such as '&', '#' or '?' cannot
    # truncate or split the query string.
    query = urllib.parse.quote(word, safe='')
    url = f"http://tahlilgaran.org/TDictionary/WebApp/?q={query}"
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return ''

    soup = BeautifulSoup(response.content, 'html.parser')
    translation_div = soup.find('div', class_='p-fa')
    return translation_div.get_text(strip=True) if translation_div else ''


class Token:
    def __init__(self, tkk):
        self.tkk = tkk

    def calculate_token(self, text):
        if self.tkk == "":
            self.tkk = "440498.1287591069"
        [first_seed, second_seed] = self.tkk.split(".")

        try:
            d = bytearray(text.encode('UTF-8'))
        except UnicodeDecodeError:
            d = bytearray(text)

        a = int(first_seed)
        for value in d:
            a += value
            a = self._work_token(a, "+-a^+6")
        a = self._work_token(a, "+-3^+b+-f")
        a ^= int(second_seed)
        if 0 > a:
            a = (a & 2147483647) + 2147483648
        a %= 1E6
        a = int(a)
        return str(a) + "." + str(a ^ int(first_seed))

    @staticmethod
    def _rshift(val, n):
        return val >> n if val >= 0 else (val + 0x100000000) >> n

    def _work_token(self, a, seed):
        for i in range(0, len(seed) - 2, 3):
            char = seed[i + 2]
            d = ord(char[0]) - 87 if char >= "a" else int(char)
            d = self._rshift(a, d) if seed[i + 1] == "+" else a << d
            a = a + d & 4294967295 if seed[i] == "+" else a ^ d
        return a


class GoogleTranslate:
    """Small client for the ``translate_a/single`` endpoint of translate.googleapis.com.

    The source language is fixed to English (``sl=en`` in the URL built by
    :meth:`get_url`); the target language is chosen per instance.
    """

    def __init__(self, target_language, proxy=None):
        """
        Args:
            target_language: Language code placed in the ``tl`` query parameter.
            proxy: Optional proxy for the Google request. Either a mapping in
                the ``requests`` ``proxies`` format or a single proxy URL
                string, which is then used for both http and https.
        """
        self.http_host = 'translate.googleapis.com'
        self.proxy = proxy
        self.target_language = target_language
        # Not written by any method of this class; kept for existing callers.
        self.result = ''

    def _proxies(self):
        """Return ``self.proxy`` in the mapping form ``requests`` expects, or None."""
        if not self.proxy:
            return None
        if isinstance(self.proxy, dict):
            return self.proxy
        return {'http': self.proxy, 'https': self.proxy}

    def get_url(self, tl, qry, tk):
        """Build the request URL.

        Args:
            tl: Target language code.
            qry: Query text, already URL-encoded by the caller.
            tk: Request token for the query.
        """
        url = f'https://{self.http_host}/translate_a/single?client=gtx&sl=en&tl={tl}&hl=en&dt=at&dt=bd&dt=ex&' \
              f'dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=sos&dt=ss&dt=t&ssel=0&tsel=0&kc=1&tk={tk}&q={qry}'
        return url

    def get_resp(self, url):
        """GET ``url`` and return the decoded JSON body.

        Raises:
            requests.RequestException: on network errors, timeouts, or a
                non-success HTTP status.
            ValueError: when the body is not valid JSON.
        """
        # The context manager closes the session's connection pool; the
        # original left one open session behind per call.
        with requests.Session() as session:
            session.headers.update({'User-Agent': 'Mozilla/5.0'})
            resp = session.get(url, timeout=5, proxies=self._proxies())
            # Fail with the real HTTP error instead of a confusing JSON
            # decode error on an HTML error page.
            resp.raise_for_status()
            return resp.json()

    def translate(self, word, tkk=''):
        """Translate ``word`` and return a printable multi-line summary.

        The result is the formatted Google response followed by a newline and
        the Tahlilgaran dictionary lookup for the same word.

        Args:
            word: English text to translate.
            tkk: Seed for the request token; ``''`` uses Token's default.
        """
        tk = Token(tkk).calculate_token(word)
        parse_query = urllib.parse.quote_plus(word)
        url = self.get_url(self.target_language, parse_query, tk)

        response = self.get_resp(url)

        return format_result(response) + '\n' + tahlilgaran_translate(word)

def format_result(result):
    """Render a decoded ``translate_a/single`` response as plain text.

    Args:
        result: The JSON-decoded response list. ``result[0][0]`` must hold the
            translated text and the source text as its first two items.
            ``result[1]``, when present and not empty, is a list of dictionary
            groups ``[label, terms, entries, base_word, ...]`` where each entry
            is ``[term, reverse_translations, _, score, ...]``.

    Returns:
        A newline-joined string: translated text, source text and ``---``,
        followed for every dictionary group by the base word, the label, the
        comma-joined terms, a blank line, one ``term: reverse, ... score`` line
        per entry, and a 23-character underscore rule.
    """
    first_segment = result[0][0]
    output = [first_segment[0], first_segment[1], "---"]

    # The dictionary slot can be None or absent altogether; treat both as
    # "no dictionary data".
    dictionary = result[1] if len(result) > 1 else None
    for wt in dictionary or []:
        output.append(f"{wt[3]}")
        output.append(f"{wt[0]}")
        output.append(', '.join(wt[1] or []))
        output.append("")

        for w in wt[2] or []:
            line = f"{w[0]}: {', '.join(w[1] or [])}"
            # The score sits at index 3; entries may carry extra trailing
            # fields, so test for "at least four" rather than "exactly four".
            if len(w) > 3 and w[3] is not None:
                line += f" {int(w[3] * 100)}"
            output.append(line)
        output.append("_" * 23)
    return '\n'.join(output)

# Smoke test: translate one sample phrase with target language 'fa'.
# Note: this performs live HTTP requests when the cell is run.
translator = GoogleTranslate('fa')  
result = translator.translate('rounded scripts')


In [289]:
import os
import re
import pandas as pd
from glob import glob
from gtts import gTTS
import time

# --- Configuration -----------------------------------------------------------
# Folder holding one sub-folder per TPO set, each with exactly one input CSV.
base_dir = '/home/ri/Desktop/TOEFL/TPO-NEO/Finished'
# Anki media folder; one sub-folder of MP3 files is created per TPO set.
anki_media_dir = '/home/ri/.local/share/Anki2/fc1/collection.media/'
max_attempts = 5   # translation attempts per word before giving up
retry_delay = 3    # seconds to wait between translation attempts

# GoogleTranslate only stores its settings, so one instance serves every word.
translator = GoogleTranslate('fa')

for TPO in os.listdir(base_dir):
    tpo_dir = os.path.join(base_dir, TPO)
    if not os.path.isdir(tpo_dir):
        continue  # stray files in base_dir are not TPO folders
    tpo = os.path.basename(tpo_dir)
    audio_dir = os.path.join(anki_media_dir, tpo)

    # An existing media folder marks this TPO as already processed; skip it so
    # re-running the cell does not regenerate audio or repeat translations.
    if os.path.exists(audio_dir):
        continue

    # Find the single input CSV. Files named "updated_*.csv" are this cell's own
    # output and must not count as input on a later run.
    csv_files = [
        path for path in glob(os.path.join(tpo_dir, '*.csv'))
        if not os.path.basename(path).startswith('updated_')
    ]
    assert len(csv_files) == 1, f"Expected one CSV file in {tpo_dir}, but found {len(csv_files)}."
    csv_file = csv_files[0]

    # Create the media folder only after the input is validated, so a failed
    # check does not leave behind an empty "already processed" marker.
    os.makedirs(audio_dir)

    df = pd.read_csv(csv_file, header=None)
    words = df.iloc[:, 3].tolist()  # column 3 holds the word / phrase

    words_translate = {}
    words_audio = {}

    df['Translation'] = ""
    df['Audio_File'] = ""

    for idx, word in enumerate(words):
        # Empty cells are read as NaN (a float); skip them instead of crashing.
        if not isinstance(word, str) or not word.strip():
            print(f"Skipping row {idx}: no word in column 3.")
            continue

        sanitized_word = re.sub(r'[^\w\s-]', '', word).strip().replace(' ', '_')
        file_path = os.path.join(audio_dir, f"{sanitized_word}.mp3")

        # Generate audio for the word. 'en' + tld='us' selects the US accent;
        # 'en-us' is a deprecated gTTS language code.
        try:
            gTTS(text=word, lang='en', tld='us', slow=False).save(file_path)
            words_audio[word] = file_path
            # The tag must name the file exactly as saved, extension included.
            df.at[idx, 'Audio_File'] = f'[sound:{tpo}/{sanitized_word}.mp3]'
        except Exception as e:
            print(f"Error generating pronunciation for {word}: {e}")
            df.at[idx, 'Audio_File'] = ''

        # Translate independently of the audio step: a TTS failure must not
        # cost the row its translation.
        for attempt in range(1, max_attempts + 1):
            try:
                translated_text = translator.translate(word)
            except Exception as e:
                print(f"Error translating {word}, attempt {attempt}. Error: {e}")
                if attempt < max_attempts:
                    time.sleep(retry_delay)  # Sleep before retrying
                else:
                    print(f"Failed to translate {word} after {max_attempts} attempts.")
                    df.at[idx, 'Translation'] = ''
            else:
                words_translate[word] = translated_text
                df.at[idx, 'Translation'] = translated_text
                break  # translation succeeded, stop retrying

    # Save the DataFrame with the two new columns next to the input file
    output_csv_file = os.path.join(tpo_dir, f"updated_{os.path.basename(csv_file)}")
    df.to_csv(output_csv_file, index=False, header=False)
    print(f"Saved updated CSV: {output_csv_file}")
    time.sleep(5)
    # Add a `break` here to process only the first unprocessed directory
    # break


TPO40


In [293]:
# Re-save the current `df`; header=False is the documented way to omit the header row.
df.to_csv(output_csv_file, index=False, header=False)

In [292]:
# Display the current contents of `df`.
df

Unnamed: 0,0,1,2,3,4,5,6,7,Translation,Audio_File
0,T28,R,P1,potable water,/ˈpoʊtəbəl ˈwɔːtər/ - noun,Water that is safe to drink,آب قابل شرب,Most of the world's potable water is accounted...,آب آشامیدنی\npotable water\n---\nزیست شناسی:اب...,[sound:TPO28/potable_water.mp3]
1,T28,R,P1,is accounted for by,/ɪz əˈkaʊntɪd fɔr baɪ/ - phrase,Is represented or made up by,توسط تشکیل شده است,Most of the world's potable water is accounted...,به حساب می آید\nis accounted for by\n---\n,[sound:TPO28/is_accounted_for_by.mp3]
2,T28,R,P1,overlying,/ˈoʊvərˌlaɪɪŋ/ - adj,Situated above,بالایی,The pressure of the overlying rock causes pore...,پوشاننده\noverlying\n---\n,[sound:TPO28/overlying.mp3]
3,T28,R,P1,aquifer,/ˈækwɪfər/ - noun,An underground layer of water-bearing permeabl...,آب‌زیرزمینی,Aquifers are vital for groundwater storage and...,آبخوان\naquifer\n---\nسفره اب، ابزا، ابخیز، سف...,[sound:TPO28/aquifer.mp3]
4,T28,R,P1,porosity,/pɔˈrɑsəti/ - noun,The percentage of volume in a material that is...,تخلخل,Porosity determines how much water a rock can ...,"تخلخل\nporosity\n---\nporosity\nnoun\nتخلخل, پ...",[sound:TPO28/porosity.mp3]
...,...,...,...,...,...,...,...,...,...,...
123,T28,L,L4,that was every bit as advanced as,/ðæt wəz ˈɛvri bɪt æz ædˈvænst æz/ - phrase,Was just as advanced as.,به همان اندازه پیشرفته,Remains of an ancient civilization that was ev...,که هر ذره به همان اندازه پیشرفته بود\nthat was...,[sound:TPO28/that_was_every_bit_as_advanced_as...
124,T28,L,L4,shifts its course,/ʃɪfts ɪts kɔrs/ - phrase,Changes its path or direction.,تغییر مسیر دادن,The Murgab river is the kind of river that shi...,مسیر خود را تغییر می دهد\nshifts its course\n-...,[sound:TPO28/shifts_its_course.mp3]
125,T28,L,L4,has taken its toll on,/həz ˈteɪkən ɪts toʊl ɑn/ - phrase,Has had a negative impact on.,تاثیر منفی گذاشته,Being exposed to the sun and wind has taken it...,عوارض خود را گرفته است\nhas taken its toll on\...,[sound:TPO28/has_taken_its_toll_on.mp3]
126,T28,L,L4,be a pity,/bi ə ˈpɪti/ - phrase,Be a cause for regret or disappointment.,حیف بودن,It would be a pity if the site disintegrates w...,حیف باشد\nbe a pity\n---\n,[sound:TPO28/be_a_pity.mp3]


In [291]:
# Point the sound tags at the TPO28 media folder instead of 'pronunciations'.
# The vectorised literal replace leaves missing (NaN) cells untouched, whereas
# calling x.replace on them in a lambda would raise AttributeError.
df['Audio_File'] = df['Audio_File'].str.replace('pronunciations', 'TPO28', regex=False)


In [300]:
from gtts import gTTS
from IPython.display import Audio

# Generate the audio. The earlier `lang='en-us', tld='us'` call was removed:
# its result was overwritten by this one before ever being used.
tts = gTTS(text='water', lang='en', slow=False)

# Save the audio to a file
tts.save("water.mp3")

# Play the audio in the notebook
Audio("water.mp3", autoplay=True)


In [None]:
# tts.   <- unfinished scratch expression; a bare trailing dot is a SyntaxError if run