# Translate tables.json

In [15]:
import json

# Define a function to load JSON data from a file
def loadJson(filename):
    """Parse the UTF-8 encoded JSON file at ``filename`` and return the decoded object."""
    with open(filename, 'r', encoding='utf-8') as jsonFile:
        return json.load(jsonFile)

In [16]:
def writeJsonToFile(data, filePath, indent=4):
    """Write ``data`` to ``filePath`` as UTF-8 JSON and print a confirmation.

    Non-ASCII characters (e.g. Greek) are stored verbatim rather than as
    ``\\uXXXX`` escapes; ``indent`` is forwarded to ``json.dump``.
    """
    with open(filePath, "w", encoding="utf-8") as outputHandle:
        json.dump(data, outputHandle, ensure_ascii=False, indent=indent)

    print("JSON data written to", filePath)

In [17]:
import pandas as pd

def fetchSpecificColumnForDbId(df, specificDbId, columnName):
    """Return the values of ``columnName`` for every row whose 'db_id' equals ``specificDbId``.

    The values are returned as a plain list in the DataFrame's row order; the
    list is empty when no row matches.
    """
    belongsToDb = df['db_id'] == specificDbId
    return df.loc[belongsToDb, columnName].tolist()

In [18]:
def createTableDict(database):
    """Map each table's positional index to its name.

    The names are read from ``database['table_names_translated']``; the index
    is the position of the name in that list.
    """
    return dict(enumerate(database['table_names_translated']))

In [19]:
import re

def removeTextInParentheses(inputString):
    """Return ``inputString`` with every ``(...)`` group, parentheses included, deleted.

    Surrounding whitespace is left untouched, so removing a group in the
    middle of a sentence leaves the spaces on both sides in place.
    """
    return re.sub(r'\([^)]*\)', '', inputString)

In [20]:
def capitalizeFirstLetter(word):
    """Upper-case only the first character of ``word``; falsy input is returned unchanged."""
    if not word:
        return word
    return word[0].upper() + word[1:]
    
def removeUnnecessarySpaces(inputString):
    """Trim ``inputString`` and collapse every run of whitespace into a single space."""
    return " ".join(inputString.strip().split())

def isAllUppercase(s):
    """Return ``s.isupper()``: True when ``s`` has at least one cased character and none is lowercase."""
    return s.isupper()

def makeLower(s):
    """Return ``s`` converted to lowercase via ``s.lower()``."""
    return s.lower()

def splitAndJoin(inputString):
    """Normalise a translated identifier into lowercase, space-separated words.

    Steps:
      * ``'*'`` (the "all columns" marker) is returned unchanged.
      * An all-uppercase value is lower-cased first so it is not split
        letter by letter.
      * The first character is capitalised, underscores become spaces, and a
        space is inserted in front of every Greek capital letter so that
        camel-cased Greek words are separated.
      * The result is lower-cased and redundant whitespace is removed.

    Args:
        inputString: The raw identifier. ``None`` / NaN (what pandas yields
            for an empty Excel cell) are treated as an empty string; any other
            non-string value is converted with ``str()``.

    Returns:
        The normalised string; ``''`` when there is nothing to normalise, so
        callers can detect and report missing translations.
    """
    # Empty spreadsheet cells arrive as None or float NaN (NaN != NaN).
    isMissing = inputString is None or (
        isinstance(inputString, float) and inputString != inputString
    )
    if isMissing:
        return ''
    if not isinstance(inputString, str):
        inputString = str(inputString)

    # Exact comparison: `x in ('*')` is a substring test against the string '*'.
    if inputString == '*':
        return inputString

    if inputString.isupper():
        inputString = inputString.lower()

    # Capitalise the first character so a leading Greek word is delimited like
    # the camel-cased words that follow it.
    if inputString:
        inputString = inputString[0].upper() + inputString[1:]

    spacedString = inputString.replace('_', ' ')

    # Insert a space before each Greek capital instead of extracting
    # capital-led chunks: extraction silently discarded any text that preceded
    # the first Greek capital (e.g. a Latin "ID " prefix or a leading number).
    spacedString = re.sub('([Α-ΩΆ-Ώ])', r' \1', spacedString)

    return ' '.join(spacedString.lower().split())

In [21]:
from ollama import Client

def meltemiTranslateInfo(prompt, modelName, host='http://10.8.11.209:11434'):
    """Send ``prompt`` as a single user message to an Ollama-served model.

    Args:
        prompt: Full text of the user message.
        modelName: Name of the model as registered on the Ollama server.
        host: Base URL of the Ollama server. Defaults to the address that was
            previously hard-coded, so existing two-argument calls behave the
            same.

    Returns:
        The text content of the model's reply.
    """
    client = Client(host=host)
    response = client.chat(
        model=modelName,
        messages=[
            {
                'role': 'user',
                'content': prompt
            },
        ],
    )
    return response['message']['content']

In [22]:
import openai

def gptTranslateInfo(systemContent, inputInfo):
    """Ask the Azure OpenAI chat deployment for a translation.

    Example arguments:
        systemContent = 'I will give you english column names from tables in
            english and you will translate it in greek. Please return only the
            translated column.'
        inputInfo = 'Translate the column "assets in million" to greek to fit
            the context of the table:
            Table: company
            Columns: company id, name, headquarters, industry, sales in
            million, assets in million'

    Args:
        systemContent: Text of the system message (the standing instruction).
        inputInfo: Text of the user message (the item to translate).

    Returns:
        The reply text, or ``""`` when the response carries no message content.

    Raises:
        RuntimeError: If the ``AZURE_OPENAI_API_KEY`` environment variable is
            not set.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # SECURITY: the API key must never live in the notebook. The key that used
    # to be embedded here should be treated as leaked and rotated.
    apiKey = os.environ.get('AZURE_OPENAI_API_KEY')
    if not apiKey:
        raise RuntimeError(
            'Set the AZURE_OPENAI_API_KEY environment variable before calling gptTranslateInfo.'
        )

    openai.api_type = 'azure'
    openai.api_base = 'https://pta-nbg-poc1.openai.azure.com/'
    openai.api_version = '2023-03-15-preview'  # this may change in the future
    openai.api_key = apiKey
    deployment_name = 'got-35-turbo1'

    try:
        response = openai.ChatCompletion.create(
            engine=deployment_name,  # The deployment name chosen when the GPT-3.5-Turbo or GPT-4 model was deployed.
            temperature=0.3,
            messages=[
                {"role": "system", "content": systemContent},
                {"role": "user", "content": inputInfo}
            ]
        )
        return response['choices'][0]['message']['content']
    except (KeyError, IndexError):
        # A reply without 'content' or with an empty 'choices' list is
        # reported as an empty translation.
        return ""

In [9]:
# Ollama model names queried in addition to GPT by the translation loops below.
# Empty list: only GPT is used. Previously tried: ['meltemiUpdated', 'llama3']
ollamaModels = []#['meltemiUpdated', 'llama3']

In [23]:
# Read the original Spider schema file (tables.json).
# Forward slashes are used because they work on Windows, Linux and macOS,
# whereas a backslash-separated path only resolves on Windows.
tableFileName = '../Spider/tables.json'
tablesData = loadJson(tableFileName)
tablesData[0].keys()

dict_keys(['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original'])

In [25]:
# Flatten the schemas into two parallel lists: for every table, the db_id it
# belongs to and its original (English) name, in file order.
listWithDbId = []
listWithTableNameOriginalPerDbId = []

for database in tablesData:
    dbId = database['db_id']
    tableNamesOriginal = database['table_names_original']

    # One db_id entry per table keeps both lists aligned index by index.
    listWithDbId.extend([dbId] * len(tableNamesOriginal))
    listWithTableNameOriginalPerDbId.extend(tableNamesOriginal)

In [13]:
# with open('BackUp/translatedTables624.json', 'r') as f:
#     dictWithTranslatedTableName  = json.load(f)

In [18]:
# Candidate translations per model: {'Gpt': [...], '<ollama model>': [...]},
# each list aligned with listWithTableNameOriginalPerDbId.
dictWithTranslatedTableName = {}

systemContent = 'I will give you table names from a schema in english and you will translate it to greek. Please return only the translated table name.'

totalNumOfTableNames = len(listWithTableNameOriginalPerDbId)
# Checkpoint every ~10% of the work. Clamp to 1 so that fewer than 10 items do
# not produce a step of 0 and a ZeroDivisionError in the modulo below.
percent = max(1, int(totalNumOfTableNames * 0.1))

for index, tableName in enumerate(listWithTableNameOriginalPerDbId, start=1):
    gptTranslatedTableName = removeTextInParentheses(gptTranslateInfo(systemContent, tableName))
    dictWithTranslatedTableName.setdefault('Gpt', []).append(gptTranslatedTableName)

    for ollamaModelName in ollamaModels:
        ollamaTranslatedTableName = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {tableName}', ollamaModelName))
        dictWithTranslatedTableName.setdefault(ollamaModelName, []).append(ollamaTranslatedTableName)

    # Also checkpoint after the very last item; otherwise the tail of the run
    # (everything after the last multiple of `percent`) is never backed up.
    if index % percent == 0 or index == totalNumOfTableNames:
        with open(f'BackUp/translatedTables{len(dictWithTranslatedTableName["Gpt"])}.json', 'w') as f:
            json.dump(dictWithTranslatedTableName, f)
        print(f"{index}/{totalNumOfTableNames} of table names examined.")

87/876 of table names examined.
174/876 of table names examined.
261/876 of table names examined.
348/876 of table names examined.
435/876 of table names examined.
522/876 of table names examined.
609/876 of table names examined.
696/876 of table names examined.
783/876 of table names examined.
870/876 of table names examined.


In [1]:
# Build the evaluation sheet: one prefixed column of candidate translations per
# model, the identifying columns, and an empty 'table_names_original_translated'
# column (presumably filled in by hand before the sheet is read back).
prefix = 'table_names_original_translated_'
tablesDf = pd.DataFrame(dictWithTranslatedTableName).add_prefix(prefix)

tablesDf['db_id'] = listWithDbId
tablesDf['table_names_original'] = listWithTableNameOriginalPerDbId
tablesDf['table_names_original_translated'] = ''

# Identifying columns first, every other column afterwards in its current order.
newColumnOrder = ['db_id', 'table_names_original']
newColumnOrder = newColumnOrder + [col for col in tablesDf.columns if col not in ['db_id', 'table_names_original']]
tablesDf = tablesDf.reindex(columns=newColumnOrder)

# Persist the sheet without the DataFrame index and confirm.
tableOutputFileName = 'Translation/Table Translate Evaluation.xlsx'
tablesDf.to_excel(tableOutputFileName, index=False)
print(f"DataFrame has been written to {tableOutputFileName} successfully.")

In [27]:
# Load the evaluation sheet and copy its 'table_names_original_translated'
# column into every schema, both verbatim and in curated form.
tableOutputFileName = 'Translation/Table Translate Evaluation.xlsx'
tablesDf = pd.read_excel(tableOutputFileName)

for database in tablesData:
    dbId = database['db_id']
    translatedTableNamesOriginal = fetchSpecificColumnForDbId(tablesDf, dbId, 'table_names_original_translated')

    listWithTablesNamesTranslated = []
    for tableName in translatedTableNamesOriginal:
        curatedTableName = splitAndJoin(tableName)
        # An empty curated name means the sheet holds no usable translation.
        if curatedTableName == '':
            raise ValueError(f'Curated Table Name for {tableName} should not be empty')
        listWithTablesNamesTranslated.append(curatedTableName)

    # Verbatim sheet values and their normalised counterparts, in table order.
    database['table_names_original_translated'] = translatedTableNamesOriginal
    database['table_names_translated'] = listWithTablesNamesTranslated

In [15]:
# Persist the schemas, which at this point carry the extra
# 'table_names_original_translated' and 'table_names_translated' keys.
translatedTableNamesPath = 'translatedTableNames.json'
writeJsonToFile(tablesData, translatedTableNamesPath)

JSON data written to translatedTableNames.json


In [None]:
# Reload the schemas from translatedTableNames.json so the column-translation
# steps below start from the saved file rather than from in-memory state.
translatedTableNamesPath = 'translatedTableNames.json'
tableFileName = translatedTableNamesPath
tablesData = loadJson(tableFileName)
# Display the available keys as a quick sanity check.
tablesData[0].keys()

In [32]:
# Four parallel lists with one entry per column across all schemas:
# owning db_id, original column name, table index, and translated table name.
listWithDbId = []
listWithColumnNameOriginalPerDbId = []
listWithTableIdForColumnName = []
listWithTableNamesForColumnName = []

for database in tablesData:
    dbId = database['db_id']

    # index -> translated table name for this schema
    dictWithTables = createTableDict(database)

    for columnNameInfo in database['column_names_original']:
        # Each entry is read as [tableId, columnName].
        tableId, columnNameOriginal = columnNameInfo[0], columnNameInfo[1]

        listWithDbId.append(dbId)
        listWithColumnNameOriginalPerDbId.append(columnNameOriginal)
        listWithTableIdForColumnName.append(tableId)

        # A table index of -1 has no table; it is recorded as the string '-1'.
        tableLabel = '-1' if tableId == -1 else dictWithTables[tableId]
        listWithTableNamesForColumnName.append(tableLabel)


In [31]:
# Group the original column names under their (translated) table name.
# NOTE(review): the key is the table name alone, so tables that share a name
# across different databases end up in one combined column list — confirm this
# is intended for the translation prompt context.
dictWitnTablesAndColumnsNames = {}

for tableName, columnName in zip(listWithTableNamesForColumnName, listWithColumnNameOriginalPerDbId):
    # setdefault creates the list on first sight of a table and reuses it afterwards.
    dictWitnTablesAndColumnsNames.setdefault(tableName, []).append(columnName)

In [32]:
def createTheDesiredPromtForColumnsTranslation(tableName, dictWitnTablesAndColumnsNames, columnName):
    """Build the user prompt asking for ``columnName`` to be translated to Greek.

    The prompt names the table and lists, comma-separated, every column that
    ``dictWitnTablesAndColumnsNames`` stores under ``tableName`` so the model
    sees the column in context. Raises ``KeyError`` for an unknown table.
    """
    columnCombination = ', '.join(dictWitnTablesAndColumnsNames[tableName])

    return f'''Translate the column "{columnName}" to Greek to fit the context of the table:
            Table: {tableName}
            Columns: {columnCombination}'''

In [121]:
# Candidate translations per model: {'Gpt': [...], '<ollama model>': [...]},
# each list aligned with listWithColumnNameOriginalPerDbId.
dictWithTranslatedColumnName = {}

systemContent = 'I will give you english column names from tables and you will translate it to greek. Please return only the translated column.'

totalNumOfColumnNames = len(listWithColumnNameOriginalPerDbId)
# Checkpoint every ~10% of the work. Clamp to 1 so that fewer than 10 items do
# not produce a step of 0 and a ZeroDivisionError in the modulo below.
percent = max(1, int(totalNumOfColumnNames * 0.1))

for index, (tableName, columnName) in enumerate(zip(listWithTableNamesForColumnName, listWithColumnNameOriginalPerDbId), start=1):
    # The prompt gives the model the table name and its sibling columns as context.
    prompt = createTheDesiredPromtForColumnsTranslation(tableName, dictWitnTablesAndColumnsNames, columnName)
    gptTranslatedColumnName = removeTextInParentheses(gptTranslateInfo(systemContent, prompt))
    dictWithTranslatedColumnName.setdefault('Gpt', []).append(gptTranslatedColumnName)

    for ollamaModelName in ollamaModels:
        ollamaTranslatedColumnName = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {prompt}', ollamaModelName))
        dictWithTranslatedColumnName.setdefault(ollamaModelName, []).append(ollamaTranslatedColumnName)

    # Also checkpoint after the very last item so the tail of the run is backed up.
    if index % percent == 0 or index == totalNumOfColumnNames:
        with open(f'BackUp/translatedColumns{len(dictWithTranslatedColumnName["Gpt"])}.json', 'w') as f:
            # Dump the COLUMN translations; the table-name dictionary was
            # previously written here by mistake, making the backups useless.
            json.dump(dictWithTranslatedColumnName, f)
        print(f"{index}/{totalNumOfColumnNames} of column names examined.")

46/461 of column names examined.
92/461 of column names examined.
138/461 of column names examined.
184/461 of column names examined.
230/461 of column names examined.
276/461 of column names examined.
322/461 of column names examined.
368/461 of column names examined.
414/461 of column names examined.
460/461 of column names examined.


In [122]:
# Build the evaluation sheet: one prefixed column of candidate translations per
# model, the identifying columns, and an empty 'column_names_original_translated'
# column (presumably filled in by hand before the sheet is read back).
prefix = 'column_names_original_translated_'
columnsDf = pd.DataFrame(dictWithTranslatedColumnName).add_prefix(prefix)

columnsDf['db_id'] = listWithDbId
columnsDf['table_id'] = listWithTableIdForColumnName
columnsDf['table_name'] = listWithTableNamesForColumnName
columnsDf['column_names_original'] = listWithColumnNameOriginalPerDbId
columnsDf['column_names_original_translated'] = ''

# Identifying columns first, every other column afterwards in its current order.
newColumnOrder = ['db_id', 'table_id', 'table_name', 'column_names_original']
newColumnOrder = newColumnOrder + [col for col in columnsDf.columns if col not in ['db_id', 'table_id', 'table_name', 'column_names_original']]
columnsDf = columnsDf.reindex(columns=newColumnOrder)

# Persist the sheet without the DataFrame index and confirm.
columnOutputFileName = 'Translation/Column Translate Evaluation.xlsx'
columnsDf.to_excel(columnOutputFileName, index=False)
print(f"DataFrame has been written to {columnOutputFileName} successfully.")

DataFrame has been written to Column Translate Evaluation.xlsx successfully.


In [34]:
# Load the evaluation sheet and copy its 'column_names_original_translated'
# column into every schema, both verbatim and in curated form.
columnOutputFileName = 'Translation/Column Translate Evaluation.xlsx'
columnsDf = pd.read_excel(columnOutputFileName)

for database in tablesData:
    dbId = database['db_id']

    # Table indices and chosen translations for this schema; both come from
    # the same filtered rows, so they are aligned element by element.
    listWithTableIdForColumnName = fetchSpecificColumnForDbId(columnsDf, dbId, 'table_id')
    listWithTranslatedColumnName = fetchSpecificColumnForDbId(columnsDf, dbId, 'column_names_original_translated')

    listWithColumnInfoOriginal = []
    listWithColumnInfo = []

    for tableId, translatedColumnName in zip(listWithTableIdForColumnName, listWithTranslatedColumnName):
        # Verbatim [tableId, translation] pair exactly as found in the sheet.
        listWithColumnInfoOriginal.append([tableId, translatedColumnName])

        curatedColumnName = splitAndJoin(translatedColumnName)
        if curatedColumnName == '':
            # Report the offending sheet value; the curated name is always ''
            # at this point and would make the message uninformative.
            raise ValueError(f'Curated Column Name for table {tableId} and Column {translatedColumnName} should not be empty')
        listWithColumnInfo.append([tableId, curatedColumnName])

    database['column_names_original_translated'] = listWithColumnInfoOriginal
    database['column_names_translated'] = listWithColumnInfo























In [35]:
# Move the curated Greek names into the 'table_names' / 'column_names' keys and
# drop the intermediate helper keys; the '*_original' keys are left as they are.
# NOTE: not idempotent — pop()/del raise KeyError if this cell is run a second
# time on the same tablesData.
for table in tablesData:
    #table['table_names_original'] = table.pop('table_names_original_translated')
    table['table_names'] = table.pop('table_names_translated')
    #table['column_names_original'] = table.pop('column_names_original_translated')
    table['column_names'] = table.pop('column_names_translated')
    del table['table_names_original_translated']
    del table['column_names_original_translated']


In [36]:
# Persist the schemas whose 'table_names' and 'column_names' now hold the Greek names.
translatedTableandColumnNamesPath = 'translatedTableandColumnNames.json'
writeJsonToFile(tablesData, translatedTableandColumnNamesPath)

JSON data written to translatedTableandColumnNames.json


# Translate dev.json

In [None]:
# Reload the translated schemas from disk so the question-translation steps
# start from the saved file rather than from in-memory state.
translatedTableandColumnNamesPath = 'translatedTableandColumnNames.json'
tableFileName = translatedTableandColumnNamesPath
tablesData = loadJson(tableFileName)
# Display the available keys as a quick sanity check.
tablesData[0].keys()

In [11]:
# Read the Spider dev.json question set.
# Forward slashes are used because they work on Windows, Linux and macOS,
# whereas a backslash-separated path only resolves on Windows.
devFileName = '../Spider/dev.json'
devDataset = loadJson(devFileName)
devDataset[0].keys()

dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql'])

In [20]:
# Index every schema by its db_id so a question's schema can be looked up directly.
dictWithTablesData = {jsonSchema['db_id']: jsonSchema for jsonSchema in tablesData}

In [21]:
def createTableDict(database):
    """Map each table's positional index to its translated name.

    The names are taken from ``database['table_names_translated']`` when that
    key exists. Schemas reloaded from translatedTableandColumnNames.json no
    longer carry it (the translated names were moved to ``'table_names'``
    before saving), so ``'table_names'`` is used as the fallback instead of
    failing with ``KeyError``.
    """
    if 'table_names_translated' in database:
        tablesNames = database['table_names_translated']
    else:
        tablesNames = database['table_names']

    return dict(enumerate(tablesNames))

In [22]:
def returnTableAndColumnInfoForDbId(dictWithTablesData, dbId):
    """Return ``[tableName, columnName]`` pairs for every column of schema ``dbId``.

    Translated names are read from the ``'table_names_translated'`` /
    ``'column_names_translated'`` keys when present. Schemas reloaded from
    translatedTableandColumnNames.json store the translated names under
    ``'table_names'`` / ``'column_names'`` instead, so those keys are used as
    the fallback rather than failing with ``KeyError``.

    Columns whose table index is -1 are skipped.

    Args:
        dictWithTablesData: Mapping of db_id to schema dictionary.
        dbId: Identifier of the schema to describe.

    Returns:
        A list of two-element lists ``[tableName, columnName]`` in schema order.
    """
    database = dictWithTablesData[dbId]

    # Resolve the table and column lists independently of which key layout is in use.
    if 'table_names_translated' in database:
        tablesNames = database['table_names_translated']
    else:
        tablesNames = database['table_names']
    if 'column_names_translated' in database:
        columnsInfo = database['column_names_translated']
    else:
        columnsInfo = database['column_names']

    dictWithTables = dict(enumerate(tablesNames))

    listWithTableAndColumnInfo = []
    for index, columnName in columnsInfo:
        if index != -1:
            listWithTableAndColumnInfo.append([dictWithTables[index], columnName])

    return listWithTableAndColumnInfo

In [23]:
def createSchemaInfoDict(listWithTableAndColumnInfo):
    """Group column names by table name.

    Takes ``[tableName, columnName]`` pairs and returns a dictionary mapping
    each table name to the list of its column names in encounter order. Pairs
    whose table name is -1 or whose column name is '*' are ignored.
    """
    dictWithSchemaInfo = {}

    for tableName, columnName in listWithTableAndColumnInfo:
        # Skip the placeholder table and the "all columns" marker.
        if tableName == -1 or columnName == '*':
            continue
        dictWithSchemaInfo.setdefault(tableName, []).append(columnName)

    return dictWithSchemaInfo

In [24]:
def createTheDesiredPromtForQuestionsTranslation(question, dictWithSchemaInfo):
    """Build the user prompt asking for ``question`` to be translated to Greek.

    After the instruction line, the prompt lists every table of
    ``dictWithSchemaInfo`` as a ``Table:`` line followed by a ``Columns:`` line
    holding its comma-separated column names.
    """
    schemaLines = []
    for tableName, columnNames in dictWithSchemaInfo.items():
        schemaLines.append(f"Table: {tableName}\n")
        schemaLines.append(f"Columns: {', '.join(columnNames)}\n")
    infoString = "".join(schemaLines)

    return f'''Translate the question "{question}" to Greek. You can only use keywords provided below:\n{infoString}'''

In [25]:
# Parallel lists with one entry per question, plus the candidate translations
# per model: {'Gpt': [...], '<ollama model>': [...]}.
listWithDbId = []
listWithOriginalQuestionPerDbId = []
dictWithTranslatedQuestionPerDbId = {}
listWithTableAndColumnInfoForQuestion = []

systemContent = 'I will give you english questions and you will translate it to greek. Please return only the translated question.'

totalNumOfQuestions = len(devDataset)
# Checkpoint every ~10% of the work. Clamp to 1 so that fewer than 10 items do
# not produce a step of 0 and a ZeroDivisionError in the modulo below.
percent = max(1, int(totalNumOfQuestions * 0.1))

for index, questionInfo in enumerate(devDataset, start=1):
    dbId = questionInfo['db_id']
    listWithDbId.append(dbId)

    # Translated schema of the question's database, used as vocabulary in the prompt.
    listWithTableAndColumnInfo = returnTableAndColumnInfoForDbId(dictWithTablesData, dbId)
    dictWithSchemaInfo = createSchemaInfoDict(listWithTableAndColumnInfo)
    listWithTableAndColumnInfoForQuestion.append(listWithTableAndColumnInfo)

    question = questionInfo['question']
    listWithOriginalQuestionPerDbId.append(question)

    prompt = createTheDesiredPromtForQuestionsTranslation(question, dictWithSchemaInfo)

    GptTranslatedQuestion = removeTextInParentheses(gptTranslateInfo(systemContent, prompt))
    dictWithTranslatedQuestionPerDbId.setdefault('Gpt', []).append(GptTranslatedQuestion)

    for ollamaModelName in ollamaModels:
        ollamaTranslatedQuestion = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {prompt}', ollamaModelName))
        dictWithTranslatedQuestionPerDbId.setdefault(ollamaModelName, []).append(ollamaTranslatedQuestion)

    # Also checkpoint after the very last item so the tail of the run is backed up.
    if index % percent == 0 or index == totalNumOfQuestions:
        with open(f'BackUp/translatedQuestions{len(dictWithTranslatedQuestionPerDbId["Gpt"])}.json', 'w') as f:
            json.dump(dictWithTranslatedQuestionPerDbId, f)
        print(f"{index}/{totalNumOfQuestions} of questions have been examined.")

Translate the question "How many singers do we have?" to Greek. You can only use keywords provided below:
Table: στάδιο
Columns: αναγνωριστικό σταδίου, τοποθεσία, όνομα, χωρητικότητα, υψηλότερο, χαμηλότερο, μέσος όρος
Table: τραγουδιστής
Columns: αναγνωριστικό τραγουδιστή, όνομα, χώρα, όνομα τραγουδιού, έτος κυκλοφορίας τραγουδιού, ηλικία, είναι άνδρας
Table: συναυλία
Columns: αναγνωριστικό συναυλίας, όνομα συναυλίας, θέμα, αναγνωριστικό σταδίου, έτος
Table: τραγουδιστής σε συναυλία
Columns: αναγνωριστικό συναυλίας, αναγνωριστικό τραγουδιστή



In [69]:
# Build the evaluation sheet: one prefixed column of candidate translations per
# model, the identifying columns, and an empty 'question_translated' column
# (presumably filled in by hand before the sheet is read back).
prefix = 'question_translated_'
questionsDf = pd.DataFrame(dictWithTranslatedQuestionPerDbId).add_prefix(prefix)

questionsDf['db_id'] = listWithDbId
questionsDf['table_info'] = listWithTableAndColumnInfoForQuestion
questionsDf['question'] = listWithOriginalQuestionPerDbId
questionsDf['question_translated'] = ''

# Identifying columns first, every other column afterwards in its current order.
newColumnOrder = ['db_id', 'table_info', 'question']
newColumnOrder = newColumnOrder + [col for col in questionsDf.columns if col not in ['db_id', 'table_info', 'question']]
questionsDf = questionsDf.reindex(columns=newColumnOrder)

# Persist the sheet without the DataFrame index and confirm.
questionsOutputFileName = 'Translation/Question Translate Evaluation.xlsx'
questionsDf.to_excel(questionsOutputFileName, index=False)
print(f"DataFrame has been written to {questionsOutputFileName} successfully.")

DataFrame has been written to Question Translate Evaluation.xlsx successfully.


# Add question tokens to final json

In [12]:
import nltk

def tokenizeSentence(sentence):
    """Return the tokens that ``nltk.word_tokenize`` produces for ``sentence``."""
    return nltk.word_tokenize(sentence)

In [15]:
# Load the evaluation sheet and attach to every question its chosen Greek
# translation together with the translation's tokens.
questionsOutputFileName = 'Translation/Question Translate Evaluation.xlsx'
questionsDf = pd.read_excel(questionsOutputFileName)

for questionInfo in devDataset:
    dbId = questionInfo['db_id']
    question = questionInfo['question']

    # Rows of the sheet that belong to this exact (db_id, question) pair.
    filteredQuestionsDf = questionsDf[(questionsDf['db_id'] == dbId) & (questionsDf['question'] == question)]
    if filteredQuestionsDf.empty:
        # Fail with context instead of a bare IndexError from `.values[0]`.
        raise ValueError(f'No row found in {questionsOutputFileName} for db_id {dbId} and question {question}')

    # The first matching row wins when the same question appears more than once.
    questionTranslated = filteredQuestionsDf['question_translated'].values[0]
    if not isinstance(questionTranslated, str) or not questionTranslated.strip():
        # An empty cell is read back as NaN and would otherwise surface as an
        # opaque error inside the tokenizer.
        raise ValueError(f'Translated question for db_id {dbId} and question {question} should not be empty')

    questionInfo['question_translated'] = questionTranslated
    questionInfo['question_toks_translated'] = tokenizeSentence(questionTranslated)

In [16]:
# Replace the English question and its tokens with the Greek ones and drop the
# temporary '*_translated' keys.
# NOTE: not idempotent — pop() raises KeyError if this cell is run a second time.
for questionInfo in devDataset:
    questionInfo['question'] = questionInfo.pop('question_translated')
    questionInfo['question_toks'] = questionInfo.pop('question_toks_translated')

In [17]:
# Save the question set, now carrying the Greek questions and tokens, as devGreek.json
writeJsonToFile(devDataset, 'devGreek.json')

JSON data written to devGreek.json
