# Translate tables.json

In [1]:
import json

# Define a function to load JSON data from a file
def loadJson(filename):
    """Parse the UTF-8 encoded JSON file at `filename` and return the decoded content."""
    with open(filename, mode='r', encoding='utf-8') as jsonFile:
        return json.load(jsonFile)

In [2]:
import pandas as pd

def fetchSpecificColumnForDbId(df, specificDbId, columnName):
    """Return, as a plain list, the `columnName` values of the rows whose 'db_id' equals `specificDbId`."""
    # Boolean mask selecting only the rows that belong to the requested database
    belongsToDb = df['db_id'] == specificDbId
    return df.loc[belongsToDb, columnName].tolist()

In [3]:
def createTableDict(database):
    """Map each table's position in the schema's 'table_names' list to the table name."""
    # TODO: Change to 'table_names_translated' once available
    return dict(enumerate(database['table_names']))

In [4]:
import re

def removeTextInParentheses(inputString):
    """Return `inputString` with every '(...)' group, parentheses included, removed."""
    # An opening parenthesis, any run of characters other than ')', then the closing one
    parenthesizedText = re.compile(r'\([^)]*\)')
    return parenthesizedText.sub('', inputString)

In [5]:
from ollama import Client

def meltemiTranslateInfo(prompt, modelName):
    """Send `prompt` as a single user message to the Ollama model `modelName` and return the reply text."""
    ollamaClient = Client(host='http://10.8.11.209:11434')
    userMessage = {'role': 'user', 'content': prompt}
    reply = ollamaClient.chat(model=modelName, messages=[userMessage])
    return reply['message']['content']

In [6]:
import openai

def gptTranslateInfo(systemContent, inputInfo):
    """Ask the Azure OpenAI chat deployment to translate `inputInfo` and return the reply text.

    Parameters:
        systemContent: the system message describing the translation task, e.g.
            'I will give you english column names from tables in english and you will
            translate it in greek. Please return only the translated column.'
        inputInfo: the user message carrying the text to translate, e.g.
            'Translate the column "assets in million" to greek to fit the context of the table:
            Table: company
            Columns: company id, name, headquarters, industry, sales in million, assets in million'

    Returns:
        The content of the first choice of the chat completion.

    Raises:
        RuntimeError: if no API key is available in the environment.
    """
    import os  # local import so this cell does not depend on another cell importing os

    # The API key is read from the environment instead of being stored in the notebook.
    # NOTE(review): the key that used to be embedded here should be treated as leaked and rotated.
    apiKey = os.environ.get('AZURE_OPENAI_API_KEY') or os.environ.get('OPENAI_API_KEY')
    if not apiKey:
        raise RuntimeError(
            'Azure OpenAI API key not found: set the AZURE_OPENAI_API_KEY '
            '(or OPENAI_API_KEY) environment variable before calling gptTranslateInfo.'
        )

    openai.api_type = 'azure'
    openai.api_base = 'https://pta-nbg-poc1.openai.azure.com/'
    openai.api_version = '2023-03-15-preview' # this may change in the future
    openai.api_key = apiKey
    deployment_name = 'got-35-turbo1'

    response = openai.ChatCompletion.create(
        engine=deployment_name, # The deployment name you chose when you deployed the GPT-3.5-Turbo or GPT-4 model.
        temperature=0.3,
        messages=[
            {"role": "system", "content": systemContent},
            {"role": "user", "content": inputInfo}
        ]
    )

    return response['choices'][0]['message']['content']

In [7]:
# Ollama model names; each one is passed to meltemiTranslateInfo in the translation loops
ollamaModels = ['meltemiUpdated', 'llama3']

In [8]:
# Read the tables.json
# Forward slashes keep the relative path valid on Windows, Linux and macOS alike
tableFileName = '../Spider/tables.json'
tablesData = loadJson(tableFileName)
# Show which keys a schema entry carries
tablesData[0].keys()

dict_keys(['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original'])

In [9]:
# Flatten the schemas into two parallel lists: one entry per original table name,
# paired with the db_id of the database that owns it
listWithDbId = []
listWithTableNameOriginalPerDbId = []

for database in tablesData:
    dbId = database['db_id']
    tableNamesOriginal = database['table_names_original']

    # Repeat the db_id once per table so both lists stay aligned index by index
    listWithDbId.extend([dbId] * len(tableNamesOriginal))
    listWithTableNameOriginalPerDbId.extend(tableNamesOriginal)

In [10]:
# Translations collected per model: key = model label, value = list of translated table names
dictWithTranslatedTableName = {}

systemContent = 'I will give you table names from a schema in english and you will translate it to greek. Please return only the translated table name.'

totalNumOfTableNames = len(listWithTableNameOriginalPerDbId)
# Step size of 1% of the tables (the name is historical); max(1, ...) prevents a
# modulo-by-zero below when there are fewer than 100 table names
tenPercent = max(1, totalNumOfTableNames // 100)

# Iterate over each tableName in listWithTableNameOriginalPerDbId
for index, tableName in enumerate(listWithTableNameOriginalPerDbId, start=1):
    # Translate with GPT and strip any parenthesised remarks from the answer
    gptTranslatedTableName = removeTextInParentheses(gptTranslateInfo(systemContent, tableName))
    dictWithTranslatedTableName.setdefault('Gpt', []).append(gptTranslatedTableName)

    # Translate the same table name with every Ollama model
    for ollamaModelName in ollamaModels:
        ollamaTranslatedTableName = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {tableName}', ollamaModelName))
        dictWithTranslatedTableName.setdefault(ollamaModelName, []).append(ollamaTranslatedTableName)

    # Report once the first 1% step is reached and stop there, so only that
    # sample of table names is translated in this run
    if index % tenPercent == 0:
        print(f"{index}/{totalNumOfTableNames} of table names examined.")
        break

8/876 of table names examined.


In [11]:
# Display the table-name translations collected for each model
dictWithTranslatedTableName

{'Gpt': ['Δράστης',
  'άνθρωποι',
  'αίθουσα διδασκαλίας',
  'τμήμα',
  'μάθημα',
  'καθηγητής',
  'τμήμα',
  'διδάσκει'],
 'meltemiUpdated': ['perpetrator ',
  'Πίνακας: άνθρωποι \n\nΜεταφρασμένο σε Ελληνικά: Άνθρωποι ',
  'classroom  = αίθουσα διδασκαλίας',
  'τμήμα',
  'Πρόγραμμα σπουδών ',
  'instructor ',
  'Είσαι ένας βοηθός που βοηθά τους ανθρώπους να βρίσκουν πληροφορίες. Θα σας δώσω μια αγγλική πρόταση και θα μεταφράσετε σε ελληνικά. Παρακαλώ απαντήστε μόνο με τη μετάφραση, χωρίς καμία εξήγηση ή πρόσθετες πληροφορίες.',
  'Ελληνικά: Διδάσκει '],
 'llama3': ['Εγκληματίας ',
  'Λαοί\n\n',
  'Ταξινόμος Classroom ',
  'Τμήμα ',
  'Πρόγραμμα ',
  'Επιμελητής ',
  'Εκτομή ',
  'Εκδίδωσ ']}

In [12]:
# One column per model, one row per translated table name
tablesDf = pd.DataFrame(dictWithTranslatedTableName)
# Add prefix to column names
prefix = 'table_names_translated_'
tablesDf.columns = [prefix + col for col in tablesDf.columns]

# Align the metadata with however many table names were actually translated,
# instead of relying on a hard-coded row count
numTranslatedTables = len(tablesDf)
tablesDf['db_id'] = listWithDbId[:numTranslatedTables]
tablesDf['table_names_original'] = listWithTableNameOriginalPerDbId[:numTranslatedTables]
# Empty column that is read back later as the final translation
tablesDf['table_names_translated'] = ''

# Define the new column order with "db_id" first and "table_names_original" second
newColumnOrder = ['db_id', 'table_names_original'] + [col for col in tablesDf.columns if col not in ['db_id', 'table_names_original']]

# Change the order of columns using reindex
tablesDf = tablesDf.reindex(columns=newColumnOrder)

# Define the output file name for the Excel file
tableOutputFileName = 'Table Translate Evaluation.xlsx'

# Write the DataFrame to an Excel file without including the index
tablesDf.to_excel(tableOutputFileName, index=False)

# Print a confirmation message indicating successful writing of the DataFrame to the Excel file
print(f"DataFrame has been written to {tableOutputFileName} successfully.")

DataFrame has been written to Table Translate Evaluation.xlsx successfully.


In [13]:
# # Creating a DataFrame with three columns: db_id, table_names_original, and table_names_translated
# tablesDf = pd.DataFrame({
#     'db_id': listWithDbId,
#     'table_names_original': listWithTableNameOriginalPerDbId,
#     'table_names_translated': listWithTranslatedTableName
# })

# # Define the output file name for the Excel file
# tableOutputFileName = 'Table Translate Evaluation.xlsx'

# # Write the DataFrame to an Excel file without including the index
# tablesDf.to_excel(tableOutputFileName, index=False)

# # Print a confirmation message indicating successful writing of the DataFrame to the Excel file
# print(f"DataFrame has been written to {tableOutputFileName} successfully.")

In [14]:
# Read the final table name DataFrame from the Excel file
tablesDf = pd.read_excel(tableOutputFileName)

# Add the translated table names to the original data
for database in tablesData:
    # Extract the db_id of the current database
    dbId = database['db_id']
    # Fetch the translated table names for the current db_id from tablesDf
    translated_table_names = fetchSpecificColumnForDbId(tablesDf, dbId, 'table_names_translated')
    # Assign the translated table names to the 'table_names_translated' key in the original database data
    database['table_names_translated'] = translated_table_names
    break #TODO: Remove the break

In [15]:
#Save the tables.json as it contains the translated table_names

In [16]:
# Four parallel lists, one entry per original column name
listWithDbId = []
listWithColumnNameOriginalPerDbId = []
listWithTableIdForColumnName = []
listWithTableNamesForColumnName = []

for database in tablesData:
    dbId = database['db_id']

    # Table index -> table name for the current database
    dictWithTables = createTableDict(database)

    for columnNameInfo in database['column_names_original']:
        # Each entry is [tableId, columnNameOriginal]
        tableId, columnNameOriginal = columnNameInfo[0], columnNameInfo[1]

        listWithDbId.append(dbId)
        listWithColumnNameOriginalPerDbId.append(columnNameOriginal)
        listWithTableIdForColumnName.append(tableId)

        # A tableId of -1 has no table behind it and is recorded as the string '-1'
        tableLabel = '-1' if tableId == -1 else dictWithTables[tableId]
        listWithTableNamesForColumnName.append(tableLabel)

    # Only the first database is processed for now
    break #TODO: Remove the break


In [17]:
# Create an empty dictionary to store table names as keys and corresponding column names as values.
dictWitnTablesAndColumnsNames = {}

# Iterate through pairs of table names and column names.
for tableName, columnName in zip(listWithTableNamesForColumnName, listWithColumnNameOriginalPerDbId):
    # Check if the table name is not already in the dictionary.
    if tableName not in dictWitnTablesAndColumnsNames.keys():
        # If not, initialize the entry with a list containing the current column name.
        dictWitnTablesAndColumnsNames[tableName] = [columnName]
    else:
        # If the table name is already in the dictionary, append the current column name to its list of column names.
        dictWitnTablesAndColumnsNames[tableName].append(columnName)

In [18]:
def createTheDesiredPromtForColumnsTranslation(tableName, dictWitnTablesAndColumnsNames, columnName):
    """Build the prompt asking for `columnName` to be translated to Greek in the context of its table.

    The prompt lists the table name and all of its column names, taken from
    `dictWitnTablesAndColumnsNames[tableName]`, as context for the translation.
    """
    columnCombination = ', '.join(dictWitnTablesAndColumnsNames[tableName])

    # The second and third prompt lines keep their 12 leading spaces
    promptLines = [
        f'Translate the column "{columnName}" to Greek to fit the context of the table:',
        f'            Table: {tableName}',
        f'            Columns: {columnCombination}',
    ]
    return '\n'.join(promptLines)

In [19]:
# Translations collected per model: key = model label, value = list of translated column names
dictWithTranslatedColumnName = {}

systemContent = 'I will give you english column names from tables and you will translate it to greek. Please return only the translated column.'

count = 0 # TODO Remove Count
for tableName, columnName in zip(listWithTableNamesForColumnName, listWithColumnNameOriginalPerDbId):
    # Prompt that gives the column together with its table and sibling columns
    prompt = createTheDesiredPromtForColumnsTranslation(tableName, dictWitnTablesAndColumnsNames, columnName)

    # GPT translation, with parenthesised remarks stripped
    gptTranslatedColumnName = removeTextInParentheses(gptTranslateInfo(systemContent, prompt))
    dictWithTranslatedColumnName.setdefault('Gpt', []).append(gptTranslatedColumnName)

    # Same prompt for every Ollama model, with the system text prepended
    for ollamaModelName in ollamaModels:
        ollamaTranslatedColumnName = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {prompt}', ollamaModelName))
        dictWithTranslatedColumnName.setdefault(ollamaModelName, []).append(ollamaTranslatedColumnName)

    # Stop after five columns
    count += 1 # TODO Remove Count
    if count == 5: # TODO Remove Count
        break # TODO Remove Count

In [20]:
# One column per model, one row per translated column name
columnsDf = pd.DataFrame(dictWithTranslatedColumnName)
# Add prefix to column names
prefix = 'column_names_translated_'
columnsDf.columns = [prefix + col for col in columnsDf.columns]

# Align the metadata with however many column names were actually translated,
# instead of relying on a hard-coded row count
numTranslatedColumns = len(columnsDf)
columnsDf['db_id'] = listWithDbId[:numTranslatedColumns]
columnsDf['table_id'] = listWithTableIdForColumnName[:numTranslatedColumns]
columnsDf['table_name'] = listWithTableNamesForColumnName[:numTranslatedColumns]
columnsDf['column_names_original'] = listWithColumnNameOriginalPerDbId[:numTranslatedColumns]
# Empty column that is read back later as the final translation
columnsDf['column_names_translated'] = ''

# Define the new column order: db_id, table_id, table_name, column_names_original, then the rest
newColumnOrder = ['db_id', 'table_id', 'table_name', 'column_names_original'] + [col for col in columnsDf.columns if col not in ['db_id', 'table_id', 'table_name', 'column_names_original']]

# Change the order of columns using reindex
columnsDf = columnsDf.reindex(columns=newColumnOrder)

# Define the output file name for the Excel file
columnOutputFileName = 'Column Translate Evaluation.xlsx'

# Write the DataFrame to an Excel file without including the index
columnsDf.to_excel(columnOutputFileName, index=False)

# Print a confirmation message indicating successful writing of the DataFrame to the Excel file
print(f"DataFrame has been written to {columnOutputFileName} successfully.")

DataFrame has been written to Column Translate Evaluation.xlsx successfully.


In [21]:
# # Creating a DataFrame with four columns: db_id, table_id, column_names_original, and column_names_translated
# columnsDf = pd.DataFrame({
#     'db_id': listWithDbId,
#     'table_id': listWithTableIdForColumnName,
#     'table_name': listWithTableNamesForColumnName,
#     'column_names_original': listWithColumnNameOriginalPerDbId,
#     'column_names_translated': listWithTranslatedColumnName
# })

# # Define the output file name for the Excel file
# columnOutputFileName = 'Column Translate Evaluation.xlsx'

# # Write the DataFrame to an Excel file without including the index
# columnsDf.to_excel(columnOutputFileName, index=False)

# # Print a confirmation message indicating successful writing of the DataFrame to the Excel file
# print(f"DataFrame has been written to {columnOutputFileName} successfully.")

In [22]:
# Inspect the keys of the first schema entry
tablesData[0].keys()

dict_keys(['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original', 'table_names_translated'])

In [23]:
# Read the DataFrame containing translated column names from the Excel file
columnsDf = pd.read_excel(columnOutputFileName)

# Initialize lists to store translated column names, table IDs, and column info
listWithTranslatedColumnName = []
listWithTableIdForColumnName = []
listWithColumnInfo = []

# Add the translated column names to the original data
for database in tablesData:
    # Extract the db_id of the current database
    dbId = database['db_id']
    
    # Fetch the table IDs and translated column names for the current db_id from columnsDf
    listWithTableIdForColumnName = fetchSpecificColumnForDbId(columnsDf, dbId, 'table_id')
    listWithTranslatedColumnName = fetchSpecificColumnForDbId(columnsDf, dbId, 'column_names_translated')
    
    # Iterate over the translated column names and table IDs
    for i in range(len(listWithTranslatedColumnName)):
        # Append the table ID and translated column name as a list to listWithColumnInfo
        listWithColumnInfo.append([listWithTableIdForColumnName[i], listWithTranslatedColumnName[i]])
    
    # Assign the listWithColumnInfo to the 'column_names_translated' key in the original database data
    database['column_names_translated'] = listWithColumnInfo
    
    break #TODO: Remove the break

In [24]:
# Display the first schema entry, including the translated keys added above
tablesData[0]

{'column_names': [[-1, '*'],
  [0, 'perpetrator id'],
  [0, 'people id'],
  [0, 'date'],
  [0, 'year'],
  [0, 'location'],
  [0, 'country'],
  [0, 'killed'],
  [0, 'injured'],
  [1, 'people id'],
  [1, 'name'],
  [1, 'height'],
  [1, 'weight'],
  [1, 'home town']],
 'column_names_original': [[-1, '*'],
  [0, 'Perpetrator_ID'],
  [0, 'People_ID'],
  [0, 'Date'],
  [0, 'Year'],
  [0, 'Location'],
  [0, 'Country'],
  [0, 'Killed'],
  [0, 'Injured'],
  [1, 'People_ID'],
  [1, 'Name'],
  [1, 'Height'],
  [1, 'Weight'],
  [1, 'Home Town']],
 'column_types': ['text',
  'number',
  'number',
  'text',
  'number',
  'text',
  'text',
  'number',
  'number',
  'number',
  'text',
  'number',
  'number',
  'text'],
 'db_id': 'perpetrator',
 'foreign_keys': [[2, 9]],
 'primary_keys': [1, 9],
 'table_names': ['perpetrator', 'people'],
 'table_names_original': ['perpetrator', 'people'],
 'table_names_translated': [nan, nan],
 'column_names_translated': [[-1, nan],
  [0, nan],
  [0, nan],
  [0, nan],

In [25]:
#Save the updated tables.json

# Translate dev.json

In [26]:
# Read the tables.json
tableFileName = r'..\Spider\tables.json'
tablesData = loadJson(tableFileName)
tablesData[0].keys()

dict_keys(['column_names', 'column_names_original', 'column_types', 'db_id', 'foreign_keys', 'primary_keys', 'table_names', 'table_names_original'])

In [27]:
# Read the dev.json
devFileName = r'..\Spider\dev.json'
devDataset = loadJson(devFileName)
devDataset[0].keys()

dict_keys(['db_id', 'query', 'query_toks', 'query_toks_no_value', 'question', 'question_toks', 'sql'])

In [28]:
# Index the schemas by db_id so a question's schema can be looked up directly
dictWithTablesData = {jsonSchema['db_id']: jsonSchema for jsonSchema in tablesData}

In [29]:
def createTableDict(database):
    """Return {table index: table name} built from the schema's 'table_names' list."""
    # TODO: Change to 'table_names_translated' once available
    tablesNames = database['table_names']
    return {tableIndex: tableName for tableIndex, tableName in enumerate(tablesNames)}

In [30]:
def returnTableAndColumnInfoForDbId(dictWithTablesData, dbId):
    """Return a list of [tableName, columnName] pairs for the schema stored under `dbId`.

    Entries of 'column_names' whose table index is -1 are left out.
    """
    schema = dictWithTablesData[dbId]
    # Table index -> table name for this schema
    tableNameByIndex = createTableDict(schema)

    # TODO: Change to 'column_names_translated' once available
    return [
        [tableNameByIndex[tableIndex], columnName]
        for tableIndex, columnName in schema['column_names']
        if tableIndex != -1
    ]

In [31]:
def createSchemaInfoDict(listWithTableAndColumnInfo):
    """Group column names by table name: {tableName: [columnName, ...]}.

    Pairs whose table name is -1 or whose column name is '*' are ignored.
    """
    schemaInfo = {}
    for tableName, columnName in listWithTableAndColumnInfo:
        # Skip the special values
        if tableName == -1 or columnName == '*':
            continue
        # setdefault creates the list the first time a table name shows up
        schemaInfo.setdefault(tableName, []).append(columnName)
    return schemaInfo

In [32]:
def createTheDesiredPromtForQuestionsTranslation(question, dictWithSchemaInfo):
    """Build the prompt asking for `question` to be translated to Greek.

    The tables and columns in `dictWithSchemaInfo` are appended as the only
    keywords the translation may use, one 'Table:' and one 'Columns:' line per table.
    """
    schemaLines = [
        f"Table: {tableName}\nColumns: {', '.join(columnNames)}\n"
        for tableName, columnNames in dictWithSchemaInfo.items()
    ]
    infoString = ''.join(schemaLines)

    return f'''Translate the question "{question}" to Greek. You can only use keywords provided below:\n{infoString}'''

In [33]:
# Parallel lists with one entry per processed question, plus the translations per model
listWithDbId = []
listWithOriginalQuestionPerDbId = []
dictWithTranslatedQuestionPerDbId = {}
listWithTableAndColumnInfoForQuestion = []

systemContent = 'I will give you english questions and you will translate it to greek. Please return only the translated question.'

count = 0 # TODO Remove Count
for questionInfo in devDataset:
    # Record which database the question refers to
    dbId = questionInfo['db_id']
    listWithDbId.append(dbId)

    # Collect the [tableName, columnName] pairs of that database and group them by table
    listWithTableAndColumnInfo = returnTableAndColumnInfoForDbId(dictWithTablesData, dbId)
    dictWithSchemaInfo = createSchemaInfoDict(listWithTableAndColumnInfo)
    listWithTableAndColumnInfoForQuestion.append(listWithTableAndColumnInfo)

    # Record the original question
    question = questionInfo['question']
    listWithOriginalQuestionPerDbId.append(question)

    prompt = createTheDesiredPromtForQuestionsTranslation(question, dictWithSchemaInfo)

    # GPT translation, with parenthesised remarks stripped
    GptTranslatedQuestion = removeTextInParentheses(gptTranslateInfo(systemContent, prompt))
    dictWithTranslatedQuestionPerDbId.setdefault('Gpt', []).append(GptTranslatedQuestion)

    # Same prompt for every Ollama model, with the system text prepended
    for ollamaModelName in ollamaModels:
        ollamaTranslatedQuestion = removeTextInParentheses(meltemiTranslateInfo(f'{systemContent} \n {prompt}', ollamaModelName))
        dictWithTranslatedQuestionPerDbId.setdefault(ollamaModelName, []).append(ollamaTranslatedQuestion)

    # Stop after five questions
    count += 1 # TODO Remove Count
    if count == 5: # TODO Remove Count
        break # TODO Remove Count

In [34]:
# One column per model, one row per translated question
questionsDf = pd.DataFrame(dictWithTranslatedQuestionPerDbId)
# Add prefix to column names
prefix = 'question_translated_'
questionsDf.columns = [prefix + col for col in questionsDf.columns]

# Align the metadata with however many questions were actually translated; a
# hard-coded slice of a different length makes the column assignment fail
numTranslatedQuestions = len(questionsDf)
questionsDf['db_id'] = listWithDbId[:numTranslatedQuestions]
questionsDf['table_info'] = listWithTableAndColumnInfoForQuestion[:numTranslatedQuestions]
questionsDf['question_original'] = listWithOriginalQuestionPerDbId[:numTranslatedQuestions]
# Empty column that is read back later as the final translation
questionsDf['question_translated'] = ''

# Define the new column order: db_id, table_info, question_original, then the rest
newColumnOrder = ['db_id', 'table_info', 'question_original'] + [col for col in questionsDf.columns if col not in ['db_id', 'table_info', 'question_original']]

# Change the order of columns using reindex
questionsDf = questionsDf.reindex(columns=newColumnOrder)

# Define the output file name for the Excel file
questionsOutputFileName = 'Question Translate Evaluation.xlsx'

# Write the DataFrame to an Excel file without including the index
questionsDf.to_excel(questionsOutputFileName, index=False)

# Print a confirmation message indicating successful writing of the DataFrame to the Excel file
print(f"DataFrame has been written to {questionsOutputFileName} successfully.")

DataFrame has been written to Question Translate Evaluation.xlsx successfully.


In [35]:
# # Creating a DataFrame with four columns: db_id, table_info, question_original, and question_translated
# questionsDf = pd.DataFrame({
#     'db_id': listWithDbId,
#     'table_info': listWithTableAndColumnInfoForQuestion,
#     'question_original': listWithOriginalQuestionPerDbId,
#     'question_translated': listWithTranslatedQuestionPerDbId
# })

# # Define the output file name for the Excel file
# questionsOutputFileName = 'Question Translate Evaluation.xlsx'

# # Write the DataFrame to an Excel file without including the index
# questionsDf.to_excel(questionsOutputFileName, index=False)

# # Print a confirmation message indicating successful writing of the DataFrame to the Excel file
# print(f"DataFrame has been written to {questionsOutputFileName} successfully.")

# Add question tokens to final json

In [36]:
import nltk

def tokenizeSentence(sentence):
    """Return the tokens that nltk.word_tokenize produces for `sentence`."""
    return nltk.word_tokenize(sentence)

In [38]:
# Read the final question DataFrame from the Excel file
questionsDf = pd.read_excel(questionsOutputFileName)

# Add the translated question and its tokenized form to the original data
for questionInfo in devDataset:
    # Extract the db_id and original question from the questionInfo
    dbId = questionInfo['db_id']
    question = questionInfo['question']
    
    # Filter questionsDf to find the row corresponding to the current dbId and original question
    filteredQuestionsDf = questionsDf[(questionsDf['db_id'] == dbId) & (questionsDf['question_original'] == question)]
    
    # Extract the translated question from the filtered DataFrame
    questionTranslated = filteredQuestionsDf['question_translated'].values[0]
    
    # Update questionInfo with the translated question and its tokenized form
    questionInfo['question_translated'] = questionTranslated
    questionInfo['question_toks_translated'] = tokenizeSentence(questionTranslated)
    
    break #TODO: Remove the break

In [None]:
# Display the first question entry
devDataset[0]

In [None]:
#Save the updated dev.json