In [1]:
# Notebook setup: imports, environment configuration and shared constants.
import os
import warnings
# Silence every warning category for the rest of the session. This runs
# before the third-party imports below, so their import-time warnings are
# hidden as well.
warnings.filterwarnings('ignore')

# Load settings from a .env file into the process environment (python-dotenv).
# Must run before the os.getenv() lookups for the API key and MySQL settings.
from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob

# The OpenAI key comes from the environment; os.getenv returns None when
# OPENAI_API_KEY is not set.
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import mysql.connector as mysql
from sqlalchemy import create_engine as dbengine

# Quoted, comma-separated value list that is interpolated verbatim into
# ENUM(...) when imputation columns are added to a table. The final entry is
# the empty string.
IMPUTATIONVALUES = "'A','B','C','D','G','H','J','K','L','N','P','R','Z',''"

In [2]:
def create_dbconnection():
    """Open a MySQL connection using settings taken from the environment.

    The host, database, user and password are read with os.getenv from
    MYSQL_SERVER, MYSQL_DATABASE, MYSQL_USER and MYSQL_PASSWORD.

    Returns:
        tuple: ``(connection, cursor)`` when the connection succeeds.
        None: when connecting fails; a message describing the failure is
        printed first. Callers must check for None before unpacking.
    """
    # Imported here because the notebook's import cell only binds
    # `mysql.connector` itself; the error-code constants are not otherwise in
    # scope (the original referenced an undefined name, `sqlError`).
    from mysql.connector import errorcode

    try:
        db = mysql.connect(host=os.getenv('MYSQL_SERVER'),
                           database=os.getenv('MYSQL_DATABASE'),
                           user=os.getenv('MYSQL_USER'),
                           password=os.getenv('MYSQL_PASSWORD'))
        return db, db.cursor()
    except mysql.Error as e:
        if e.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif e.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(e)
        return None

In [3]:
def execute_dbquery(query, db=None, cursor=None):
    """Execute a single SQL statement and commit it.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        db: Open database connection. If either ``db`` or ``cursor`` is None,
            a new connection is opened with ``create_dbconnection()`` and
            both arguments are replaced by it.
        cursor: Cursor belonging to ``db``.

    Raises:
        ConnectionError: if a connection had to be opened and that failed.

    On a MySQL error the failure is printed and ``exit(1)`` is called.
    A connection opened by this function is closed before returning;
    a connection supplied by the caller is left open.
    """
    # If no db or cursor is provided, connect to the database
    owns_connection = db is None or cursor is None
    if owns_connection:
        connection = create_dbconnection()
        if connection is None:
            # create_dbconnection() already printed the reason.
            raise ConnectionError("Could not connect to the database; query was not executed.")
        db, cursor = connection

    try:
        cursor.execute(query)
        db.commit()
    except mysql.Error as e:
        print(f"Failed executing query: {query} - Error: {e}")
        exit(1)
    finally:
        # Only release what this call opened; callers that pass their own
        # connection keep control of its lifetime.
        if owns_connection:
            cursor.close()
            db.close()


In [4]:
def create_name_from_description(titles):
    """Ask the OpenAI chat API for database column names matching descriptions.

    Each description is lower-cased, stripped of commas, apostrophes and
    periods, wrapped in square brackets and sent as one user message after a
    fixed system prompt and three worked examples.

    Args:
        titles: Iterable of column descriptions. Non-string entries (for
            example NaN from an empty spreadsheet cell) are converted with
            ``str()`` so that one entry is still submitted per input.

    Returns:
        list[str]: Column names parsed from the reply, with brackets and all
        whitespace removed and empty entries dropped. An empty input returns
        ``[]`` without calling the API. The model is asked for one name per
        description, but that is not enforced here; callers should compare
        lengths.
    """
    titles = list(titles)
    if not titles:
        # Nothing to name: avoid a pointless (and billed) API request.
        return []

    titleList = ""
    for title in titles:
        title = str(title).replace(",", "").replace("'", "").replace(".", "").lower()
        titleList += f'[{title}],'
    message = {"role": "user", "content": titleList}
    messages = [
        {
        "role": "system",
        "content": """
            Produce a list of readable database column names using the list of Column Descriptions provided.
            Column Names should NOT start with a number, be MySQL reserved word or Python keyword. 
            All special characters are to be replaced with an underscore and the Column Name should be all lowercase. 
            Make sure Column Names are readable, descriptive, concise, consistent, unique, and less than 30 characters including underscores.
            Return one entry for each Column Name submitted by the user.
            DO NOT return a numbered list.
            Substitute common words in the Column Descriptions with abbreviations such as:
                'inst' replaces 'institution'
                'id' replaces 'identification'
                'class' replaces 'classification'
                'cd' replaces 'code'
                'org' replaces 'organization'
                'loc' replaces 'location'
                'cat' replaces 'category'
                'url' replaces 'web address'
                'url' replaces 'website address'
                'rpt' replaces 'report'
            Returned Column Name(s) should be in square brackets."""
        },
        {
        "role": "user",
        "content": "[Unique identification number of the institution]"
        },
        {
        "role": "assistant",
        "content": "[uid]"
        },
        {
        "role": "user",
        "content": "[Institution (entity) name], [Institution name alias]"
        },
        {
        "role": "assistant",
        "content": "[inst_name], [inst_alias]"
        },
        {
        "role": "user",
        "content": """[Title of chief administrator], 
                [Disability Services Web Address], 
                [Institution's internet web address], 
                [Office of Postsecondary Education (OPE) ID Number], 
                [Sector of institution]"""
        },
        {
        "role": "assistant",
        "content": "[title_chief_administrator], [disability_url], [inst_url], [ope_id], [sector]"
        }
    ]
    messages.append(message)

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    content = response.choices[0]['message']['content']
    # Remove every kind of whitespace (the reply may span several lines, as
    # the multi-line example above does), then the brackets.
    cleaned = "".join(content.split()).replace("[", "").replace("]", "")
    # A trailing or doubled comma would otherwise produce empty names.
    return [name for name in cleaned.split(",") if name]



In [5]:
def return_list_of_processed_tables(directory_path='dictionary/', file_suffix='.new.xlsx'):
    """List the tables that already have a generated ".new." dictionary file.

    Args:
        directory_path: Directory to search. Defaults to ``'dictionary/'``.
        file_suffix: File-name ending that marks a processed dictionary.
            Defaults to ``'.new.xlsx'``.

    Returns:
        list[str]: Upper-cased file names with the directory and
        ``file_suffix`` removed, in the order returned by ``glob.glob``.
        Empty if the directory is missing or has no matching files.
    """
    # glob.escape keeps characters such as '[' in the directory or suffix
    # from being interpreted as wildcards.
    pattern = os.path.join(glob.escape(directory_path), '*' + glob.escape(file_suffix))

    processed_tables = []
    for file_path in glob.glob(pattern):
        # basename + slicing is separator-independent and only ever removes
        # the directory prefix and the trailing suffix, unlike str.replace.
        file_name = os.path.basename(file_path)
        processed_tables.append(file_name[:-len(file_suffix)].upper())

    return processed_tables


In [7]:
### Read the dictionary files for the database
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')

### Number of column descriptions sent to the API in one request
API_BATCH_SIZE = 20
### Seconds to wait between API requests
API_PAUSE_SECONDS = 5

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TIM7020TableName, dbDictionary.TableName))
### Get a list of the tables that have already been processed
processedTables = return_list_of_processed_tables()
###
### For each table, generate the new variable names using OpenAI API
### Write the new variable names to a ".new." dictionary file, 
###  so we're not having to make repeated calls to the API
###
for myTableName, ipedsTableName in tableNames:
    if ipedsTableName.upper() not in processedTables:
        ### Processing the current table
        print(f'myTableName: {myTableName}, ipedsTableName: {ipedsTableName}')
        ### Read table's dictionary file (second sheet of the workbook)
        tableDictionary = pd.read_excel(f'dictionary/{ipedsTableName}.xlsx', sheet_name=1)
        # Create list of column descriptions to be used as input to the API
        # (seventh column of the sheet, selected by position)
        varibleTitles = tableDictionary.iloc[:,6].tolist()
        # Print the number of variables in the list of column names
        print(f'varibleTitles (length): {len(varibleTitles)}')
        # Group the column descriptions into batches of API_BATCH_SIZE to make the API calls
        groups = [varibleTitles[i:i+API_BATCH_SIZE] for i in range(0, len(varibleTitles), API_BATCH_SIZE)]
        varibleTitlesNew = []
        for group in groups:
            groupNames = create_name_from_description(group)
            # The new names are assigned to rows by position, so a batch that
            # comes back short or long would misalign every later row. Stop
            # here rather than paying for the remaining batches first.
            if len(groupNames) != len(group):
                raise ValueError(
                    f'{ipedsTableName}: API returned {len(groupNames)} names '
                    f'for {len(group)} descriptions: {groupNames}'
                )
            varibleTitlesNew.extend(groupNames)
            time.sleep(API_PAUSE_SECONDS)
        # Print the number of variables in the list of new column names
        print(f'varibleTitlesNew (length): {len(varibleTitlesNew)}')
        # Append the new column names to the table's dictionary file
        tableDictionary['varname_new'] = varibleTitlesNew
        # Write the new dictionary file to disk
        tableDictionary.to_excel(f'dictionary/{ipedsTableName.lower()}.new.xlsx', sheet_name='varlist', index=False)


In [None]:
### Read the dictionary file
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TIM7020TableName, dbDictionary.TableName))

for myTableName, ipedsTableName in tableNames:
    print(f'myTableName: {myTableName}, ipedsTableName: {ipedsTableName}')
    ### Read the table's ".new." dictionary file. It holds the original
    ### dictionary columns plus 'varname_new', so every row carries its own
    ### generated name and no further API call or index arithmetic is needed.
    ### (The original called an undefined create_column_names() and indexed a
    ### list built from rows [1:] with the unshifted row index.)
    ### Raises FileNotFoundError if the name-generation cell has not produced
    ### the file for this table yet.
    tableDictionary = pd.read_excel(f'dictionary/{ipedsTableName.lower()}.new.xlsx', sheet_name='varlist')
    # Create list of new column names, one per dictionary row
    newVariableTitles = tableDictionary['varname_new'].tolist()
    ### Read first 100 rows of the data file to verify data types
    tableData = pd.read_csv(f'data/{ipedsTableName}.csv', nrows=100, encoding="utf-8", na_values=['.', ' '])
    ### Collect one "ADD COLUMN" clause per new column
    # NOTE(review): "ADD COLUMN IF NOT EXISTS" is accepted by MariaDB; confirm
    # the target server supports it before executing the statement.
    addColumns = []
    ### Loop through the rows of the table dictionary file
    for newColumnName, (index, row) in zip(newVariableTitles, tableDictionary.iterrows()):
        ## Skip the IPEDS ID
        if row.varnumber == 1:
            continue

        ## If there is an imputation variable, add it to the table
        ## (previously this clause was built and then overwritten before use)
        imputationVar = row.imputationvar
        if isinstance(imputationVar, str) and imputationVar != 'None' and imputationVar.upper().startswith('X'):
            addColumns.append(f'ADD COLUMN IF NOT EXISTS {imputationVar} ENUM({IMPUTATIONVALUES})')

        columnFormat = row['format']
        ## Alpha column: fixed-width text
        if columnFormat == 'Alpha':
            dataType = f'VARCHAR({row.Fieldwidth})'
        ## Continuous column: numeric type chosen from the sample data and width
        elif columnFormat == 'Cont':
            ## Check to see if data type is integer or float
            if tableData[row.varname.upper()].dtype == 'float':
                dataType = 'DOUBLE' if row.Fieldwidth > 8 else 'FLOAT'
            elif row.Fieldwidth > 8:
                dataType = 'BIGINT'
            elif row.Fieldwidth > 4:
                dataType = 'INT'
            elif row.Fieldwidth == 1:
                dataType = 'TINYINT'
            else:
                # Widths 2-4 previously fell through and reused the type of
                # the preceding row; SMALLINT holds any 4-digit value.
                dataType = 'SMALLINT'
            # The dictionary's declared type takes precedence over the inferred one
            if row.DataType == 'float':
                dataType = 'REAL'
        ## Discrete column: ENUM of the values listed in the value-set sheet
        elif columnFormat == 'Disc':
            # NOTE(review): values are matched on varnumber only; confirm that
            # varnumber is unique across tables in the 'valuesets21' sheet.
            decreteValues = dbValues.loc[dbValues['varnumber'] == row.varnumber, 'value'].tolist()
            enumValues = []
            for value in decreteValues:
                if pd.isna(value):
                    continue
                # Whole numbers read as floats (e.g. 1.0) are written as '1'
                if isinstance(value, float) and value.is_integer():
                    value = int(value)
                # Double any single quote so the value is a valid SQL literal
                enumValues.append("'" + str(value).replace("'", "''") + "'")
            # ENUM members must be unique; keep first-seen order
            enumValues = list(dict.fromkeys(enumValues))
            if enumValues:
                dataType = f'ENUM({",".join(enumValues)})'
            else:
                # An empty ENUM() is invalid SQL; fall back to text of the declared width
                dataType = f'VARCHAR({row.Fieldwidth})'
        else:
            # Fail loudly instead of silently reusing the previous row's type
            raise ValueError(f'{ipedsTableName}.{row.varname}: unsupported format {columnFormat!r}')

        addColumns.append(f'ADD COLUMN IF NOT EXISTS {newColumnName} {dataType}')

    ## Join the clauses with commas and terminate with a semicolon
    if addColumns:
        query = f'ALTER TABLE {myTableName} \n' + ',\n'.join(addColumns) + ';'
        print(query)
    else:
        # No columns besides the IPEDS ID: there is nothing to alter
        query = None
        print(f'No columns to add for {myTableName}')


    # ## DROP the table
    # execute_dbquery(f'DROP TABLE IF EXISTS {myTableName};', db=db, cursor=cursor)
    # ## Create the table
    # execute_dbquery(f'CREATE TABLE {myTableName} (ipeds_id INTEGER);', db=db, cursor=cursor)
    # ## Alter the table by adding the columns
    # execute_dbquery(query=query, db=db, cursor=cursor)

    # Preview mode: stop after the first table while the execution above is disabled
    break


In [None]:
# models = openai.Model.list()
# print(models)

In [None]:
# message=[
#         {"role": "system", "content": "You are a helpful database design assistant."},
#         {"role": "user", "content": prompt},
#     ]

# response = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo-16k-0613",
#   messages=message,
#   temperature=0,
#   max_tokens=13223
# )


In [None]:

# content = response.choices[0]['message']['content']
# # print(content)
# # Split the string into lines
# cols = content.strip().splitlines()
# # print(cols)
# # # Split each line at the colon (:) and extract the second part
# # entries = [line.split(":")[1].strip() for line in lines if ":" in line]
# id = ['ipeds_id']
# cols = id + cols[2:] 
# print(cols)