In [10]:
import os
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import mysql.connector as mysql
import pymysql
from sqlalchemy import create_engine
# MySQL connection settings, read from environment variables. os.getenv
# returns None for any variable that is not set, so a missing setting only
# surfaces later, when a connection is attempted.
host=os.getenv('MYSQL_SERVER')
database=os.getenv('MYSQL_DATABASE')
user=os.getenv('MYSQL_USER')
password=os.getenv('MYSQL_PASSWORD')

# Pre-quoted, comma-separated SQL literals that are interpolated verbatim into
# the ENUM(...) definition of the "<column>_imp" imputation-flag columns.
# NOTE(review): presumably these are the IPEDS imputation codes — confirm
# against the IPEDS documentation before changing the list.
IMPUTATIONVALUES = "'A','B','C','D','G','H','J','K','L','N','P','R','Z',''"

In [11]:
def create_dbengine():
    """Create the SQLAlchemy engine used to talk to the MySQL database.

    The connection settings come from the module-level ``user``, ``password``,
    ``host`` and ``database`` values; the port is fixed at 3306 and the
    ``pymysql`` driver is used.

    Returns:
        The object returned by ``create_engine`` for the assembled URL.
    """
    # Function-scope import so the cell does not depend on another import cell.
    from urllib.parse import quote_plus

    # NOTE(review): presumably pymysql enables TLS as soon as the ``ssl``
    # mapping is non-empty and the key itself is only a placeholder — confirm
    # against the pymysql documentation.
    connect_args = {'ssl': {'fake_flag_to_enable_tls': True}}

    # Percent-encode the credentials: characters such as '@', ':' or '/' in a
    # user name or password would otherwise be parsed as URL delimiters.
    # str() keeps an unset (None) setting rendering as 'None', as before.
    safe_user = quote_plus(str(user))
    safe_password = quote_plus(str(password))

    return create_engine(
        f'mysql+pymysql://{safe_user}:{safe_password}@{host}:3306/{database}',
        connect_args=connect_args,
    )

In [12]:
def create_dbconnection():
    """Open a MySQL connection using the module-level connection settings.

    Returns:
        A ``(db, cursor)`` tuple on success, or ``None`` when the connection
        attempt raises ``mysql.Error`` (a short diagnostic is printed first).
        Callers that unpack the result must check for ``None``.
    """
    # The ER_* constants are provided by the ``errorcode`` submodule; import it
    # explicitly instead of looking the constants up on the package alias.
    from mysql.connector import errorcode

    try:
        db = mysql.connect(host=host, user=user, password=password, database=database)
        return db, db.cursor()
    except mysql.Error as e:
        if e.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
        elif e.errno == errorcode.ER_BAD_DB_ERROR:
            print("Database does not exist")
        else:
            print(e)
        return None

In [13]:
def execute_dbquery(query, db=None, cursor=None):
    """Execute one SQL statement and commit it.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        db: Open connection to commit on. When ``db`` or ``cursor`` is missing,
            a fresh connection is opened with ``create_dbconnection`` and
            closed again before returning.
        cursor: Cursor belonging to ``db``.

    Raises:
        SystemExit: With code 1 when no connection can be obtained or the
            statement fails with ``mysql.Error``; the failure is printed first.
    """
    # Only a connection opened here is ours to close.
    owns_connection = db is None or cursor is None
    if owns_connection:
        connection = create_dbconnection()
        if connection is None:
            print(f"Failed executing query: {query} - Error: no database connection")
            raise SystemExit(1)
        db, cursor = connection

    try:
        cursor.execute(query)
        db.commit()
    except mysql.Error as e:
        print(f"Failed executing query: {query} - Error: {e}")
        # Raise SystemExit directly: inside a Jupyter kernel the ``exit`` name
        # is IPython's exit helper rather than the ``site`` builtin, so calling
        # it is not a dependable way to stop the running cell.
        raise SystemExit(1) from e
    finally:
        if owns_connection:
            cursor.close()
            db.close()


In [14]:
def create_name_from_description(titles):
    """Ask the OpenAI chat API for database column names for the given titles.

    Each title is converted to text, stripped of commas, apostrophes and
    periods, lower-cased and wrapped in square brackets before being sent as
    the final user message after a fixed system prompt and three worked
    examples.

    Args:
        titles: Iterable of column descriptions. Non-string entries (for
            example NaN from an empty spreadsheet cell) are converted with
            ``str`` instead of raising.

    Returns:
        A list of column names parsed from the reply, with all whitespace
        removed. The reply is produced by the model, so the list is not
        guaranteed to have one entry per title; callers should check the
        length. An empty ``titles`` returns ``[]`` without calling the API.
    """
    # Function-scope import: only the reply parsing below needs it.
    import re

    cleaned = [
        str(title).replace(",", "").replace("'", "").replace(".", "").lower()
        for title in titles
    ]
    if not cleaned:
        return []

    # Same wire format as before: "[title]," for every title.
    titleList = "".join(f'[{title}],' for title in cleaned)
    message = {"role": "user", "content": titleList}
    messages = [
        {
        "role": "system",
        "content": """
            Produce a list of readable database column names using the list of Column Descriptions provided.
            Column Names should NOT start with a number, be MySQL reserved word or Python keyword. 
            All special characters are to be replaced with an underscore and the Column Name should be all lowercase. 
            Make sure Column Names are readable, descriptive, concise, consistent, unique, and less than 30 characters including underscores.
            Return one entry for each Column Name submitted by the user.
            DO NOT return a numbered list.
            Substitute common words in the Column Descriptions with abbreviations such as:
                'inst' replaces 'institution'
                'id' replaces 'identification'
                'class' replaces 'classification'
                'cd' replaces 'code'
                'org' replaces 'organization'
                'loc' replaces 'location'
                'cat' replaces 'category'
                'url' replaces 'web address'
                'url' replaces 'website address'
                'rpt' replaces 'report'
            Returned Column Name(s) should be in square brackets."""
        },
        {
        "role": "user",
        "content": "[Unique identification number of the institution]"
        },
        {
        "role": "assistant",
        "content": "[uid]"
        },
        {
        "role": "user",
        "content": "[Institution (entity) name], [Institution name alias]"
        },
        {
        "role": "assistant",
        "content": "[inst_name], [inst_alias]"
        },
        {
        "role": "user",
        "content": """[Title of chief administrator], 
                [Disability Services Web Address], 
                [Institution's internet web address], 
                [Office of Postsecondary Education (OPE) ID Number], 
                [Sector of institution]"""
        },
        {
        "role": "assistant",
        "content": "[title_chief_administrator], [disability_url], [inst_url], [ope_id], [sector]"
        }
    ]
    messages.append(message)

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    content = response.choices[0]['message']['content']

    # Preferred path: take exactly what is inside each [...] pair, so line
    # breaks between entries or a trailing comma cannot leak into the names.
    bracketed = re.findall(r'\[([^\[\]]*)\]', content)
    if bracketed:
        return ["".join(name.split()) for name in bracketed]

    # Fallback for a reply without brackets: the original comma split.
    return content.replace("[", "").replace("]", "").replace(" ", "").split(",")



In [15]:
def return_list_of_processed_tables():
    """List the tables that already have a generated ``.new.xlsx`` dictionary.

    Looks for ``*.new.xlsx`` files in the ``dictionary/`` directory (relative
    to the current working directory).

    Returns:
        A list with one upper-cased table name per matching file, i.e. the
        file name without its directory and without the ``.new.xlsx`` suffix.
        The list is empty when the directory is missing or has no matches.
    """
    directory_path = 'dictionary/'
    suffix = '.new.xlsx'

    matches = glob.glob(os.path.join(directory_path, '*' + suffix))
    # Take the base name instead of text-replacing 'dictionary/': glob returns
    # paths with the platform separator, which is not '/' on Windows.
    return [os.path.basename(path)[:-len(suffix)].upper() for path in matches]


In [16]:
###
### Process the IPEDS data tables and create new column names using OpenAI API
###
# Number of column descriptions sent to the API in one request.
TITLES_PER_REQUEST = 20
# Seconds to wait after each API request.
API_PAUSE_SECONDS = 5

### Read the dictionary files for the database
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
# NOTE(review): dbValues is not used in this cell; kept in case another cell reads it.
dbValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TIM7020TableName, dbDictionary.TableName))
### Get a list of the tables that have already been processed
processedTables = return_list_of_processed_tables()
###
### For each table, generate the new variable names using OpenAI API
### Write the new variable names to a ".new." dictionary file,
###  so we're not having to make repeated calls to the API
###
for newTableName, ipedsTableName in tableNames:
    # Tables that already have a ".new." dictionary are not sent to the API again
    if ipedsTableName.upper() in processedTables:
        continue

    ### Processing the current table
    print(f'newTableName: {newTableName}, ipedsTableName: {ipedsTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{ipedsTableName}.xlsx', sheet_name=1)
    # Column descriptions used as input to the API (seventh column of the sheet)
    varibleTitles = tableDictionary.iloc[:,6].tolist()
    print(f'varibleTitles (length): {len(varibleTitles)}')

    # Send the descriptions in batches, pausing after each request
    varibleTitlesNew = []
    for start in range(0, len(varibleTitles), TITLES_PER_REQUEST):
        group = varibleTitles[start:start + TITLES_PER_REQUEST]
        varibleTitlesNew.extend(create_name_from_description(group))
        time.sleep(API_PAUSE_SECONDS)
    print(f'varibleTitlesNew (length): {len(varibleTitlesNew)}')

    # The API reply is free text, so it may not contain one name per
    # description. Fail with a message that names the table instead of letting
    # the column assignment below raise a generic length error.
    if len(varibleTitlesNew) != len(varibleTitles):
        raise ValueError(
            f'{ipedsTableName}: received {len(varibleTitlesNew)} column names '
            f'for {len(varibleTitles)} column descriptions'
        )

    # Append the new column names to the table's dictionary
    tableDictionary['varname_new'] = varibleTitlesNew
    # Write the new dictionary file to disk (lower-cased file name)
    tableDictionary.to_excel(f'dictionary/{ipedsTableName.lower()}.new.xlsx', sheet_name='varlist', index=False)


In [17]:
def _sql_quote(text, quote):
    """Wrap text in the given quote character, doubling embedded quote characters."""
    text = str(text)
    return quote + text.replace(quote, quote * 2) + quote


### Connect to database and create cursor
connection = create_dbconnection()
if connection is None:
    # create_dbconnection has already printed the reason
    raise RuntimeError('Could not connect to the MySQL database')
db, cursor = connection
###
### Read the dictionary file for the database
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TIM7020TableName, dbDictionary.TableName))

###
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files.
### The dictionary file is used to determine the data type of the columns, to apply the
###   new human readable column names and to add the column description as column comments
###
for newTableName, ipedsTableName in tableNames:
    print(f'newTableName: {newTableName}, ipedsTableName: {ipedsTableName}')
    ### Read table's ".new." dictionary file; it is written with a lower-cased
    ### file name, so it is read back the same way (matters on case-sensitive
    ### file systems)
    tableDictionary = pd.read_excel(f'dictionary/{ipedsTableName.lower()}.new.xlsx', sheet_name='varlist')
    # New column names, in dictionary order (not used further in this cell)
    variableNames = tableDictionary['varname_new'].tolist()
    ### Read first 100 rows of the data file to verify data types
    tableData = pd.read_csv(f'data/{ipedsTableName}.csv', nrows=100, encoding="utf-8", na_values=['.', ' '])
    ### Pieces of the queries
    createTableQuery = None
    alterClauses = []
    ###
    ### Loop through the rows of the table dictionary file
    ###
    for index, row in tableDictionary.iterrows():
        ### Setup the column creation variables
        addColumn = None
        addImputed = None
        addCheck = None
        dataType = None
        ### The first row becomes the single column of the CREATE TABLE statement
        if index == 0:
            createTableQuery = f'CREATE TABLE {newTableName} ({row.varname_new} INTEGER);'
            continue

        ### Column comment, with embedded double quotes doubled so a title
        ### containing a quote cannot terminate the SQL string early
        columnComment = _sql_quote(f'{row.varname}|{row.varTitle}', '"')

        ### If there is an imputation variable (name starting with X), add a flag column.
        ### startswith() also copes with an empty string, where indexing [0] would raise.
        if isinstance(row.imputationvar, str) and row.imputationvar.upper().startswith('X'):
            addImputed = f'  ADD COLUMN {row.varname_new}_imp ENUM({IMPUTATIONVALUES}) COMMENT {columnComment}'

        ### Determine the data type to use for the field based on the metadata
        ###
        ### Alpha column: VARCHAR of the specified field width
        ###
        if (row.format == 'Alpha'):
            dataType = f'VARCHAR({row.Fieldwidth})'

        ###
        ### Continuous column: pick a numeric type from how the data imports
        ###  and from the field width
        ###
        elif (row.format == 'Cont'):
            if tableData[row.varname.upper()].dtype == 'float':
                if row.Fieldwidth > 8:
                    dataType = 'DOUBLE'
                else:
                    dataType = 'FLOAT'
            else:
                if row.Fieldwidth > 8:
                    dataType = 'BIGINT'
                elif row.Fieldwidth > 6:
                    dataType = 'INT'
                elif row.Fieldwidth in [5, 6]:
                    dataType = 'MEDIUMINT'
                elif row.Fieldwidth in [3, 4]:
                    dataType = 'SMALLINT'
                elif row.Fieldwidth in [1, 2]:
                    dataType = 'TINYINT'
        ###
        ### Discrete column: add a CHECK constraint for short value lists;
        ###  longer lists are meant to get a reference table (not implemented)
        ###
        elif (row.format == 'Disc'):
            ### Get the list of discrete values from the metadata spreadsheet
            decreteValues = dbRefValues.loc[dbRefValues['varNumber'] == row.varnumber, 'Codevalue'].dropna().tolist()
            ### Six or more discrete values: the column is skipped entirely
            if len(decreteValues) >= 6:
                ## TO DO: Create reference table query when there are 6 or more discrete values
                continue
            ### Fewer than six values: build the CHECK constraint. Values are
            ### converted to text, and quoted when the column holds text, so
            ### the IN (...) list is valid SQL. No values means no constraint.
            if decreteValues:
                if row.DataType == 'A':
                    checkLiterals = [_sql_quote(value, "'") for value in decreteValues]
                else:
                    checkLiterals = [str(value) for value in decreteValues]
                addCheck = f'      ADD CHECK ({row.varname_new} IN ({", ".join(checkLiterals)}))'
            ###
            ### Set data type for the discrete column
            ###
            if (row.DataType == 'A'):
                dataType = f'VARCHAR({row.Fieldwidth})'
            elif tableData[row.varname.upper()].dtype == 'float':
                if row.Fieldwidth > 8:
                    dataType = 'DOUBLE'
                else:
                    dataType = 'FLOAT'
            else:
                if row.Fieldwidth > 8:
                    dataType = 'BIGINT'
                elif row.Fieldwidth > 1:
                    dataType = 'INT'
                else:
                    dataType = 'TINYINT'
        ###
        ### If the data type is still None, set it to TEXT
        ###
        if dataType is None:
            print(f'**WARNING**: {row.varname_new} has no data type, setting to TEXT')
            dataType = 'TEXT'

        ###
        ### Create the ADD COLUMN line to be added to the ALTER TABLE query
        ###
        addColumn = f'    ADD COLUMN {row.varname_new} {dataType} COMMENT {columnComment}'

        ###
        ### Collect the clauses for the current column: flag column, column, constraint
        ###
        for clause in (addImputed, addColumn, addCheck):
            if clause is not None:
                alterClauses.append(clause)

    ### A dictionary without rows gives nothing to create; stop before dropping the table
    if createTableQuery is None:
        raise ValueError(f'dictionary for {ipedsTableName} has no rows')

    ##
    ## Join the clauses with commas and end the statement with a semicolon.
    ## With no clauses there is no ALTER statement to run.
    ##
    alterTableQuery = None
    if alterClauses:
        alterTableQuery = f'ALTER TABLE {newTableName} \n' + ',\n'.join(alterClauses) + ';'

    print(f'\n{createTableQuery}\n')
    if alterTableQuery is not None:
        print(f'\n{alterTableQuery}\n')

    ### DROP the table
    execute_dbquery(f'DROP TABLE IF EXISTS {newTableName};', db=db, cursor=cursor)
    ### Create the table
    execute_dbquery(query=createTableQuery, db=db, cursor=cursor)
    ### Alter the table by adding the columns
    if alterTableQuery is not None:
        execute_dbquery(query=alterTableQuery, db=db, cursor=cursor)

    # NOTE(review): stops after the first three tables — looks like a
    # development limit; confirm before running against the full table list.
    loopCount += 1
    if loopCount > 2:
        break


newTableName: inst_ic_directory, ipedsTableName: HD2021

CREATE TABLE inst_ic_directory (inst_id INTEGER);


ALTER TABLE inst_ic_directory 
    ADD COLUMN inst_name VARCHAR(120) COMMENT "INSTNM|Institution (entity) name",
    ADD COLUMN inst_alias VARCHAR(2000) COMMENT "IALIAS|Institution name alias",
    ADD COLUMN address VARCHAR(100) COMMENT "ADDR|Street address or post office box",
    ADD COLUMN city VARCHAR(30) COMMENT "CITY|City location of institution",
    ADD COLUMN zip_code VARCHAR(10) COMMENT "ZIP|ZIP code",
    ADD COLUMN chief_admin_name VARCHAR(50) COMMENT "CHFNM|Name of chief administrator",
    ADD COLUMN chief_admin_title VARCHAR(50) COMMENT "CHFTITLE|Title of chief administrator",
    ADD COLUMN phone_number VARCHAR(15) COMMENT "GENTELE|General information telephone number",
    ADD COLUMN employer_id BIGINT COMMENT "EIN|Employer Identification Number",
    ADD COLUMN dun_bradstreet_numbers VARCHAR(2000) COMMENT "DUNS|Dun and Bradstreet numbers",
    ADD COLUMN ope_i

In [18]:
### Connect to database and create cursor
# NOTE(review): db and cursor are not used by this cell (the load goes through
# the SQLAlchemy engine); kept in case another cell relies on them.
db, cursor = create_dbconnection()
engine = create_dbengine()
###
### Read the dictionary file for the database
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TIM7020TableName, dbDictionary.TableName))

###
### For each table, read its ".new." dictionary file, rename the columns of the
### CSV data file to the new column names and write the data to the database
###
for newTableName, ipedsTableName in tableNames:
    print(f'newTableName: {newTableName}, ipedsTableName: {ipedsTableName}')
    ### Read table's ".new." dictionary file; it is written with a lower-cased
    ### file name, so it is read back the same way (matters on case-sensitive
    ### file systems)
    tableDictionary = pd.read_excel(f'dictionary/{ipedsTableName.lower()}.new.xlsx', sheet_name='varlist')
    ### Map old column names to new column names. Keys are normalised (trimmed,
    ### upper-cased) so the lookup does not depend on how the dictionary and the
    ### CSV header happen to be cased.
    columnNames = {
        str(oldName).strip().upper(): newName
        for oldName, newName in zip(tableDictionary['varname'], tableDictionary['varname_new'])
    }
    ### Read in the CSV data file
    tableData = pd.read_csv(f'data/{ipedsTableName.lower()}.csv', encoding='latin-1')
    ### Rename the columns; columns without a dictionary entry keep their name
    tableData = tableData.rename(
        columns=lambda name: columnNames.get(str(name).strip().upper(), name)
    )
    ### Write data to database
    # NOTE(review): if_exists='replace' drops any existing table of this name
    # before writing, so column types, comments and CHECK constraints defined
    # by hand-written DDL for the same table are lost — confirm this is intended.
    tableData.to_sql(name=newTableName, con=engine, if_exists='replace', index=False)

    # NOTE(review): stops after the first three tables — looks like a
    # development limit; confirm before running against the full table list.
    loopCount += 1
    if loopCount > 2:
        break


newTableName: inst_ic_directory, ipedsTableName: HD2021
newTableName: inst_ic_response_flags, ipedsTableName: FLAGS2021
newTableName: inst_ic_offerings, ipedsTableName: IC2021
