In [None]:
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import mysql.connector as mysql
import pymysql
from sqlalchemy import create_engine
host=os.getenv('MYSQL_SERVER')
database=os.getenv('MYSQL_DATABASE')
user=os.getenv('MYSQL_USER')
password=os.getenv('MYSQL_PASSWORD')

IMPUTATIONVALUES = "'A','B','C','D','G','H','J','K','L','N','P','R','Z',''"

### Functions used within this notebook

In [None]:
def dtype_by_format(format, dtype, width):
    dataType = None
    ###
    ### If the column is an alpha column, set the data type to VARCHAR for the specified field width
    ###
    if format.upper() == 'A':
        dataType = f'VARCHAR({width})'

    ###
    ### If the column is not-alpha, determine the numeric data type and field width
    ###
    else:
        # Check to see if data type is integer or float, but looking at how the data would import
        if dtype == 'float':
            if width > 8:
                dataType = 'DOUBLE'
            else:
                dataType = 'FLOAT'
        else:
            if width > 8:
                dataType = 'BIGINT'
            elif width > 6:
                dataType = 'INT'
            elif width in [5, 6]:
                dataType = 'MEDIUMINT'
            elif width in [3, 4]:
                dataType = 'SMALLINT'
            elif width in [1, 2]:
                dataType = 'TINYINT'
            else:
                dataType = 'INT'
    ###
    ### If the data type is still None, set it to TEXT
    ###
    if dataType == None:
         print(f'**WARNING**: {row.varname_new} has no data type, setting to TEXT')
         dataType = 'TEXT'
    ###
    ### Return the data type
    ###
    return dataType

In [None]:
def dtype_by_datatype(dtype, length):
    if dtype == 'int64':
        return 'INTEGER'
    elif dtype == 'float64':
        return 'REAL'
    elif dtype == 'object':
        return 'TEXT'
    else:
        return 'TEXT'

In [None]:
def write_sql_file(sql_file, sql_query):
    with open(f'sql/{sql_file}.sql', 'w') as f:
        f.write(sql_query)

In [None]:
def create_dbengine():
    ### Create SQLAlchmey engine
    # Create the engine to connect to the MySQL database
    connect_args={'ssl':{'fake_flag_to_enable_tls': True}}
    return create_engine(f'mysql+pymysql://{user}:{password}@{host}:3306/{database}', connect_args=connect_args)

In [None]:
def create_dbconnection():
    try:
        db = mysql.connect(host=host, user=user, password=password, database=database)
        return db, db.cursor()
    except mysql.Error as e:
        if e.errno == mysql.ER_ACCESS_DENIED_ERROR:
            print("Something is wrong with your user name or password")
            return None
        elif e.errno == mysql.ER_BAD_DB_ERROR:
            print("Database does not exist")
            return None
        else:
            print(e)
            return None
    else:
        return None

In [None]:
def execute_dbquery(query, db=None, cursor=None):
    # If no db or cursor is provided, connect to the database
    if db is None or cursor is None:
        db, cursor = create_dbconnection()

    try:
        cursor.execute(query)
        db.commit()
        # time.sleep(5)
    except mysql.Error as e:
        print(f"Failed creating database with query: {query} - Error: {e}")
        exit(1)


In [None]:
###
### Create column names from the column descriptions using GPT-3.5-turbo OpenAI API
###
def create_name_from_description(titles):
    titleList = ""
    for title in titles:
        title = title.replace(",", "").replace("'", "").replace(".", "").lower()
        titleList += f'[{title}],'
    message = {"role": "user", "content": f"{titleList}"}
    messages = [
        {
        "role": "system",
        "content": """
            Produce a list of readable database column names using the list of Column Descriptions provided.
            Column Names should NOT start with a number, be MySQL reserved word or Python keyword. 
            All special characters are to be replaced with an underscore and the Column Name should be all lowercase. 
            Make sure Column Names are readable, descriptive, concise, consistent, unique, and less than 30 characters including underscores.
            Return one entry for each Column Name submitted by the user.
            DO NOT return a numbered list.
            Substitute common words in the Column Descriptions with abbreviations such as:
                'inst' replaces 'institution'
                'id' replaces 'identification'
                'class' replaces 'classification'
                'cd' replaces 'code'
                'org' replaces 'organization'
                'loc' replaces 'location'
                'cat' replaces 'category'
                'url' replaces 'web address'
                'url' replaces 'website address'
                'rpt' replaces 'report'
            Returned Column Name(s) should be in square brackets."""
        },
        {
        "role": "user",
        "content": "[Unique identification number of the institution]"
        },
        {
        "role": "assistant",
        "content": "[uid]"
        },
        {
        "role": "user",
        "content": "[Institution (entity) name], [Institution name alias]"
        },
        {
        "role": "assistant",
        "content": "[inst_name], [inst_alias]"
        },
        {
        "role": "user",
        "content": """[Title of chief administrator], 
                [Disability Services Web Address], 
                [Institution's internet web address], 
                [Office of Postsecondary Education (OPE) ID Number], 
                [Sector of institution]"""
        },
        {
        "role": "assistant",
        "content": "[title_chief_administrator], [disability_url], [inst_url], [ope_id], [sector]"
        }
    ]
    messages.append(message)

    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=1024,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0]['message']['content'].replace("[", "").replace("]", "").replace(" ", "").split(",")



In [None]:
###
### Get list of dictionary files already processed through OpenAI API
###   - This is to avoid re-processing the same files again
###
def return_list_of_processed_tables():
    directory_path = 'dictionary/'
    file_pattern = '*.new.xlsx'

    files_list = glob.glob(directory_path + '/' + file_pattern)
    new_files_list = []
    for each_file in files_list:
        new_files_list.append(each_file.replace(directory_path, '').replace('.new.xlsx', '').upper())

    return new_files_list


### Remove and Create the /sql directory

In [None]:
directory = "sql"

# Remove the directory if it exists
if os.path.exists(directory):
    shutil.rmtree(directory)

# Create the directory
os.mkdir(directory)

### DROP & CREATE database

In [None]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
dropDBQuery = "DROP DATABASE IF EXISTS tim7020;"
createDBQuery = "CREATE DATABASE tim7020;"
execute_dbquery(query=dropDBQuery, db=db, cursor=cursor)
execute_dbquery(query=createDBQuery, db=db, cursor=cursor)

### Create new table names using OpenAI API using the column description

In [None]:
### 
### Process the IPEDS data tables and create new column names using OpenAI API
###

### Read the dictionary files for the database
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))
### Get a list of the tables that have already been processed
processedTables = return_list_of_processed_tables()
###
### For each table, generate the new variable names using OpenAI API
### Write the new variable names to a ".new." dictionary file, 
###  so we're not having to make repeated calls to the API
###
for oldTableName, newTableName in tableNames:
    if oldTableName.upper() not in processedTables:
        ### Record the table we are working on
        print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
        ### Read table's dictionary file
        tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.xlsx', sheet_name=1)
        # Create list of column descriptions to be used as input to the API
        varibleTitles = tableDictionary.iloc[:,6].tolist()
        # Print the number of variables in the list of column names
        print(f'varibleTitles (length): {len(varibleTitles)}')
        # Group the column names into groups of 30 to make the API calls
        groups = [varibleTitles[i:i+20] for i in range(0, len(varibleTitles), 20)]
        varibleTitlesNew = []
        for group in groups:
            varibleTitlesNew.extend(create_name_from_description(group))
            time.sleep(5)
        # Print the number of variables in the list of new column names
        print(f'varibleTitlesNew (length): {len(varibleTitlesNew)}')
        # Append the new column names to the table's dictionary file
        tableDictionary['varname_new'] = varibleTitlesNew
        # Write the new dictionary file to disk
        tableDictionary.to_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist', index=False)


### Build CREATE and ALTER TABLE queries for the database

In [None]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
###
### Read the dictionary file for the database
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    # Create list of column descriptions less the first row (IPEDS ID)
    variableNames = tableDictionary['varname_new'].tolist()
    ### Read first 100 rows of the data file to verify data types
    tableData = pd.read_csv(f'data/{oldTableName}.csv', nrows=100, encoding="utf-8", na_values=['.', '. ', ' '])
    ### Create string for the query
    createTableQuery = None
    alterTableQuery = f'ALTER TABLE {newTableName} \n'
    ###
    ### Loop through the rows of the table dictionary file
    ###
    for index, row in tableDictionary.iterrows():
        ### Setup the column creation variables
        addColumn = None
        addImputed = None
        addCheck = None
        dataType = None
        ### Skip the first row (inst_id), we'll create the table with this common column
        if index == 0:
            createTableQuery = f'CREATE TABLE {newTableName} ({row.varname_new} INTEGER);'
            continue

        ### If there is an imputation variable, add the column to the table
        if (isinstance(row.imputationvar, str) == True) and (row.imputationvar != 'None' and row.imputationvar[0].upper() == 'X'):
            ###
            ### Create the ADD COLUMN line for the imputed flag column to be added to the ALTER TABLE query
            ### 
            addImputed = f'  ADD COLUMN {row.varname_new}_imp ENUM({IMPUTATIONVALUES}) COMMENT "{row.varname}|{row.varTitle}",'
        ###
        ### Determine the data type to use for the field based on the metadata
        ###
        dtype = ''
        if tableData[row.varname.upper()].dtype == 'float':
            dtype = 'float'
        dataType = dtype_by_format(row.DataType, dtype, row.Fieldwidth)
        ###
        ### Create the ADD COLUMN line to be added to the ALTER TABLE query
        ###    
        addColumn = f'    ADD COLUMN {row.varname_new} {dataType} COMMENT "{row.varname}|{row.varTitle}",'
        ###
        ### Build the ALTER TABLE query for the current column
        ###
        addToQuery = ''
        if addImputed != None:
            addToQuery = addImputed + '\n'
        if addColumn != None:
            addToQuery += addColumn + '\n'
        alterTableQuery += addToQuery

        ### Continue for the next row in the table dictionary file
        continue
    ###
    ### Remove the last comma and space, add a semicolon
    ###
    alterTableQuery = alterTableQuery[:-2] + ';'

    ###
    ### Write the DROP, CREATE, ALTER TABLE queries to a SQL script file
    ###
    dropTableQuery = f'DROP TABLE IF EXISTS {newTableName};'
    write_sql_file(f'{newTableName}.1.drop table', dropTableQuery)
    write_sql_file(f'{newTableName}.2.create table', createTableQuery)
    write_sql_file(f'{newTableName}.3.add columns', alterTableQuery)

    ###
    ### Execute the DROP, CREATE, and ALTER the table SQL queries
    ###
    execute_dbquery(query=dropTableQuery, db=db, cursor=cursor)
    execute_dbquery(query=createTableQuery, db=db, cursor=cursor)
    execute_dbquery(query=alterTableQuery, db=db, cursor=cursor)

    loopCount += 1
    if loopCount > 2:
        break


### Upload CSV files into database tables

In [None]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
engine = create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    ### Create a dictionary of old column names and new column names
    columnNames = dict(zip(tableDictionary['varname'], tableDictionary['varname_new']))
    impColumnNames = tableDictionary[tableDictionary.imputationvar.str.len() > 2][['imputationvar', 'varname_new']]
    impColumnNames.varname_new = impColumnNames.varname_new + '_imp'
    impColumnNames = dict(zip(impColumnNames['imputationvar'], impColumnNames['varname_new']))
    tableData = pd.read_csv(f'data/{oldTableName.lower()}.csv', encoding='latin1', na_values=['.', '. ', ' '])
    ### Rename the columns  
    tableData = tableData.rename(columns=columnNames)
    tableData = tableData.rename(columns=impColumnNames)
    ### Write data to database
    tableData.to_sql(name=newTableName, con=engine, if_exists='append', index=False)

    loopCount += 1
    if loopCount > 2:
        break


### Set PK constraints

In [None]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
engine = create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    ### Create a list of columns with a PK designation
    pkColumns = list(tableDictionary[tableDictionary['imputationvar'] == 'PK']['varname_new'])
    addPKQuery = f'ALTER TABLE {newTableName}\n    ADD PRIMARY KEY ({", ".join(pkColumns)});'

    ###
    ### Write the ALTER TABLE query to a SQL script file
    ###
    write_sql_file(f'{newTableName}.4.add PK', addPKQuery)

    ###
    ### Execute the ALTER TABLE SQL query
    ###
    execute_dbquery(query=addPKQuery, db=db, cursor=cursor)

    loopCount += 1
    if loopCount > 2:
        break

### Create reference tables for discrete table columns

In [None]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
engine = create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Skip the IC2021_CAMPUSES table, as it uses reference tables from other tables
    if oldTableName == 'IC2021_CAMPUSES':
        continue

    ### Read the reference values for the table
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist')
    tableData = pd.read_csv(f'data/{oldTableName.lower()}.csv', nrows=100, encoding="utf-8", na_values=['.', '. ', ' '])
    ### Filter tableDictionary to only the rows format is equal to Disc, select only varname and varname_new columns
    tableDictionary = tableDictionary[tableDictionary.format == 'Disc'].sort_values(by=['varname'])

    allRefValues = dbRefValues[dbRefValues.TableName == oldTableName]
    allRefValues = allRefValues.sort_values(by=['varName', 'valueOrder'])

    for _, row in tableDictionary.iterrows():
        # Print which colomn are we working on
        # print(f'{row.DataType}, {row.Fieldwidth}, {row.varname}, {row.varname_new}')

        # filter allRefValues to only the rows where varName is equal to the current row varname
        refValues = allRefValues[allRefValues.varName == row.varname]
        ## if refValues is empty, continue to the next row
        if refValues.empty:
            # print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
            print(f'No refValues for {row.varname}, {row.varname_new}')
            continue
        
        ### Create DROP TABLE query
        dropRefTableQuery = f'DROP TABLE IF EXISTS institution_xref_{row.varname_new};'
        ### CREATE CREATE TABLE query
        # Determine the data type of the column
        dtype = ''
        if tableData[row.varname.upper()].dtype == 'float':
            dtype = 'float'
        dataType = dtype_by_format(row.DataType, dtype, row.Fieldwidth)
        # Build the CREATE TABLE query text
        createRefTableQuery = f'''
            CREATE TABLE institution_xref_{row.varname_new} (
                Codevalue {dataType},
                valueLabel VARCHAR(255),
                valueOrder SMALLINT UNSIGNED,
                PRIMARY KEY (Codevalue)
            );'''
        ###
        ### Write the DROP, CREATE, ALTER TABLE queries to a SQL script file
        ###
        write_sql_file(f'institution_xref_{row.varname_new}.1.drop table', dropRefTableQuery)
        write_sql_file(f'institution_xref_{row.varname_new}.2.create table', createRefTableQuery)
        ###
        ### Execute the DROP, CREATE and INSERT queries
        ###
        execute_dbquery(query=dropRefTableQuery, db=db, cursor=cursor)
        execute_dbquery(query=createRefTableQuery, db=db, cursor=cursor)
        refValues = refValues[['Codevalue', 'valueLabel', 'valueOrder']]
        refValues.to_sql(name=f'institution_xref_{row.varname_new}', con=engine, if_exists='append', index=False)   

    loopCount += 1
    if loopCount > 2:
        break

### Create FK contraints

In [157]:
### Connect to database and create cursor
db, cursor = create_dbconnection()
engine = create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
###
### Create a list of child tables to apply foreign key constraints
###
dbDictionary = dbDictionary[dbDictionary.TIM7020TableName != 'institution']

loopCount = 0

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))
### 
### For each table, read the dictionary file and build the ALTER TABLE queries 
###     to create the foreign keys constraints to the 'institution' parent table or
###     the 'institution_xref_{column name}' reference table
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist')
    tableDictionary = tableDictionary[tableDictionary.format == 'Disc']

    alterTableQuery = f'''
        ALTER TABLE {newTableName} 
        ADD CONSTRAINT fk_to_institution
            FOREIGN KEY (inst_id) REFERENCES institution(inst_id)'''
    
    for _, row in tableDictionary.iterrows():
        alterTableQuery += f''', 
        ADD CONSTRAINT fk__xref_{row['varname_new']}
            FOREIGN KEY ({row['varname_new']}) REFERENCES institution_xref_{newTableName}(Codevalue)'''

    alterTableQuery += ';'
    ###
    ### Write the ALTER Table FK query to a file
    ###
    write_sql_file(f'{newTableName}.5.add FK', alterTableQuery)
    ###
    ### Execute the ALTER Table FK query
    ###
    execute_dbquery(query=alterTableQuery, db=db, cursor=cursor)

    loopCount += 1
    if loopCount > 1:
        break

oldTableName: FLAGS2021, newTableName: institution_ic_response_flags
Failed creating database with query: 
        ALTER TABLE institution_ic_response_flags 
        ADD CONSTRAINT fk__to_institution
            FOREIGN KEY (inst_id) REFERENCES institution(inst_id), 
        ADD CONSTRAINT fk__to_institution_xref_ic_status
            FOREIGN KEY (ic_status) REFERENCES institution_xref_institution_ic_response_flags(Codevalue), 
        ADD CONSTRAINT fk__to_institution_xref_ic_migration_status
            FOREIGN KEY (ic_migration_status) REFERENCES institution_xref_institution_ic_response_flags(Codevalue), 
        ADD CONSTRAINT fk__to_institution_xref_ic_imputation_method
            FOREIGN KEY (ic_imputation_method) REFERENCES institution_xref_institution_ic_response_flags(Codevalue), 
        ADD CONSTRAINT fk__to_institution_xref_completions_status
            FOREIGN KEY (completions_status) REFERENCES institution_xref_institution_ic_response_flags(Codevalue), 
        ADD CONS