## Week 7 - Institution to Congress Database creation

[GitHub Repo Link](https://github.com/davidatorres/TIM7020/tree/main/Week7)

### Perform normal library imports

In [12]:
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

from dotenv import load_dotenv
load_dotenv()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import glob

import openai
openai.api_key = os.getenv("OPENAI_API_KEY")

import mysql.connector as mysql
import pymysql
from sqlalchemy import create_engine
host=os.getenv('MYSQL_SERVER')
database=os.getenv('MYSQL_DATABASE')
user=os.getenv('MYSQL_USER')
password=os.getenv('MYSQL_PASSWORD')

import internal_functions as fn

SCHEMA = 'week6'
IMPUTATIONVALUES = "'A','B','C','D','G','H','J','K','L','N','P','R','Y','Z',''"

### Remove and Create the generated /sql directory

In [13]:
directory = "sql"

# Remove the directory if it exists
if os.path.exists(directory):
    shutil.rmtree(directory)

# Create the directory
os.mkdir(directory)

### DROP & CREATE the database

In [14]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
dropDBQuery = f"DROP SCHEMA IF EXISTS {SCHEMA};"
createDBQuery = f"CREATE SCHEMA {SCHEMA};"
fn.execute_dbquery(query=dropDBQuery, db=db, cursor=cursor)
fn.execute_dbquery(query=createDBQuery, db=db, cursor=cursor)

### Create intutitive table names using OpenAI API and the column's description

In [15]:
### 
### Process the IPEDS data tables and create new column names using OpenAI API
###

### Read the dictionary files for the database
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))
### Get a list of the tables that have already been processed
processedTables = fn.return_list_of_processed_tables()
###
### For each table, generate the new variable names using OpenAI API
### Write the new variable names to a ".new." dictionary file, 
###  so we're not having to make repeated calls to the API
###
for oldTableName, newTableName in tableNames:
    if oldTableName.upper() not in processedTables:
        ### Record the table we are working on
        ##print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
        ### Read table's dictionary file
        tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.xlsx', sheet_name=1)
        # Create list of column descriptions to be used as input to the API
        varibleTitles = tableDictionary.iloc[:,6].tolist()
        # Print the number of variables in the list of column names
        print(f'varibleTitles (length): {len(varibleTitles)}')
        # Group the column names into groups of 30 to make the API calls
        groups = [varibleTitles[i:i+20] for i in range(0, len(varibleTitles), 20)]
        varibleTitlesNew = []
        for group in groups:
            varibleTitlesNew.extend(fn.create_name_from_description(group))
            time.sleep(5)
        # Print the number of variables in the list of new column names
        print(f'varibleTitlesNew (length): {len(varibleTitlesNew)}')
        # Append the new column names to the table's dictionary file
        tableDictionary['varname_new'] = varibleTitlesNew
        # Write the new dictionary file to disk
        tableDictionary.to_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist', index=False)


### Create a cross reference map between old and new tables and columns

In [16]:
###
### Read the dictionary file for the database
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbVarTable = pd.read_excel('@dictionary.xlsx', sheet_name='vartable21')
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))
### 
### For each table, read the dictionary file and build the Old Column Name to New Column Name Map 
###
tableMap = pd.DataFrame()
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    ### Create a dictionary of the old table and old column names to new table name and new column names
    for _, row in tableDictionary.iterrows():
        tableMap = tableMap.append({
            'oldTable' : oldTableName, 
            'oldVarname' : row.varname,
            'newTable' : newTableName,
            'newVarname' : row.varname_new
        }, ignore_index=True)

tableMap.to_excel('@mapTableColumn.xlsx', index=False)
tableMap.head()


Unnamed: 0,oldTable,oldVarname,newTable,newVarname
0,HD2021,UNITID,institution,inst_id
1,HD2021,INSTNM,institution,inst_name
2,HD2021,IALIAS,institution,inst_alias
3,HD2021,ADDR,institution,address
4,HD2021,CITY,institution,city


### Build CREATE and ALTER TABLE queries for the database

In [6]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
###
### Read the dictionary file for the database
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    # Create list of column descriptions less the first row (IPEDS ID)
    variableNames = tableDictionary['varname_new'].tolist()
    ### Read first 100 rows of the data file to verify data types
    tableData = pd.read_csv(f'data/{oldTableName}.csv', nrows=100, encoding="utf-8", na_values=['.', '. ', ' '])
    ### Create string for the query
    createTableQuery = None
    alterTableQuery = f'ALTER TABLE {newTableName} \n'
    ###
    ### Loop through the rows of the table dictionary file
    ###
    for index, row in tableDictionary.iterrows():
        ### Setup the column creation variables
        addColumn = None
        addImputed = None
        addCheck = None
        dataType = None
        ### Skip the first row (inst_id), we'll create the table with this common column
        if index == 0:
            createTableQuery = f'CREATE TABLE {newTableName} ({row.varname_new} INTEGER);'
            continue

        ### If there is an imputation variable, add the column to the table
        if (isinstance(row.imputationvar, str) == True) and (row.imputationvar != 'None' and row.imputationvar[0].upper() == 'X'):
            ###
            ### Create the ADD COLUMN line for the imputed flag column to be added to the ALTER TABLE query
            ### 
            addImputed = f'  ADD COLUMN {row.varname_new}_imp ENUM({IMPUTATIONVALUES}) COMMENT "{row.varname}|{row.varTitle}",'
        ###
        ### Determine the data type to use for the field based on the metadata
        ###
        dtype = ''
        if tableData[row.varname.upper()].dtype == 'float':
            dtype = 'float'
        dataType = fn.dtype_by_format(row.DataType, dtype, row.Fieldwidth)
        ###
        ### Create the ADD COLUMN line to be added to the ALTER TABLE query
        ###    
        addColumn = f'    ADD COLUMN {row.varname_new} {dataType} COMMENT "{row.varname}|{row.varTitle}",'
        ###
        ### Build the ALTER TABLE query for the current column
        ###
        addToQuery = ''
        if addImputed != None:
            addToQuery = addImputed + '\n'
        if addColumn != None:
            addToQuery += addColumn + '\n'
        alterTableQuery += addToQuery

        ### Continue for the next row in the table dictionary file
        continue
    ###
    ### Remove the last comma and space, add a semicolon
    ###
    alterTableQuery = alterTableQuery[:-2] + ';'

    ###
    ### Write the DROP, CREATE, ALTER TABLE queries to a SQL script file
    ###
    dropTableQuery = f'DROP TABLE IF EXISTS {newTableName};'
    fn.write_sql_file(f'{newTableName}.1.drop table', dropTableQuery)
    fn.write_sql_file(f'{newTableName}.2.create table', createTableQuery)
    fn.write_sql_file(f'{newTableName}.3.add columns', alterTableQuery)

    ###
    ### Execute the DROP, CREATE, and ALTER the table SQL queries
    ###
    fn.execute_dbquery(query=dropTableQuery, db=db, cursor=cursor)
    fn.execute_dbquery(query=createTableQuery, db=db, cursor=cursor)
    fn.execute_dbquery(query=alterTableQuery, db=db, cursor=cursor)

Failed creating database with query: DROP TABLE IF EXISTS institution; - Error: 3730 (HY000): Cannot drop table 'institution' referenced by a foreign key constraint 'institution_adm_consideration_ibfk_1' on table 'institution_adm_consideration'.
Failed creating database with query: CREATE TABLE institution (inst_id INTEGER); - Error: 1050 (42S01): Table 'institution' already exists
Failed creating database with query: ALTER TABLE institution 
    ADD COLUMN inst_name VARCHAR(120) COMMENT "INSTNM|Institution (entity) name",
    ADD COLUMN inst_alias VARCHAR(2000) COMMENT "IALIAS|Institution name alias",
    ADD COLUMN address VARCHAR(100) COMMENT "ADDR|Street address or post office box",
    ADD COLUMN city VARCHAR(30) COMMENT "CITY|City location of institution",
    ADD COLUMN state VARCHAR(2) COMMENT "STABBR|State abbreviation",
    ADD COLUMN zip_code VARCHAR(10) COMMENT "ZIP|ZIP code",
    ADD COLUMN fips_state_code TINYINT COMMENT "FIPS|FIPS state code",
    ADD COLUMN bea_regions 

### Upload CSV files into database tables

In [7]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
engine = fn.create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    ### Create a dictionary of old column names and new column names
    columnNames = dict(zip(tableDictionary['varname'], tableDictionary['varname_new']))
    impColumnNames = tableDictionary[tableDictionary.imputationvar.str.len() > 2][['imputationvar', 'varname_new']]
    impColumnNames.varname_new = impColumnNames.varname_new + '_imp'
    impColumnNames = dict(zip(impColumnNames['imputationvar'], impColumnNames['varname_new']))
    tableData = pd.read_csv(f'data/{oldTableName.lower()}.csv', encoding='latin1', na_values=['.', '. ', ' '])
    ### Rename the columns  
    tableData = tableData.rename(columns=columnNames)
    tableData = tableData.rename(columns=impColumnNames)
    ### Write data to database
    tableData.to_sql(name=newTableName, con=engine, if_exists='append', index=False)

IntegrityError: (pymysql.err.IntegrityError) (1062, "Duplicate entry '100654' for key 'institution.PRIMARY'")
[SQL: INSERT INTO institution (inst_id, inst_name, inst_alias, address, city, state, zip_code, fips_state_code, bea_regions, chief_admin_name, chief_admin_title, phone_number, employer_id, dun_bradstreet_numbers, ope_id, ope_eligibility_code, inst_url, admissions_url, financial_aid_url, online_app_url, net_price_calculator_url, veterans_tuition_url, athlete_graduation_url, disability_url, sector, level, control, highest_level, undergraduate_offering, graduate_offering, highest_degree_offered, degree_granting_status, hbcu, has_hospital, grants_medical_degree, tribal_college, urbanization_degree, open_to_public, institution_status, unitid_merged_schools, year_deleted_ipeds, date_closed, active_in_current_year, postsecondary_indicator, title_iv_indicator, postsecondary_title_iv_indicator, rpt_charges_grad_retention_fin_aid, institutional_category, carnegie_classification_2021_basic, carnegie_classification_2021_ugrad_pgm, carnegie_classification_2021_grad_pgm, carnegie_classification_2021_ugrad_profile, carnegie_classification_2021_enrollment_prf, carnegie_classification_2021_size_setting, carnegie_classification_2018_basic, carnegie_classification_2015_basic, carnegie_classification_2005_2010_basic, carnegie_classification_2000, land_grant_institution, institution_size_cat, multi_inst_org, multi_inst_name, multi_inst_id, cbsa, cbsa_type, csa, necta, fips_county_code, county_name, congressional_district_id, longitude, latitude, nces_comparison_group, custom_comparison_group) VALUES (%(inst_id)s, %(inst_name)s, %(inst_alias)s, %(address)s, %(city)s, %(state)s, %(zip_code)s, %(fips_state_code)s, %(bea_regions)s, %(chief_admin_name)s, %(chief_admin_title)s, %(phone_number)s, %(employer_id)s, %(dun_bradstreet_numbers)s, %(ope_id)s, %(ope_eligibility_code)s, %(inst_url)s, %(admissions_url)s, %(financial_aid_url)s, %(online_app_url)s, %(net_price_calculator_url)s, %(veterans_tuition_url)s, %(athlete_graduation_url)s, %(disability_url)s, %(sector)s, %(level)s, %(control)s, %(highest_level)s, %(undergraduate_offering)s, %(graduate_offering)s, %(highest_degree_offered)s, %(degree_granting_status)s, %(hbcu)s, %(has_hospital)s, %(grants_medical_degree)s, %(tribal_college)s, %(urbanization_degree)s, %(open_to_public)s, %(institution_status)s, %(unitid_merged_schools)s, %(year_deleted_ipeds)s, %(date_closed)s, %(active_in_current_year)s, %(postsecondary_indicator)s, %(title_iv_indicator)s, %(postsecondary_title_iv_indicator)s, %(rpt_charges_grad_retention_fin_aid)s, %(institutional_category)s, %(carnegie_classification_2021_basic)s, %(carnegie_classification_2021_ugrad_pgm)s, %(carnegie_classification_2021_grad_pgm)s, %(carnegie_classification_2021_ugrad_profile)s, %(carnegie_classification_2021_enrollment_prf)s, %(carnegie_classification_2021_size_setting)s, %(carnegie_classification_2018_basic)s, %(carnegie_classification_2015_basic)s, %(carnegie_classification_2005_2010_basic)s, %(carnegie_classification_2000)s, %(land_grant_institution)s, %(institution_size_cat)s, %(multi_inst_org)s, %(multi_inst_name)s, %(multi_inst_id)s, %(cbsa)s, %(cbsa_type)s, %(csa)s, %(necta)s, %(fips_county_code)s, %(county_name)s, %(congressional_district_id)s, %(longitude)s, %(latitude)s, %(nces_comparison_group)s, %(custom_comparison_group)s)]
[parameters: ({'inst_id': 100654, 'inst_name': 'Alabama A & M University', 'inst_alias': 'AAMU', 'address': '4900 Meridian Street', 'city': 'Normal', 'state': 'AL', 'zip_code': '35762', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Dr. Andrew Hugine, Jr.', 'chief_admin_title': 'President', 'phone_number': 2563725000.0, 'employer_id': 636001109, 'dun_bradstreet_numbers': '197216455', 'ope_id': 100200, 'ope_eligibility_code': 1, 'inst_url': 'www.aamu.edu/', 'admissions_url': 'https://www.aamu.edu/admissions-aid/index.html', 'financial_aid_url': 'https://www.aamu.edu/admissions-aid/financial-aid/', 'online_app_url': 'https://www.aamu.edu/admissions-aid/undergraduate-admissions/apply-today.html', 'net_price_calculator_url': 'www.aamu.edu/admissions-aid/tuition-fees/net-price-calculator.html', 'veterans_tuition_url': None, 'athlete_graduation_url': None, 'disability_url': 'https://www.aamu.edu/administrativeoffices/VADS/Pages/Disability-Services.aspx', 'sector': 1, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 12, 'degree_granting_status': 1, 'hbcu': 1, 'has_hospital': 2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 12, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 18, 'carnegie_classification_2021_ugrad_pgm': 16, 'carnegie_classification_2021_grad_pgm': 18, 'carnegie_classification_2021_ugrad_profile': 10, 'carnegie_classification_2021_enrollment_prf': 4, 'carnegie_classification_2021_size_setting': 14, 'carnegie_classification_2018_basic': 18, 'carnegie_classification_2015_basic': 18, 'carnegie_classification_2005_2010_basic': 18, 'carnegie_classification_2000': 16, 'land_grant_institution': 1, 'institution_size_cat': 3, 'multi_inst_org': 2, 'multi_inst_name': '-2', 'multi_inst_id': -2, 'cbsa': 26620, 'cbsa_type': 1, 'csa': 290, 'necta': -2, 'fips_county_code': 1089, 'county_name': 'Madison County', 'congressional_district_id': 105, 'longitude': -86.568502, 'latitude': 34.783368, 'nces_comparison_group': 109, 'custom_comparison_group': 1}, {'inst_id': 100663, 'inst_name': 'University of Alabama at Birmingham', 'inst_alias': 'UAB', 'address': 'Administration Bldg Suite 1070', 'city': 'Birmingham', 'state': 'AL', 'zip_code': '35294-0110', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Ray L. Watts', 'chief_admin_title': 'President', 'phone_number': 2059344011.0, 'employer_id': 636005396, 'dun_bradstreet_numbers': '63690705', 'ope_id': 105200, 'ope_eligibility_code': 1, 'inst_url': 'https://www.uab.edu/', 'admissions_url': 'https://www.uab.edu/admissions/', 'financial_aid_url': 'https://www.uab.edu/cost-aid/', 'online_app_url': 'https://www.uab.edu/admissions/apply', 'net_price_calculator_url': 'https://tcc.ruffalonl.com/University of Alabama at Birmingham/Freshman-Students', 'veterans_tuition_url': 'https://www.uab.edu/students/veterans', 'athlete_graduation_url': 'https://www.uab.edu/registrar/students', 'disability_url': 'https://www.uab.edu/students/disability/', 'sector': 1, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 11, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': 1, 'grants_medical_degree': 1, 'tribal_college': 2, 'urbanization_degree': 12, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 15, 'carnegie_classification_2021_ugrad_pgm': 14, 'carnegie_classification_2021_grad_pgm': 14, 'carnegie_classification_2021_ugrad_profile': 9, 'carnegie_classification_2021_enrollment_prf': 5, 'carnegie_classification_2021_size_setting': 15, 'carnegie_classification_2018_basic': 15, 'carnegie_classification_2015_basic': 15, 'carnegie_classification_2005_2010_basic': 15, 'carnegie_classification_2000': 15, 'land_grant_institution': 2, 'institution_size_cat': 5, 'multi_inst_org': 1, 'multi_inst_name': 'The University of Alabama System', 'multi_inst_id': 101050, 'cbsa': 13820, 'cbsa_type': 1, 'csa': 142, 'necta': -2, 'fips_county_code': 1073, 'county_name': 'Jefferson County', 'congressional_district_id': 107, 'longitude': -86.799345, 'latitude': 33.505697, 'nces_comparison_group': 93, 'custom_comparison_group': 1}, {'inst_id': 100690, 'inst_name': 'Amridge University', 'inst_alias': 'Southern Christian University  Regions University', 'address': '1200 Taylor Rd', 'city': 'Montgomery', 'state': 'AL', 'zip_code': '36117-3553', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Michael C.Turner', 'chief_admin_title': 'President', 'phone_number': 33400000000000.0, 'employer_id': 237034324, 'dun_bradstreet_numbers': '126307792', 'ope_id': 2503400, 'ope_eligibility_code': 1, 'inst_url': 'www.amridgeuniversity.edu/', 'admissions_url': 'www.amridgeuniversity.edu/admissions/', 'financial_aid_url': 'www.amridgeuniversity.edu/financialaid/', 'online_app_url': 'https://www2.amridgeuniversity.edu/Amridge/Login.aspx', 'net_price_calculator_url': 'www2.amridgeuniversity.edu:9091/', 'veterans_tuition_url': 'www.amridgeuniversity.edu/admissions/military/', 'athlete_graduation_url': None, 'disability_url': 'www.amridgeuniversity.edu/academics/', 'sector': 2, 'level': 1, 'control': 2, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 12, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': 2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 12, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 20, 'carnegie_classification_2021_ugrad_pgm': 20, 'carnegie_classification_2021_grad_pgm': 18, 'carnegie_classification_2021_ugrad_profile': 5, 'carnegie_classification_2021_enrollment_prf': 6, 'carnegie_classification_2021_size_setting': 6, 'carnegie_classification_2018_basic': 20, 'carnegie_classification_2015_basic': 20, 'carnegie_classification_2005_2010_basic': 21, 'carnegie_classification_2000': 51, 'land_grant_institution': 2, 'institution_size_cat': 1, 'multi_inst_org': 2, 'multi_inst_name': '-2', 'multi_inst_id': -2, 'cbsa': 33860, 'cbsa_type': 1, 'csa': 388, 'necta': -2, 'fips_county_code': 1101, 'county_name': 'Montgomery County', 'congressional_district_id': 102, 'longitude': -86.17401, 'latitude': 32.362609, 'nces_comparison_group': 127, 'custom_comparison_group': 2}, {'inst_id': 100706, 'inst_name': 'University of Alabama in Huntsville', 'inst_alias': 'UAH  University of Alabama Huntsville', 'address': '301 Sparkman Dr', 'city': 'Huntsville', 'state': 'AL', 'zip_code': '35899', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Darren Dawson', 'chief_admin_title': 'President', 'phone_number': 2568246120.0, 'employer_id': 630520830, 'dun_bradstreet_numbers': '949687123', 'ope_id': 105500, 'ope_eligibility_code': 1, 'inst_url': 'www.uah.edu/', 'admissions_url': 'https://www.uah.edu/admissions', 'financial_aid_url': 'finaid.uah.edu/', 'online_app_url': 'register.uah.edu/', 'net_price_calculator_url': 'finaid.uah.edu/', 'veterans_tuition_url': 'www.uah.edu/admissions/graduate/financial-aid/veterans', 'athlete_graduation_url': 'www.uah.edu/heoa', 'disability_url': 'www.uah.edu/health-and-wellness/disability-support', 'sector': 1, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 11, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': 2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 12, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 15, 'carnegie_classification_2021_ugrad_pgm': 17, 'carnegie_classification_2021_grad_pgm': 17, 'carnegie_classification_2021_ugrad_profile': 15, 'carnegie_classification_2021_enrollment_prf': 4, 'carnegie_classification_2021_size_setting': 13, 'carnegie_classification_2018_basic': 16, 'carnegie_classification_2015_basic': 16, 'carnegie_classification_2005_2010_basic': 15, 'carnegie_classification_2000': 16, 'land_grant_institution': 2, 'institution_size_cat': 3, 'multi_inst_org': 1, 'multi_inst_name': 'The University of Alabama System', 'multi_inst_id': 101050, 'cbsa': 26620, 'cbsa_type': 1, 'csa': 290, 'necta': -2, 'fips_county_code': 1089, 'county_name': 'Madison County', 'congressional_district_id': 105, 'longitude': -86.640449, 'latitude': 34.724557, 'nces_comparison_group': 93, 'custom_comparison_group': 2}, {'inst_id': 100724, 'inst_name': 'Alabama State University', 'inst_alias': None, 'address': '915 S Jackson Street', 'city': 'Montgomery', 'state': 'AL', 'zip_code': '36104-0271', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Quinton T. Ross', 'chief_admin_title': 'President', 'phone_number': 3342294100.0, 'employer_id': 636001101, 'dun_bradstreet_numbers': '40672685', 'ope_id': 100500, 'ope_eligibility_code': 1, 'inst_url': 'www.alasu.edu/', 'admissions_url': 'www.alasu.edu/admissions/index.aspx', 'financial_aid_url': 'www.alasu.edu/undergraduate/expensesandfinancialaid/expenses-and-financial-aid', 'online_app_url': 'www.alasu.edu/admissions/undergrad-admissions/index.aspx', 'net_price_calculator_url': 'www.alasu.edu/cost-aid/tuition-costs/net-price-calculator', 'veterans_tuition_url': None, 'athlete_graduation_url': 'www.alasu.edu/cost-aid/consumer-info', 'disability_url': 'www.alasu.edu/academics/researchcenters/alabama-alliance-students-disabilities', 'sector': 1, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 11, 'degree_granting_status': 1, 'hbcu': 1, 'has_hospital': 2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 12, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 17, 'carnegie_classification_2021_ugrad_pgm': 13, 'carnegie_classification_2021_grad_pgm': 18, 'carnegie_classification_2021_ugrad_profile': 10, 'carnegie_classification_2021_enrollment_prf': 3, 'carnegie_classification_2021_size_setting': 14, 'carnegie_classification_2018_basic': 19, 'carnegie_classification_2015_basic': 19, 'carnegie_classification_2005_2010_basic': 18, 'carnegie_classification_2000': 21, 'land_grant_institution': 2, 'institution_size_cat': 2, 'multi_inst_org': 2, 'multi_inst_name': '-2', 'multi_inst_id': -2, 'cbsa': 33860, 'cbsa_type': 1, 'csa': 388, 'necta': -2, 'fips_county_code': 1101, 'county_name': 'Montgomery County', 'congressional_district_id': 107, 'longitude': -86.295677, 'latitude': 32.364317, 'nces_comparison_group': 99, 'custom_comparison_group': 1}, {'inst_id': 100733, 'inst_name': 'University of Alabama System Office', 'inst_alias': None, 'address': '500 University Blvd. East', 'city': 'Tuscaloosa', 'state': 'AL', 'zip_code': '35401', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Finis St. John IV', 'chief_admin_title': 'Chancellor', 'phone_number': 2053485122.0, 'employer_id': 636001138, 'dun_bradstreet_numbers': '808245794', 'ope_id': 800400, 'ope_eligibility_code': 2, 'inst_url': 'www.uasystem.edu/', 'admissions_url': None, 'financial_aid_url': None, 'online_app_url': None, 'net_price_calculator_url': 'www.uasystem.edu/', 'veterans_tuition_url': None, 'athlete_graduation_url': None, 'disability_url': 'www.uasystem.edu/', 'sector': 0, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 11, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': -2, 'grants_medical_degree': -2, 'tribal_college': 2, 'urbanization_degree': 13, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': -2, 'institutional_category': -2, 'carnegie_classification_2021_basic': -2, 'carnegie_classification_2021_ugrad_pgm': -2, 'carnegie_classification_2021_grad_pgm': -2, 'carnegie_classification_2021_ugrad_profile': -2, 'carnegie_classification_2021_enrollment_prf': -2, 'carnegie_classification_2021_size_setting': -2, 'carnegie_classification_2018_basic': -2, 'carnegie_classification_2015_basic': -2, 'carnegie_classification_2005_2010_basic': -2, 'carnegie_classification_2000': -2, 'land_grant_institution': 2, 'institution_size_cat': -2, 'multi_inst_org': 1, 'multi_inst_name': 'The University of Alabama System', 'multi_inst_id': 101050, 'cbsa': 46220, 'cbsa_type': 1, 'csa': -2, 'necta': -2, 'fips_county_code': 1125, 'county_name': 'Tuscaloosa County', 'congressional_district_id': 107, 'longitude': -87.529594, 'latitude': 33.207015, 'nces_comparison_group': -2, 'custom_comparison_group': -2}, {'inst_id': 100751, 'inst_name': 'The University of Alabama', 'inst_alias': None, 'address': '739 University Blvd', 'city': 'Tuscaloosa', 'state': 'AL', 'zip_code': '35487-0100', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Dr. Stuart R. Bell', 'chief_admin_title': 'President', 'phone_number': 2053486010.0, 'employer_id': 636001138, 'dun_bradstreet_numbers': '45632635', 'ope_id': 105100, 'ope_eligibility_code': 1, 'inst_url': 'www.ua.edu/', 'admissions_url': 'gobama.ua.edu/', 'financial_aid_url': 'financialaid.ua.edu/', 'online_app_url': 'apply.ua.edu/', 'net_price_calculator_url': 'financialaid.ua.edu/net-price-calculator/', 'veterans_tuition_url': 'https://vets.sa.ua.edu/', 'athlete_graduation_url': 'registrar.ua.edu/academics-policies/student-right-to-know/', 'disability_url': 'ods.ua.edu/', 'sector': 1, 'level': 1, 'control': 1, 'highest_level': 9, 'undergraduate_offering': 1, 'graduate_offering': 1, 'highest_degree_offered': 11, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': 2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 13, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 2, 'carnegie_classification_2021_basic': 15, 'carnegie_classification_2021_ugrad_pgm': 17, 'carnegie_classification_2021_grad_pgm': 15, 'carnegie_classification_2021_ugrad_profile': 12, 'carnegie_classification_2021_enrollment_prf': 4, 'carnegie_classification_2021_size_setting': 16, 'carnegie_classification_2018_basic': 15, 'carnegie_classification_2015_basic': 16, 'carnegie_classification_2005_2010_basic': 16, 'carnegie_classification_2000': 15, 'land_grant_institution': 2, 'institution_size_cat': 5, 'multi_inst_org': 1, 'multi_inst_name': 'The University of Alabama System', 'multi_inst_id': 101050, 'cbsa': 46220, 'cbsa_type': 1, 'csa': -2, 'necta': -2, 'fips_county_code': 1125, 'county_name': 'Tuscaloosa County', 'congressional_district_id': 107, 'longitude': -87.545978, 'latitude': 33.211875, 'nces_comparison_group': 92, 'custom_comparison_group': 1}, {'inst_id': 100760, 'inst_name': 'Central Alabama Community College', 'inst_alias': None, 'address': '1675 Cherokee Rd', 'city': 'Alexander City', 'state': 'AL', 'zip_code': '35010', 'fips_state_code': 1, 'bea_regions': 5, 'chief_admin_name': 'Jeff Lynn', 'chief_admin_title': 'President', 'phone_number': 2562346346.0, 'employer_id': 631022757, 'dun_bradstreet_numbers': '874447246', 'ope_id': 100700, 'ope_eligibility_code': 1, 'inst_url': 'www.cacc.edu/', 'admissions_url': 'https://www.cacc.edu/admissions/', 'financial_aid_url': 'https://www.cacc.edu/financial-aid/', 'online_app_url': 'https://www.cacc.edu/admissions/apply/', 'net_price_calculator_url': 'https://www.cacc.edu/net-price-calculator/', 'veterans_tuition_url': 'https://www.cacc.edu/financial-aid/veterans/', 'athlete_graduation_url': 'https://www.cacc.edu/about/consumer-information/', 'disability_url': 'https://www.cacc.edu/services/other-services/disability-services/', 'sector': 4, 'level': 2, 'control': 1, 'highest_level': 3, 'undergraduate_offering': 1, 'graduate_offering': 2, 'highest_degree_offered': 40, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': -2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 32, 'open_to_public': 1, 'institution_status': 'A ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 1, 'institutional_category': 4, 'carnegie_classification_2021_basic': 5, 'carnegie_classification_2021_ugrad_pgm': 2, 'carnegie_classification_2021_grad_pgm': 0, 'carnegie_classification_2021_ugrad_profile': 1, 'carnegie_classification_2021_enrollment_prf': 1, 'carnegie_classification_2021_size_setting': 2, 'carnegie_classification_2018_basic': 2, 'carnegie_classification_2015_basic': 1, 'carnegie_classification_2005_2010_basic': 2, 'carnegie_classification_2000': 40, 'land_grant_institution': 2, 'institution_size_cat': 2, 'multi_inst_org': 1, 'multi_inst_name': 'Alabama Community College System', 'multi_inst_id': 101030, 'cbsa': 10760, 'cbsa_type': 2, 'csa': 388, 'necta': -2, 'fips_county_code': 1123, 'county_name': 'Tallapoosa County', 'congressional_district_id': 103, 'longitude': -85.945266, 'latitude': 32.92478, 'nces_comparison_group': 74, 'custom_comparison_group': 2}  ... displaying 10 of 6289 total bound parameter sets ...  {'inst_id': 497338, 'inst_name': 'Glendale Career College-North-West College-Bakersfield', 'inst_alias': None, 'address': '3000 Ming Avenue', 'city': 'Bakersfield', 'state': 'CA', 'zip_code': '93304-5075', 'fips_state_code': 6, 'bea_regions': 8, 'chief_admin_name': 'Mitchell Fuerst', 'chief_admin_title': 'President/CEO', 'phone_number': 6614047575.0, 'employer_id': 951729727, 'dun_bradstreet_numbers': '79249343', 'ope_id': 2338516, 'ope_eligibility_code': 1, 'inst_url': 'https://www.success.edu/', 'admissions_url': None, 'financial_aid_url': None, 'online_app_url': None, 'net_price_calculator_url': 'https://nw.edu/ecamp/npc-nwc/npcalc.htm', 'veterans_tuition_url': None, 'athlete_graduation_url': 'https://glendalecareercollege.com/', 'disability_url': 'https://glendalecareercollege.com/', 'sector': 6, 'level': 2, 'control': 3, 'highest_level': 3, 'undergraduate_offering': 1, 'graduate_offering': 2, 'highest_degree_offered': 40, 'degree_granting_status': 1, 'hbcu': 2, 'has_hospital': -2, 'grants_medical_degree': 2, 'tribal_college': 2, 'urbanization_degree': 11, 'open_to_public': 1, 'institution_status': 'N ', 'unitid_merged_schools': -2, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 1, 'postsecondary_indicator': 1, 'title_iv_indicator': 1, 'postsecondary_title_iv_indicator': 1, 'rpt_charges_grad_retention_fin_aid': 2, 'institutional_category': 4, 'carnegie_classification_2021_basic': -2, 'carnegie_classification_2021_ugrad_pgm': -2, 'carnegie_classification_2021_grad_pgm': -2, 'carnegie_classification_2021_ugrad_profile': -2, 'carnegie_classification_2021_enrollment_prf': -2, 'carnegie_classification_2021_size_setting': -2, 'carnegie_classification_2018_basic': -2, 'carnegie_classification_2015_basic': -2, 'carnegie_classification_2005_2010_basic': -2, 'carnegie_classification_2000': -2, 'land_grant_institution': 2, 'institution_size_cat': 1, 'multi_inst_org': 1, 'multi_inst_name': 'North-West College', 'multi_inst_id': 303480, 'cbsa': 12540, 'cbsa_type': 1, 'csa': -2, 'necta': -2, 'fips_county_code': 6029, 'county_name': 'Kern County', 'congressional_district_id': 623, 'longitude': -119.035082, 'latitude': 35.339951, 'nces_comparison_group': 221, 'custom_comparison_group': 2}, {'inst_id': 497347, 'inst_name': 'University of Maine - Machias', 'inst_alias': None, 'address': "116 O'Brien Avenue", 'city': 'Machias', 'state': 'ME', 'zip_code': '04654-1397', 'fips_state_code': 23, 'bea_regions': 1, 'chief_admin_name': None, 'chief_admin_title': None, 'phone_number': None, 'employer_id': -1, 'dun_bradstreet_numbers': None, 'ope_id': 205306, 'ope_eligibility_code': 1, 'inst_url': None, 'admissions_url': None, 'financial_aid_url': None, 'online_app_url': None, 'net_price_calculator_url': None, 'veterans_tuition_url': None, 'athlete_graduation_url': None, 'disability_url': None, 'sector': 99, 'level': -3, 'control': -3, 'highest_level': -3, 'undergraduate_offering': -3, 'graduate_offering': -3, 'highest_degree_offered': -3, 'degree_granting_status': -3, 'hbcu': 2, 'has_hospital': -2, 'grants_medical_degree': -2, 'tribal_college': 2, 'urbanization_degree': 43, 'open_to_public': 1, 'institution_status': 'G ', 'unitid_merged_schools': 161253, 'year_deleted_ipeds': -2, 'date_closed': '-2', 'active_in_current_year': 3, 'postsecondary_indicator': 1, 'title_iv_indicator': 3, 'postsecondary_title_iv_indicator': 9, 'rpt_charges_grad_retention_fin_aid': -2, 'institutional_category': -2, 'carnegie_classification_2021_basic': -2, 'carnegie_classification_2021_ugrad_pgm': -2, 'carnegie_classification_2021_grad_pgm': -2, 'carnegie_classification_2021_ugrad_profile': -2, 'carnegie_classification_2021_enrollment_prf': -2, 'carnegie_classification_2021_size_setting': -2, 'carnegie_classification_2018_basic': -2, 'carnegie_classification_2015_basic': -2, 'carnegie_classification_2005_2010_basic': -2, 'carnegie_classification_2000': -2, 'land_grant_institution': 2, 'institution_size_cat': -2, 'multi_inst_org': -2, 'multi_inst_name': '-2', 'multi_inst_id': -2, 'cbsa': -2, 'cbsa_type': -2, 'csa': -2, 'necta': -2, 'fips_county_code': 23029, 'county_name': 'Washington County', 'congressional_district_id': 2302, 'longitude': -67.458025, 'latitude': 44.708803, 'nces_comparison_group': -2, 'custom_comparison_group': -2})]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

### Set PK constraints

In [None]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
engine = fn.create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    ### Read table's dictionary file
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName}.new.xlsx', sheet_name='varlist')
    ### Create a list of columns with a PK designation
    pkColumns = list(tableDictionary[tableDictionary['imputationvar'] == 'PK']['varname_new'])
    addPKQuery = f'ALTER TABLE {newTableName}\n    ADD PRIMARY KEY ({", ".join(pkColumns)});'

    ###
    ### Write the ALTER TABLE query to a SQL script file
    ###
    fn.write_sql_file(f'{newTableName}.4.add PK', addPKQuery)

    ###
    ### Execute the ALTER TABLE SQL query
    ###
    fn.execute_dbquery(query=addPKQuery, db=db, cursor=cursor)


Failed creating database with query: ALTER TABLE institution_comp_distance
    ADD PRIMARY KEY (inst_id, cip_code); - Error: 1062 (23000): Duplicate entry '100663-41' for key 'institution_comp_distance.PRIMARY'
Failed creating database with query: ALTER TABLE institution_hr_salaries_gender_rank
    ADD PRIMARY KEY (inst_id); - Error: 1062 (23000): Duplicate entry '100654' for key 'institution_hr_salaries_gender_rank.PRIMARY'


### Create reference tables for discrete table columns

In [None]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
engine = fn.create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
loopCount = 0
### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))

### 
### For each table, read the dictionary file and build the CREATE TABLE/ALTER TABLE queries 
### to build the IPEDS Postsecondary database using the IPEDS dictionary and data files
### The novel opportunity here is to use the dictory file to determine the data type of the columns
###   create new human readable column names and added the column description as column comments 
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')

    ### IF olTableName is equal to IC2021_CAMPUSES, continue to the next table (reusing reference values)
    if oldTableName == 'IC2021_CAMPUSES':
        continue

    ### Read the reference values for the table
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist')
    tableData = pd.read_csv(f'data/{oldTableName.lower()}.csv', nrows=100, encoding="utf-8", na_values=['.', '. ', ' '])
    ### Filter tableDictionary to only the rows format is equal to Disc, select only varname and varname_new columns
    tableDictionary = tableDictionary[tableDictionary.format == 'Disc'].sort_values(by=['varname'])

    allRefValues = dbRefValues[dbRefValues.TableName == oldTableName]
    allRefValues = allRefValues.sort_values(by=['varName', 'valueOrder'])

    for _, row in tableDictionary.iterrows():
        # Print which colomn are we working on
        # print(f'{row.DataType}, {row.Fieldwidth}, {row.varname}, {row.varname_new}')

        # filter allRefValues to only the rows where varName is equal to the current row varname
        refValues = allRefValues[allRefValues.varName == row.varname]
        ## if refValues is empty, continue to the next row
        if refValues.empty:
            # #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
            print(f'No refValues for {row.varname}, {row.varname_new}')
            continue
        
        ### Create DROP TABLE query
        dropRefTableQuery = f'DROP TABLE IF EXISTS institution_xref_{row.varname_new};'
        ### CREATE CREATE TABLE query
        # Determine the data type of the column
        dtype = ''
        if tableData[row.varname.upper()].dtype == 'float':
            dtype = 'float'
        dataType = fn.dtype_by_format(row.DataType, dtype, row.Fieldwidth)
        # Build the CREATE TABLE query text
        createRefTableQuery = f'''
            CREATE TABLE institution_xref_{row.varname_new} (
                Codevalue {dataType},
                valueLabel VARCHAR(255),
                valueOrder MEDIUMINT UNSIGNED,
                PRIMARY KEY (Codevalue)
            );'''
        ###
        ### Write the DROP, CREATE, ALTER TABLE queries to a SQL script file
        ###
        fn.write_sql_file(f'institution_xref_{row.varname_new}.1.drop table', dropRefTableQuery)
        fn.write_sql_file(f'institution_xref_{row.varname_new}.2.create table', createRefTableQuery)
        ###
        ### Execute the DROP, CREATE and INSERT queries
        ###
        fn.execute_dbquery(query=dropRefTableQuery, db=db, cursor=cursor)
        fn.execute_dbquery(query=createRefTableQuery, db=db, cursor=cursor)
        refValues = refValues[['Codevalue', 'valueLabel', 'valueOrder']]
        refValues.to_sql(name=f'institution_xref_{row.varname_new}', con=engine, if_exists='append', index=False)   


### Ensure that campus fips county codes and cbsa codes are in xref tables.

In [None]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()

insertQuery = '''
    insert into tim7020_2.institution_xref_fips_county_code (CodeValue, valueLabel)
        select distinct fips_county_code, county_name
            from tim7020_2.institution_campus 
            where fips_county_code not in (
                select Codevalue from tim7020_2.institution_xref_fips_county_code
            );'''
fn.execute_dbquery(query=insertQuery, db=db, cursor=cursor)

insertQuery = '''
    insert into tim7020_2.institution_xref_cbsa (Codevalue, valueLabel)
        select distinct cbsa as Codevalue, concat(city, ', ', state) as valueLabel
            from tim7020_2.institution_campus 
            where cbsa not in (
                select Codevalue from tim7020_2.institution_xref_cbsa
                );'''
fn.execute_dbquery(query=insertQuery, db=db, cursor=cursor)

### Create FK and CHECK contraints

In [None]:
### Connect to database and create cursor
db, cursor = fn.create_dbconnection()
engine = fn.create_dbengine()
###
### Read the dictionary file for the databasea
###
dbDictionary = pd.read_excel('@dictionary.xlsx', sheet_name='Tables21')
dbRefValues = pd.read_excel('@dictionary.xlsx', sheet_name='valuesets21')
###
### Create a list of child tables to apply foreign key constraints
###
### dbDictionary = dbDictionary[dbDictionary.TIM7020TableName != 'institution']

loopCount = 0

### Create a list of tuples with the table names and the data files
tableNames = list(zip(dbDictionary.TableName, dbDictionary.TIM7020TableName))
### 
### For each table, read the dictionary file and build the ALTER TABLE queries 
###     to create the foreign keys constraints to the 'institution' parent table or
###     the 'institution_xref_{column name}' reference table
###
for oldTableName, newTableName in tableNames:
    ### Record the table we are working on
    #print(f'oldTableName: {oldTableName}, newTableName: {newTableName}')
    tableDictionary = pd.read_excel(f'dictionary/{oldTableName.lower()}.new.xlsx', sheet_name='varlist')
    tableDictionary = tableDictionary[tableDictionary.format == 'Disc']

    alterTableQuery = ''

    if newTableName != 'institution':
        alterTableQuery = f'''
            ALTER TABLE {newTableName} 
                ADD FOREIGN KEY (inst_id) REFERENCES institution(inst_id)'''
        
    for _, row in tableDictionary.iterrows():
        varname = row.varname
        if oldTableName == 'IC2021_CAMPUSES':
            varname = tableMap[tableMap.oldVarname == row.varname].oldVarname.values[0][2:]

        ### Filter dbRefValues for the current variable
        refValues = dbRefValues[dbRefValues.varName == varname]

        if len(alterTableQuery) != 0:
            alterTableQuery += f''',
                '''
        else:
            alterTableQuery += f'''ALTER TABLE {newTableName}
                '''
            
        if row['DataType'] == 'A' or len(refValues) > 7:
            alterTableQuery += f'ADD FOREIGN KEY ({row.varname_new}) REFERENCES institution_xref_{row.varname_new}(Codevalue)'
        else:
            alterTableQuery += f'ADD CHECK ({row.varname_new} IN ({", ".join(refValues.Codevalue.to_list())}))'

    alterTableQuery += ';'
    ###
    ### Write the ALTER Table FK query to a file
    ###
    fn.write_sql_file(f'{newTableName}.5.add FK-Check', alterTableQuery)
    ###
    ### Execute the ALTER Table FK query
    ###
    fn.execute_dbquery(query=alterTableQuery, db=db, cursor=cursor)


### Create Congress Tables and Relationship

In [None]:
###
### Create congressional tables in Week6 database
###
db, cursor = fn.create_dbconnection()
engine = fn.create_dbengine()
###
### Create tables using the exported SQL file @congress.sql
###  file was exported from MySQL Workbench
###  slight modifications were made to the file 
###  for the new institution primary key (inst_id)
###  and elimination of the us_states_territories table
###  and assigning the state column in the congress table to
###  the Codevalue column institution_xref_state table
###
with open('@congress.sql', 'r') as file:
    query = file.read()
###
### split into separate queries
###
query = query.replace(';', ';|')
query = query.replace('| ', '|')
query = query.replace('\n', '  ')
queries = query.split('|')
###
### execute each query
###
for query in queries:
    query = query.strip()
    query = query.replace('  ', ' ')
    query = query.replace('(  ', '(')
    query = query.replace(' )', ')')
    fn.execute_dbquery(query=query, db=db, cursor=cursor)

###
### Copy WEEK 5 congress data, save to Excel, and the tim7020 database
###

# df = pd.read_sql('SELECT * FROM mydatabase.chamber', con=db)
# df.to_excel('data/congress_chamber.xlsx', sheet_name='chamber', index=False)
df = pd.read_excel('data/congress_chamber.xlsx', sheet_name='chamber')
df.to_sql('congress_chamber', con=engine, if_exists='append', index=False)

# df = pd.read_sql('SELECT * FROM mydatabase.party', con=db)
# df.to_excel('data/congress_party.xlsx', sheet_name='party', index=False)
df = pd.read_excel('data/congress_party.xlsx', sheet_name='party')
df.to_sql('congress_party', con=engine, if_exists='append', index=False)

# df = pd.read_sql('SELECT * FROM mydatabase.congress', con=db)
# df.to_excel('data/congress.xlsx', sheet_name='congress', index=False)
df = pd.read_excel('data/congress.xlsx', sheet_name='congress')
df.to_sql('congress', con=engine, if_exists='append', index=False)

# df = pd.read_sql('SELECT * FROM mydatabase.congress_office', con=db)
# df.to_excel('data/congress_office.xlsx', sheet_name='congress_office', index=False)
df = pd.read_excel('data/congress_office.xlsx', sheet_name='congress_office')
df.to_sql('congress_office', con=engine, if_exists='append', index=False)

# df = pd.read_sql('SELECT * FROM mydatabase.institution_to_congress', con=db)
### Rename column to inst_id in Week 6  
# df.rename(columns={'ipeds_id': 'inst_id'}, inplace=True)
# df.to_excel('data/institution_to_congress.xlsx', sheet_name='institution_to_congress', index=False)
df = pd.read_excel('data/institution_to_congress.xlsx', sheet_name='institution_to_congress')
df.to_sql('institution_to_congress', con=engine, if_exists='append', index=False)


6286