In [1]:
import re
import time
from textwrap import dedent

import openai
import pandas as pd

In [2]:
def choose_table(tables, question):
    """Ask the completion model which tables are needed to answer a question.

    A few-shot prompt listing every candidate table is sent to the
    ``text-davinci-003`` completion endpoint, and the completion text is
    parsed into a list of table names.

    Args:
        tables: Iterable of candidate table names offered to the model.
        question: Natural-language question to pick tables for.

    Returns:
        List of table names taken from the completion, in order of first
        appearance. Only names present in ``tables`` are kept, so empty
        fragments, duplicates and anything else the model emitted are
        dropped. The list may be empty.

    Side effects:
        Prints the selected list.
    """
    candidates = list(tables)

    # Dedent the template *before* substituting values. Dedenting the
    # already-formatted text is a no-op whenever an interpolated line (or the
    # first line) starts at column 0, which left the prompt indented.
    template = dedent('''\
        List at most three tables, separated by '|', needed to answer each question.
        You have the following SQL tables of US Census data to choose from:

        {table_list}

        Question: `Where do people have more dial-up internet than other kinds?`
        Answer: acs_internet_access

        Question: `Which areas have the most kids?`
        Answer: acs_sex_by_age

        Question: `Where has the most rich people and people on Medicare?`
        Answer: acs_ratio_of_income_to_poverty_level|acs_medicare

        Question: `areas with long commutes`
        Answer: acs_commute_times

        Question: `{question}`
        Answer:''')
    prompt = template.format(table_list='\n'.join(candidates), question=question)

    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        temperature=0.9,
        max_tokens=1000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    completion_text = response.choices[0].text

    # The model separates names with '|' and sometimes with newlines. Keep
    # only real table names so callers can index their schema dict safely.
    selected = []
    for fragment in re.split(r'\||\n', completion_text):
        name = fragment.strip()
        if name in candidates and name not in selected:
            selected.append(name)

    print(selected)
    return selected

def draft_query(query_tables, DDLs, question):
    """Build the few-shot text-to-SQL prompt for one question.

    The prompt consists of a fixed header, one ``name: columns`` entry per
    selected table, three worked question/SQL examples and finally the
    caller's question followed by an open ``SQL:`` line.

    Args:
        query_tables: Iterable of table names to describe in the prompt.
        DDLs: Mapping from table name to its column-definition string.
        question: Natural-language question to convert to SQL.

    Returns:
        The complete prompt as a string.
    """
    header = 'Convert text to SQL.\n\nYou have the following tables and columns:\n'

    # Names without a schema entry (an empty string from a trailing
    # separator, or a table the model invented) are skipped instead of
    # raising KeyError.
    schema = ''.join(
        f'\n{name}: {DDLs[name]}\n' for name in query_tables if name in DDLs
    )

    # The examples contain no substitutions, so dedent always sees a uniform
    # margin; the question is appended afterwards and may span several lines.
    examples = dedent('''
        Question: Which areas have the most kids?
        SQL: SELECT zcta, (num_male_under_5_years + num_male_5_to_9_years + num_male_10_to_14_years + num_male_15_to_17_years + num_female_under_5_years + num_female_5_to_9_years + num_female_10_to_14_years + num_female_15_to_17_years) / total_population AS fraction_kids FROM acs_sex_by_age WHERE total_population > 0 ORDER BY fraction_kids DESC

        Question: Which areas are mostly white?
        SQL: SELECT zcta, num_white_alone / total_population AS fraction_white FROM acs_race WHERE total_population > 0 ORDER BY fraction_white DESC

        Question: Show me the most Hispanic areas.
        SQL: SELECT zcta, num_hispanic_or_latino / total_population AS fraction_hispanic FROM acs_hispanic WHERE total_population > 0 ORDER BY fraction_hispanic DESC
        ''')

    return f'{header}{schema}{examples}\nQuestion: {question}\nSQL:'

def text_to_sql(tables, DDLs, question, retry_delay=15):
    """Translate a natural-language question into a SQL query string.

    First asks ``choose_table`` which tables are relevant, then sends the
    prompt produced by ``draft_query`` to the ``text-davinci-003`` completion
    endpoint. The whole sequence is retried for as long as the API reports a
    rate limit.

    Args:
        tables: Candidate table names, forwarded to ``choose_table``.
        DDLs: Mapping from table name to column definitions, forwarded to
            ``draft_query``.
        question: Natural-language question to translate.
        retry_delay: Seconds to wait after a rate-limit error before trying
            again. Defaults to 15.

    Returns:
        The completion text with surrounding whitespace removed.

    Side effects:
        Prints ``RateLimitError`` on every retry and prints the final SQL.
    """
    while True:
        try:
            query_tables = choose_table(tables, question)

            prompt = draft_query(query_tables, DDLs, question)

            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                temperature=0.9,
                max_tokens=1000,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
            )

        # The exception class lives in ``openai.error``; a bare ``error`` name
        # is undefined here and would turn every API failure into a NameError.
        except openai.error.RateLimitError:
            print('RateLimitError')
            time.sleep(retry_delay)
            continue

        # Completions usually start with a space or newline; strip it so the
        # stored query is clean.
        sql = response['choices'][0]['text'].strip()
        print(sql)
        return sql
    

In [17]:
# Candidate table names offered to the model when choosing which tables a
# question needs (passed as the `tables` argument of choose_table/text_to_sql).
tables = [
    "acs_sex_by_age",
    "acs_race",
    "acs_hispanic",
    "acs_commute_times",
    "acs_employment_by_industry",
    "acs_commute_modes",
    "acs_education_subjects",
    "acs_housing_year_built",
    "acs_poverty_status",
    "acs_ratio_of_income_to_poverty_level",
    "acs_earners_in_household",
    "acs_housing",
    "acs_medicare",
    "acs_gross_rent_household_income_ratio",
    "acs_household_size_by_vehicles_available",
    "acs_internet_access",
    "acs_educational_attainment",
    "acs_hh_income"
]

# Column definitions per table, keyed by the same names as `tables`. Each value
# is a comma-separated "column TYPE" string that draft_query pastes verbatim
# into the text-to-SQL prompt.
DDLs = {
"acs_sex_by_age": "zcta VARCHAR, total_population REAL, num_male REAL, num_male_under_5_years REAL, num_male_5_to_9_years REAL, num_male_10_to_14_years REAL, num_male_15_to_17_years REAL, num_male_18_and_19_years REAL, num_male_20_years REAL, num_male_21_years REAL, num_male_22_to_24_years REAL, num_male_25_to_29_years REAL, num_male_30_to_34_years REAL, num_male_35_to_39_years REAL, num_male_40_to_44_years REAL, num_male_45_to_49_years REAL, num_male_50_to_54_years REAL, num_male_55_to_59_years REAL, num_male_60_and_61_years REAL, num_male_62_to_64_years REAL, num_male_65_and_66_years REAL, num_male_67_to_69_years REAL, num_male_70_to_74_years REAL, num_male_75_to_79_years REAL, num_male_80_to_84_years REAL, num_male_85_years_and_over REAL, num_female REAL, num_female_under_5_years REAL, num_female_5_to_9_years REAL, num_female_10_to_14_years REAL, num_female_15_to_17_years REAL, num_female_18_and_19_years REAL, num_female_20_years REAL, num_female_21_years REAL, num_female_22_to_24_years REAL, num_female_25_to_29_years REAL, num_female_30_to_34_years REAL, num_female_35_to_39_years REAL, num_female_40_to_44_years REAL, num_female_45_to_49_years REAL, num_female_50_to_54_years REAL, num_female_55_to_59_years REAL, num_female_60_and_61_years REAL, num_female_62_to_64_years REAL, num_female_65_and_66_years REAL, num_female_67_to_69_years REAL, num_female_70_to_74_years REAL, num_female_75_to_79_years REAL, num_female_80_to_84_years REAL, num_female_85_years_and_over REAL"
,"acs_race": "zcta VARCHAR, total_population REAL, num_white_alone REAL, num_black_or_african_american_alone REAL, num_american_indian_and_alaska_native_alone REAL, num_asian_alone REAL, num_native_hawaiian_and_other_pacific_islander_alone REAL, num_some_other_race_alone REAL, num_two_or_more_races REAL, num_two_or_more_races_two_races_including_some_other_race REAL, num_two_or_more_races_two_races_excluding_some_other_race_and_three_or_more_races REAL"
,"acs_hispanic": "zcta VARCHAR, total_population REAL, num_not_hispanic_or_latino REAL, num_not_hispanic_or_latino_white_alone REAL, num_not_hispanic_or_latino_black_or_african_american_alone REAL, num_not_hispanic_or_latino_american_indian_and_alaska_native_alone REAL, num_not_hispanic_or_latino_asian_alone REAL, num_not_hispanic_or_latino_native_hawaiian_and_other_pacific_islander_alone REAL, num_not_hispanic_or_latino_some_other_race_alone REAL, num_not_hispanic_or_latino_two_or_more_races REAL, num_not_hispanic_or_latino_two_or_more_races_two_races_including_some_other_race REAL, num_not_hispanic_or_latino_two_or_more_races_two_races_excluding_some_other_race_and_three_or_more_races REAL, num_hispanic_or_latino REAL, num_hispanic_or_latino_white_alone REAL, num_hispanic_or_latino_black_or_african_american_alone REAL, num_hispanic_or_latino_american_indian_and_alaska_native_alone REAL, num_hispanic_or_latino_asian_alone REAL, num_hispanic_or_latino_native_hawaiian_and_other_pacific_islander_alone REAL, num_hispanic_or_latino_some_other_race_alone REAL, num_hispanic_or_latino_two_or_more_races REAL, num_hispanic_or_latino_two_or_more_races_two_races_including_some_other_race REAL, num_hispanic_or_latino_two_or_more_races_two_races_excluding_some_other_race_and_three_or_more_races REAL"
,"acs_commute_times": "zcta VARCHAR, total_commuter_population REAL, num_less_than_5_minutes REAL, num_5_to_9_minutes REAL, num_10_to_14_minutes REAL, num_15_to_19_minutes REAL, num_20_to_24_minutes REAL, num_25_to_29_minutes REAL, num_30_to_34_minutes REAL, num_35_to_39_minutes REAL, num_40_to_44_minutes REAL, num_45_to_59_minutes REAL, num_60_to_89_minutes REAL, num_90_or_more_minutes REAL"
,"acs_employment_by_industry": "zcta VARCHAR, total_working_population REAL, num_agriculture_forestry_fishing_and_hunting_and_mining REAL, num_construction REAL, num_manufacturing REAL, num_transportation_and_warehousing_and_utilities REAL, num_information REAL, num_finance_and_insurance_and_real_estate_and_rental_and_leasing REAL, num_professional_scientific_and_management_and_administrative_and_waste_management_services REAL, num_educational_services_and_health_care_and_social_assistance REAL, num_arts_entertainment_and_recreation_and_accommodation_and_food_services REAL, num_other_services REAL, num_public_administration REAL, num_armed_forces REAL"
,"acs_commute_modes": "zcta VARCHAR, total_working_population REAL, num_drove_alone REAL, num_carpooled REAL, num_public_transportation REAL, num_walked REAL, num_taxicab_motorcycle_bicycle_or_other REAL, num_worked_from_home REAL"
,"acs_education_subjects": "zcta VARCHAR, total_num_bachelors_holders REAL, num_science_and_engineering_computers_mathematics_and_statistics REAL, num_science_and_engineering_biological_agricultural_and_environmental_sciences REAL, num_science_and_engineering_physical_and_related_sciences REAL, num_science_and_engineering_psychology REAL, num_science_and_engineering_social_sciences REAL, num_science_and_engineering_engineering REAL, num_science_and_engineering_multidisciplinary_studies REAL, num_science_and_engineering_related_fields REAL, num_business REAL, num_education REAL, num_arts_humanities_and_other_literature_and_languages REAL, num_arts_humanities_and_other_liberal_arts_and_history REAL, num_arts_humanities_and_other_visual_and_performing_arts REAL, num_arts_humanities_and_other_communications REAL, num_arts_humanities_and_other_other REAL"
,"acs_housing_year_built": "zcta VARCHAR, total_structures REAL, num_built_2020_or_later REAL, num_built_2010_to_2019 REAL, num_built_2000_to_2009 REAL, num_built_1990_to_1999 REAL, num_built_1980_to_1989 REAL, num_built_1970_to_1979 REAL, num_built_1960_to_1969 REAL, num_built_1950_to_1959 REAL, num_built_1940_to_1949 REAL, num_built_1939_or_earlier REAL"
,"acs_poverty_status": "zcta VARCHAR, total_households REAL, num_income_in_the_past_12_months_below_poverty_level REAL, num_income_in_the_past_12_months_below_poverty_level_under_6_years REAL, num_income_in_the_past_12_months_below_poverty_level_6_to_11_years REAL, num_income_in_the_past_12_months_below_poverty_level_12_to_17_years REAL, num_income_in_the_past_12_months_below_poverty_level_18_to_59_years REAL, num_income_in_the_past_12_months_below_poverty_level_60_to_74_years REAL, num_income_in_the_past_12_months_below_poverty_level_75_to_84_years REAL, num_income_in_the_past_12_months_below_poverty_level_85_years_and_over REAL, num_income_in_the_past_12_months_at_or_above_poverty_level REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_under_6_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_6_to_11_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_12_to_17_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_18_to_59_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_60_to_74_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_75_to_84_years REAL, num_income_in_the_past_12_months_at_or_above_poverty_level_85_years_and_over REAL"
,"acs_ratio_of_income_to_poverty_level": "zcta VARCHAR, total_households REAL, num_under_50_pct REAL, num_50_to_74_pct REAL, num_75_to_99_pct REAL, num_100_to_124_pct REAL, num_125_to_149_pct REAL, num_150_to_174_pct REAL, num_175_to_184_pct REAL, num_185_to_199_pct REAL, num_200_to_299_pct REAL, num_300_to_399_pct REAL, num_400_to_499_pct REAL, num_500_pct_and_over REAL"
,"acs_earners_in_household": "zcta VARCHAR, total_num_earners FLOAT, num_no_earners FLOAT, num_1_earner FLOAT, num_2_earners FLOAT, num_3_or_more_earners FLOAT"
,"acs_housing": "zcta TEXT, num_housing_units REAL, num_heated_by_utility_gas REAL, num_heated_by_bottled_tank_or_lp_gas REAL, num_heated_by_electricity REAL, num_heated_by_fuel_oil_kerosene_etc REAL, num_heated_by_coal_or_coke REAL, num_heated_by_wood REAL, num_heated_by_solar_energy REAL, num_heated_by_other_fuel REAL, num_heated_by_no_fuel_used REAL, num_owner_occupied REAL, num_renter_occupied REAL, num_complete_kitchen_facilities REAL, num_lacking_complete_kitchen_facilities REAL, aggregate_gross_rent REAL, aggregate_price_asked REAL, median_home_value REAL, num_housing_units_with_a_mortgage REAL, num_housing_units_with_a_mortgage_with_no_second_mortgage REAL, num_housing_units_without_a_mortgage REAL, total_not_charged_not_used_or_payment_included_in_other_fees REAL, total_charged_for_electricity REAL, total_charged_for_electricity_less_than_usd_50 REAL, total_charged_for_electricity_usd_50_to_usd_99 REAL, total_charged_for_electricity_usd_100_to_usd_149 REAL, total_charged_for_electricity_usd_150_to_usd_199 REAL, total_charged_for_electricity_usd_200_to_usd_249 REAL, total_charged_for_electricity_usd_250_or_more REAL, total_not_charged_or_payment_included_in_other_fees REAL, total_charged_for_water_and_sewer REAL, total_charged_for_water_and_sewer_less_than_usd_125 REAL, total_charged_for_water_and_sewer_usd_125_to_usd_249 REAL, total_charged_for_water_and_sewer_usd_250_to_usd_499 REAL, total_charged_for_water_and_sewer_usd_500_to_usd_749 REAL, total_charged_for_water_and_sewer_usd_750_to_usd_999 REAL, total_charged_for_water_and_sewer_usd_1000_or_more REAL, num_has_one_or_more_types_of_computing_devices REAL, num_has_one_or_more_types_of_computing_devices_desktop_or_laptop REAL, num_no_computer REAL"
,"acs_medicare": "zcta TEXT, total_population REAL, num_allocated_medicare REAL, num_not_allocated_medicare REAL"
,"acs_gross_rent_household_income_ratio": "zcta VARCHAR, num_hh_less_than_10pct REAL, num_hh_10pct_to_15pct REAL, num_hh_15pct_to_20pct REAL, num_hh_20pct_to_25pct REAL, num_hh_25pct_to_30pct REAL, num_hh_30pct_to_35pct REAL, num_hh_35pct_to_40pct REAL, num_hh_40pct_to_45pct REAL, num_hh_45pct_to_50pct REAL, num_hh_50pct_or_more REAL, num_hh_ratio_not_computed REAL"
,"acs_household_size_by_vehicles_available": "zcta STRING, num_hh REAL, num_hh_0_vehicles REAL, num_hh_1_vehicle REAL, num_hh_2_vehicles REAL, num_hh_3_vehicles REAL, num_hh_4_or_more_vehicles REAL, num_1_person_hh REAL, num_1_person_hh_0_vehicles REAL, num_1_person_hh_1_vehicles REAL, num_1_person_hh_2_vehicles REAL, num_1_person_hh_3_vehicles REAL, num_1_person_hh_4_or_more_vehicles REAL, num_2_person_hh REAL, num_2_person_hh_no_vehicle REAL, num_2_person_hh_1_vehicle REAL, num_2_person_hh_2_vehicles REAL, num_2_person_hh_3_vehicles REAL, num_2_person_hh_4_or_more_vehicles REAL, num_3_person_hh REAL, num_3_person_hh_no_vehicle REAL, num_3_person_hh_1_vehicle REAL, num_3_person_hh_2_vehicles REAL, num_3_person_hh_3_vehicles REAL, num_3_person_hh_4_or_more_vehicles REAL, num_4_or_more_person_hh REAL, num_4_or_more_person_hh_no_vehicle REAL, num_4_or_more_person_hh_1_vehicle REAL, num_4_or_more_person_hh_2_vehicles REAL, num_4_or_more_person_hh_3_vehicles REAL, num_4_or_more_person_hh_4_or_more_vehicles REAL"
,"acs_internet_access": "zcta TEXT, num_hh FLOAT, num_hh_with_internet FLOAT, num_hh_with_internet_via_dialup FLOAT, num_hh_with_internet_via_cable_fiber_optic_or_dsl FLOAT, num_hh_with_internet_via_satellite_internet FLOAT, num_hh_with_internet_via_other FLOAT, num_hh_with_no_internet FLOAT"
,"acs_educational_attainment": "zcta TEXT, num_pop_25_and_older FLOAT, num_less_than_high_school FLOAT, num_at_least_high_school FLOAT, num_at_least_some_college FLOAT, num_at_least_bachelors FLOAT, num_at_least_graduate_or_professional_degree FLOAT"
,"acs_hh_income": "zcta TEXT, num_hh FLOAT, num_hh_lt_10k FLOAT, num_hh_10k_to_15k FLOAT, num_hh_15k_to_19k FLOAT, num_hh_20k_to_24k FLOAT, num_hh_25k_to_29k FLOAT, num_hh_30k_to_34k FLOAT, num_hh_35k_to_39k FLOAT, num_hh_40k_to_44k FLOAT, num_hh_45k_to_49k FLOAT, num_hh_50k_to_59k FLOAT, num_hh_60k_to_74k FLOAT, num_hh_75k_to_99k FLOAT, num_hh_100k_to_124k FLOAT, num_hh_125k_to_149k FLOAT, num_hh_150k_to_199k FLOAT, num_hh_200k_or_more FLOAT, median_hh_income FLOAT"
}


In [12]:
# Load the questions, keep everything from position 100 onward, and start
# each kept row with an empty 'query' string.
df = pd.read_csv('questions.csv').iloc[100:].assign(query='')

In [13]:
# Fill in the 'query' column one question at a time. Rows whose query is
# already non-empty are reported and left untouched; every query (new or
# existing) is also collected in `data`.
data = []
for idx, row in df.iterrows():
    print(idx, row['question'], sep='\n')

    if row['query'] == '':
        query = text_to_sql(tables, DDLs, row['question'])
        df.loc[idx, 'query'] = query
        data.append(query)
    else:
        print('already completed')
        data.append(row['query'])

df

100
Which areas have the most kids?
['acs_sex_by_age']
 SELECT zcta, (num_male_under_5_years + num_male_5_to_9_years + num_male_10_to_14_years + num_male_15_to_17_years + num_female_under_5_years + num_female_5_to_9_years + num_female_10_to_14_years + num_female_15_to_17_years) / total_population AS fraction_kids FROM acs_sex_by_age WHERE total_population > 0 ORDER BY fraction_kids DESC
101
Which areas have the most retirees?
['acs_sex_by_age', 'acs_educational_attainment', 'acs_medicare']
 SELECT zcta, (num_65_and_66_years + num_67_to_69_years + num_70_to_74_years + num_75_to_79_years + num_80_to_84_years + num_85_years_and_over) / total_population AS fraction_retirees FROM acs_sex_by_age WHERE total_population > 0 ORDER BY fraction_retirees DESC
102
Which areas have the highest ratio of working age men to women?
['acs_sex_by_age', 'acs_race', 'acs_hispanic']
 SELECT zcta, (num_male_18_and_19_years + num_male_20_years + num_male_21_years + num_male_22_to_24_years + num_male_25_to_29_y

['acs_housing', 'acs_housing_year_built']
 SELECT zcta, num_heated_by_utility_gas / num_housing_units AS fraction_heated_by_utility_gas FROM acs_housing WHERE num_housing_units > 0 ORDER BY fraction_heated_by_utility_gas DESC
125
Which ZIP codes have the most renters?
['acs_housing', 'acs_gross_rent_household_income_ratio', 'acs_household_size_by_vehicles_available']

SELECT zcta, num_renter_occupied / num_housing_units AS fraction_renters FROM acs_housing WHERE num_housing_units > 0 ORDER BY fraction_renters DESC
126
Where do people spend the most on electricity?
['acs_housing', 'acs_housing_year_built']
 SELECT zcta, (total_charged_for_electricity_usd_50_to_usd_99 + total_charged_for_electricity_usd_100_to_usd_149 + total_charged_for_electricity_usd_150_to_usd_199 + total_charged_for_electricity_usd_200_to_usd_249 + total_charged_for_electricity_usd_250_or_more) / total_charged_for_electricity AS fraction_electricity FROM acs_housing ORDER BY fraction_electricity DESC
127
Where do pe

['acs_education_subjects', 'acs_educational_attainment']
 SELECT zcta, num_at_least_graduate_or_professional_degree / num_pop_25_and_older AS fraction_advanced_degrees FROM acs_educational_attainment WHERE num_pop_25_and_older > 0 ORDER BY fraction_advanced_degrees DESC
150
Show me where people dropped out of college.
['acs_education_subjects', 'acs_educational_attainment', 'acs_earners_in_household']
 SELECT zcta, (num_at_least_college - num_at_least_bachelors) / num_at_least_college AS fraction_dropped FROM acs_educational_attainment WHERE num_at_least_college > 0 ORDER BY fraction_dropped DESC


Unnamed: 0,question,query
100,Which areas have the most kids?,"SELECT zcta, (num_male_under_5_years + num_ma..."
101,Which areas have the most retirees?,"SELECT zcta, (num_65_and_66_years + num_67_to..."
102,Which areas have the highest ratio of working ...,"SELECT zcta, (num_male_18_and_19_years + num_..."
103,Which areas are mostly white?,"SELECT zcta, num_white_alone / total_populati..."
104,Show me the black population by ZCTA.,"SELECT zcta, num_black_or_african_american_al..."
105,Where do the most Asians live?,"SELECT zcta, num_asian_alone / total_populati..."
106,Show me the most Hispanic areas.,"SELECT zcta, num_hispanic_or_latino / total_po..."
107,How does the fraction of white hispanic vs non...,"SELECT zcta, (num_not_hispanic_or_latino_whit..."
108,Show me the most white non-Hispanic areas.,"SELECT zcta, num_not_hispanic_or_latino_white..."
109,areas with long commutes,"SELECT zcta, (num_60_to_89_minutes + num_90_o..."


In [14]:
# Copy the questions together with their generated queries to the system
# clipboard so the table can be pasted elsewhere.
df.to_clipboard()

In [None]:
# Keep an independent copy of the current results under a separate name.
df_copy = df.copy()

In [15]:
# Spot-check the table-selection step on its own for a single question; the
# function prints its result and the cell also displays the returned list.
choose_table(tables, "Show me the number of people per household")

['acs_household_size_by_vehicles_available', 'acs_housing', 'acs_housing_year_built']


['acs_household_size_by_vehicles_available',
 'acs_housing',
 'acs_housing_year_built']