## SQL Salary and Consumer Survey
by Wolfrank Guzman

In [9]:
import pandas as pd
import random
import faker

# Create a Faker instance to generate fake data
fake = faker.Faker()

# Define the number of rows in the DataFrame
num_rows = 200

# Initialize empty lists for each column
RecordID = [i for i in range(1, num_rows + 1)]

# List of random sexes
sexes = [random.choice(["Male", "Female"]) for _ in range(num_rows)]

# Generate random first names starting with a random letter based on sex
first_names = [fake.first_name_male() if sex == "Male" else fake.first_name_female() for sex in sexes]

last_names = [fake.last_name() for _ in range(num_rows)]

# List of email endings
email_endings = ["@gmail.com", "@yahoo.com", "@msn.com", "@comcast.net", "@fios.net", "@myspace.com", "@microsoft.net", "@lycos.net"]

# Generate random email addresses using first names
emails = [f"{first.lower()}{random.choice(email_endings)}" for first in first_names]

# Generate other random data
occupations = [random.choice(["Software Developer", "Nurse Practitioner","Software Developer", "Nurse Practitioner", "Electrician", "Marketing Manager", "Dental Hygienist", "Mechanical Engineer", "Social Worker", "Pharmacist", "Financial Analyst", "Physical Therapist", "Graphic Designer", "Veterinarian", "Chef", "Police Officer", "Architect", "Occupational Therapist", "Teacher", "Accountant", "Civil Engineer", "Psychologist", "Plumber", "Web Designer", "Pharmacist Technician", "Registered Nurse", "Lawyer", "Biomedical Engineer", "Librarian", "Human Resources Manager", "Dental Assistant", "Data Analyst", "Radiologic Technologist", "Construction Worker", "Artist", "Environmental Scientist", "Financial Advisor", "Physical Education Teacher", "Chef", "Firefighter", "IT Manager", "Medical Technologist", "Flight Attendant", "Geologist", "Sales Manager", "Speech-Language Pathologist", "Electrician", "Marketing Coordinator", "Physician", "Event Planner", "Economist", "Nurse", "Aerospace Engineer", "Social Media Manager", "Pharmacist", "Investment Banker", "Occupational Therapist", "Professor", "Accountant", "Civil Engineer", "Psychologist", "Plumber", "UX Designer", "Pharmacy Technician", "Registered Nurse", "Journalist", "Biomedical Scientist", "Librarian", "HR Specialist", "Dental Hygienist", "Biotechnologist", "Physical Therapist Assistant", "Police Officer", "Architect", "Elementary School Teacher", "Financial Planner", "Chef", "Game Developer", "Environmental Engineer", "Graphic Designer", "Pediatrician", "Marketing Analyst"]) for _ in range(num_rows)]
countries_of_birth = [fake.country() for _ in range(num_rows)]
salaries = [random.randint(45000, 385000) for _ in range(num_rows)]
education = [random.choice(['BA/BS', 'HS','Graduate','GED', 'Doctorate', 'Masters']) for _ in range(num_rows)]
age = [random.randint(22, 65) for _ in range(num_rows)]
years_of_experience = []
for a in age:
    if 22 <= a <= 30:
        years_of_experience.append(random.randint(0, 5))
    elif 30 < a <= 40:
        years_of_experience.append(random.randint(6, 10))
    elif 41 <= a <= 50:
        years_of_experience.append(random.randint(10, 18))
    else:
        years_of_experience.append(random.randint(18, 25))

avg_salary_occupation = [random.randint(30000, 220000) for _ in range(num_rows)]


# List of real U.S. state names
us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida",
    "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
    "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma",
    "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

# Generate random states from the list of real U.S. state names
states = [random.choice(us_states) for _ in range(num_rows)]

# Create a DataFrame from the lists
df = pd.DataFrame({
    'RecordID': RecordID,
    'First Name': first_names,
    'Last Name': last_names,
    'Sex': sexes,
    'Age': age,
    'Occupation': occupations,
    'Country of Birth': countries_of_birth,
    'Salary': salaries,
    'Education': education,
    'Email': emails,
    'Years of Experience': years_of_experience,
    'Avg Salary for Occupation': avg_salary_occupation,
    'State': states,

})

# Display the first few rows of the DataFrame
display(df)


Unnamed: 0,RecordID,First Name,Last Name,Sex,Age,Occupation,Country of Birth,Salary,Education,Email,Years of Experience,Avg Salary for Occupation,State
0,1,Richard,Johnson,Male,28,Human Resources Manager,Mali,70597,BA/BS,richard@gmail.com,3,164785,Kansas
1,2,William,Black,Male,31,Civil Engineer,Pitcairn Islands,60678,Graduate,william@comcast.net,8,140270,Ohio
2,3,Karen,Cox,Female,24,Civil Engineer,Macao,197519,GED,karen@gmail.com,3,80630,New Mexico
3,4,Nathaniel,Lee,Male,65,Financial Analyst,Saint Lucia,76147,GED,nathaniel@msn.com,18,150896,Michigan
4,5,Thomas,Williams,Male,65,Police Officer,Isle of Man,62169,GED,thomas@gmail.com,24,116651,Arkansas
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,Heather,Anderson,Female,47,Dental Assistant,San Marino,138643,Doctorate,heather@microsoft.net,18,74518,Oregon
196,197,Maria,Garcia,Female,44,Dental Hygienist,United Arab Emirates,54867,GED,maria@myspace.com,10,197521,Alabama
197,198,Robert,Perry,Male,33,Civil Engineer,Belarus,189965,Graduate,robert@yahoo.com,6,183567,Minnesota
198,199,Emily,Wagner,Female,24,Dental Hygienist,Azerbaijan,110305,Graduate,emily@comcast.net,2,57960,Nebraska


In [10]:
import sqlite3

# Create and connect to a SQLite database
conn = sqlite3.connect('data.db')

# Create a DataFrame and store it in the database
df.to_sql('data', conn, if_exists='replace')

# Create a cursor for executing SQL queries
cursor = conn.cursor()


# 3. Select the average salary for all occupations
cursor.execute("SELECT AVG(Salary) FROM data")
average_salary = cursor.fetchone()[0]
print("\n3. Average Salary:", average_salary)



3. Average Salary: 207177.215


In [11]:


# 5. Select the records of individuals with salaries above $100,000
cursor.execute("SELECT * FROM data WHERE Salary > 200000")
results = cursor.fetchall()
print("\n5. Records of individuals with salaries above $200,000:")
for row in results:
    print(row)




5. Records of individuals with salaries above $200,000:
(5, 6, 'Lisa', 'Morris', 'Female', 27, 'Electrician', 'Cuba', 338878, 'Masters', 'lisa@yahoo.com', 1, 180038, 'Oklahoma')
(6, 7, 'Cassandra', 'Scott', 'Female', 33, 'Event Planner', 'Grenada', 337009, 'Doctorate', 'cassandra@lycos.net', 7, 33813, 'Florida')
(8, 9, 'Jeffrey', 'Bowman', 'Male', 60, 'Software Developer', 'Gibraltar', 254018, 'Doctorate', 'jeffrey@lycos.net', 18, 103054, 'Nevada')
(9, 10, 'Benjamin', 'Houston', 'Male', 51, 'Librarian', 'Saudi Arabia', 316569, 'HS', 'benjamin@yahoo.com', 20, 89997, 'Arkansas')
(11, 12, 'Keith', 'Watson', 'Male', 29, 'Registered Nurse', 'France', 284630, 'BA/BS', 'keith@lycos.net', 1, 86002, 'Louisiana')
(12, 13, 'Kevin', 'Lewis', 'Male', 48, 'Nurse Practitioner', 'Mauritius', 220206, 'Masters', 'kevin@yahoo.com', 10, 165661, 'Florida')
(15, 16, 'Dennis', 'Garcia', 'Male', 62, 'Medical Technologist', 'Colombia', 228832, 'HS', 'dennis@myspace.com', 18, 79557, 'Minnesota')
(16, 17, 'Chri

In [12]:

# 6. Select the youngest person's record
cursor.execute("SELECT * FROM data ORDER BY Age ASC LIMIT 1")
youngest_person = cursor.fetchone()
print("\n6. Youngest Person:", youngest_person)




6. Youngest Person: (83, 84, 'John', 'Welch', 'Male', 22, 'Financial Planner', 'Dominican Republic', 102387, 'Doctorate', 'john@gmail.com', 0, 45276, 'Georgia')


In [13]:


# 7. Select the occupations and their average salaries
cursor.execute("SELECT Occupation, AVG(Salary) FROM data GROUP BY Occupation")
results = cursor.fetchall()
print("\n7. Average salary for each occupation:")
for row in results:
    print(row)
    



7. Average salary for each occupation:
('Accountant', 277184.5)
('Aerospace Engineer', 379210.0)
('Architect', 158660.33333333334)
('Artist', 226391.0)
('Biomedical Engineer', 98995.0)
('Biomedical Scientist', 261525.0)
('Biotechnologist', 203834.75)
('Chef', 155546.2857142857)
('Civil Engineer', 183090.88888888888)
('Construction Worker', 288680.5)
('Dental Assistant', 236994.83333333334)
('Dental Hygienist', 136559.6)
('Economist', 154316.0)
('Electrician', 353879.3333333333)
('Elementary School Teacher', 215391.33333333334)
('Environmental Engineer', 344858.0)
('Event Planner', 138074.8)
('Financial Advisor', 255878.4)
('Financial Analyst', 205701.0)
('Financial Planner', 117503.5)
('Firefighter', 230609.33333333334)
('Flight Attendant', 178857.0)
('Game Developer', 327359.5)
('Geologist', 235038.5)
('Graphic Designer', 224270.2857142857)
('HR Specialist', 265688.6666666667)
('Human Resources Manager', 171618.5)
('IT Manager', 173366.0)
('Investment Banker', 109823.0)
('Journalist'

In [14]:


# 8. Count the number of people in each age group
cursor.execute("SELECT CASE \
                 WHEN Age BETWEEN 22 AND 30 THEN '22-30' \
                 WHEN Age BETWEEN 31 AND 40 THEN '31-40' \
                 WHEN Age BETWEEN 41 AND 50 THEN '41-50' \
                 ELSE '51 and over' \
               END AS Age_Group, COUNT(*) \
               FROM data GROUP BY Age_Group")
results = cursor.fetchall()
print("\n8. Count of people in each age group:")
for row in results:
    print(row)




8. Count of people in each age group:
('22-30', 40)
('31-40', 47)
('41-50', 45)
('51 and over', 68)


In [15]:



# 9. Select the highest salary and the corresponding occupation
cursor.execute("SELECT Occupation, MAX(Salary) FROM data")
result = cursor.fetchone()
print("\n9. Occupation with Highest Salary:", result)




9. Occupation with Highest Salary: ('Pharmacist Technician', 380813)


In [16]:


# Close the database connection
conn.close()
