# Set environment

In [39]:
### basic tools
import numpy as np
import pandas as pd

### show or plot
from IPython.display import display, HTML

### database
import sqlite3

### directory
datadir = "./data/"

### helper function
head = lambda x: x[:6]
tail = lambda x: x[-6:]

# Working with Relational Databases and SQL

**1**. 75 points

Convert the flat file data in `data/flat.csv` into a well-structured relational database in SQLite3 stored as `data/faculty.db`. Note - salary information is confidential and should be kept in a separate table from other personal data.

-----

take a look at the csv file

In [7]:
flat = pd.read_csv(datadir + 'flat.csv', keep_default_na = False)

print(flat.shape)
flat.sample(10)

(1523, 14)


Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
770,Keneth Thornton,Male,54,1.76,87,58000,Russian,RU,Russia,,,,Keneth,Thornton
3,Abram Boyer,Male,45,1.64,68,76000,Italian,IT,Italy,Lua,Falcon,Io,Abram,Boyer
123,Barrett Coleman,Male,64,1.84,53,97000,Estonian,EE,Estonia,Prolog,Scheme,,Barrett,Coleman
1213,Roger Lambert,Male,62,1.98,45,92000,Danish,DK,Denmark,PL-SQL,Java,,Roger,Lambert
282,Dannie Glover,Male,28,1.66,89,140000,Dominican,DM,Dominica,GNU Octave,PHP,Scala,Dannie,Glover
456,Eulah Zamora,Female,30,1.74,47,168000,Costa Rican,CR,Costa Rica,Tcl,,,Eulah,Zamora
722,Junie Patterson,Female,34,1.51,68,109000,Iranian,IR,Iran,Elixir,Dylan,,Junie,Patterson
640,Jamar Adams,Male,28,1.63,82,51000,Estonian,EE,Estonia,Z shell,,,Jamar,Adams
59,Andrew Mckee,Female,58,1.78,53,113000,Canadian,CA,Canada,Scheme,PL-I,Prolog,Andrew,Mckee
1265,Santiago Vang,Male,31,1.97,68,78000,Irish,IE,Ireland,Go,,,Santiago,Vang


In [22]:
# Connecting to the database file
con = sqlite3.connect(datadir + "faculty.db")
cur = con.cursor()

**Design tables**

*Person*

| pid | first | last   | age | height | weight |
|-----|-------|--------|-----|--------|--------|
|  1  | Eulah | Zamora |  45 |  1.76  |   87   |

*Language*

| lid | language |
|-----|----------|
|  1  |    Go    |

*Country*  

| cid | code | country | nationality |
|-----|------|---------|-------------|
|  1  |  RU  | Russia  |    Russian  |

*Salary*

| pid | salary |
|-----|--------|
|  1  |  58000 |

*linker*


In [24]:
cur.execute("DROP TABLE IF EXISTS Person;")

<sqlite3.Cursor at 0x7f6971115ce0>

In [25]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[]


In [20]:
cur.execute(
    "CREATE TABLE Person("
    "person_id INTEGER PRIMARY KEY," +
    "person_first varchar(255),"     +
    "person_last  varchar(255),"     +
    "age          INTEGER,"          +
    "height       FLOAT(3)"          +
    ");")

<sqlite3.Cursor at 0x7f6971115a40>

In [26]:
cur.execute("""
    CREATE TABLE Person(
    person_id INTEGER PRIMARY KEY,
    person_first varchar(255),
    person_last  varchar(255),
    age          INTEGER,    
    height       FLOAT(3),
    weight       INTEGER
    );""")

<sqlite3.Cursor at 0x7f6971115ce0>

Check if the table is created

In [27]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Person',)]


-----

In [28]:
cur.execute("""
    SELECT person_id as ID, person_first AS first
    FROM Person 
    """)
print(cur.fetchall())

[]


In [34]:
flat.iloc[0:2, :]

Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
0,Aaron Alexander,Male,54,1.7,90,151000,British,GB,United Kingdom,Haskell,,,Aaron,Alexander
1,Aaron Kirby,Male,59,1.69,43,80000,Spanish,SP,Spain,Falcon,haXe,GNU Octave,Aaron,Kirby


In [35]:
cur.execute("""
    INSERT INTO Person(person_first, person_last, age, height) 
    VALUES 
    ('Aaron', 'Alexander', 54, 1.70),
    ('Aaron', 'Kirby',     59, 1.69);
    """)

<sqlite3.Cursor at 0x7f6971115ce0>

In [51]:
pd.read_sql_query("""SELECT * FROM Person""", con = con)
#df = pd.DataFrame(cur.fetchall())
#display(df)
#cur.fetchall()

Unnamed: 0,person_id,person_first,person_last,age,height
0,1,Aaron,Alexander,54,1.7
1,2,Aaron,Kirby,59,1.69


In [46]:
cur.execute("""
    SELECT person_id AS id, person_first AS first
    FROM Person 
    """)

df = pd.DataFrame(cur.fetchall())
display(df)

Unnamed: 0,0,1
0,1,Aaron
1,2,Aaron


In [43]:
df.to_sql('users', con = con)

In [44]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Person',), ('users',)]


In [56]:
tmp = pd.read_sql_query("""SELECT * FROM Person""", con = con)
tmp.to_sql('users', con = con, if_exists='replace')

cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

pd.read_sql_query("""SELECT * FROM Users""", con = con)

[('Person',), ('users',)]


Unnamed: 0,index,person_id,person_first,person_last,age,height
0,0,1,Aaron,Alexander,54,1.7
1,1,2,Aaron,Kirby,59,1.69


-----

**2**. 25 points

We want to find potential mentors for Abram	Boyer. Find all faculty members who know one or more of the same languages as Christopher Robbins whose salary is at least $50,000 higher than his using SQL statements. Assume that the only information you have is that you need to find mentors meeting the criteria for the faculty member named `Abram Boyer`. In other words, the ONLY hard coded terms in your SQL query are `Abram` and `Boyer` and the salary differential.

You can use the `sql` magic extension or the `sqlite3` driver for this question.