# Set environment

In [1]:
### basic tools
import numpy as np
import pandas as pd

### show or plot
from IPython.display import display, HTML

### database
import sqlite3

### directory
datadir = "./data/"

### helper function
head = lambda x: x[:6]
tail = lambda x: x[-6:]

# Working with Relational Databases and SQL

**1**. 75 points

Convert the flat file data in `data/flat.csv` into a well-structured relational database in SQLite3 stored as `data/faculty.db`. Note - salary information is confidential and should be kept in a separate table from other personal data.

-----

take a look at the csv file

In [2]:
flat = pd.read_csv(datadir + 'flat.csv', keep_default_na = False)

print(flat.shape)
flat.sample(10)

(1523, 14)


Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
434,Emmaline Sutton,Female,22,1.63,47,85000,Irish,IE,Ireland,,,,Emmaline,Sutton
522,Genesis Rollins,Female,17,1.56,63,68000,Ukrainian,UA,Ukraine,,,,Genesis,Rollins
282,Dannie Glover,Male,28,1.66,89,140000,Dominican,DM,Dominica,GNU Octave,PHP,Scala,Dannie,Glover
870,Lindsay Jimenez,Male,55,1.64,61,77000,Ethiopian,ET,Ethiopia,,,,Lindsay,Jimenez
1023,Milagro Horton,Female,48,2.0,51,44000,Brazilian,BR,Brazil,ECMAScript,Lisp,,Milagro,Horton
60,Andy Pennington,Male,48,1.73,55,90000,Swedish,SE,Sweden,AutoIt,Io,,Andy,Pennington
79,Anya Elliott,Female,45,1.74,57,98000,Dominican,DM,Dominica,J#,JavaScript,,Anya,Elliott
334,Denisse Browning,Female,23,1.73,63,109000,Ethiopian,ET,Ethiopia,,,,Denisse,Browning
1019,Mickey Joyce,Male,65,1.7,79,79000,German,DE,Germany,Ruby,Prolog,Lisp,Mickey,Joyce
998,Meda Peck,Female,36,1.62,64,74000,Swiss,CH,Switzerland,Go,Clojure,,Meda,Peck


In [106]:
df_person_id = (flat
                [["name"]]
                .drop_duplicates()
                .reset_index(drop = True)
                .reset_index()
                .rename(index  = str, 
                        columns = {"index": "person_id"}))

flat_pid = pd.merge(df_person_id, flat, on = 'name', how = 'inner')
flat_pid.head()

Unnamed: 0,person_id,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
0,0,Aaron Alexander,Male,54,1.7,90,151000,British,GB,United Kingdom,Haskell,,,Aaron,Alexander
1,1,Aaron Kirby,Male,59,1.69,43,80000,Spanish,SP,Spain,Falcon,haXe,GNU Octave,Aaron,Kirby
2,2,Abram Allen,Male,41,1.7,44,75000,Italian,IT,Italy,TypeScript,,,Abram,Allen
3,3,Abram Boyer,Male,45,1.64,68,76000,Italian,IT,Italy,Lua,Falcon,Io,Abram,Boyer
4,4,Adaline Barry,Female,54,1.87,58,122000,Uruguayan,UY,Uruguay,Racket,,,Adaline,Barry


In [139]:
df_lang         = (pd.melt(flat, 
                           value_vars = ['language1', 'language2', 'language3'], 
                           value_name = "language_name")
                   [["language_name"]]
                   .drop_duplicates()
                   .query("language_name != ''")
                   .reset_index(drop = True)
                   .reset_index()
                   .rename(index  = str, 
                           columns = {"index": "language_id"}))
###########################################
df_country      = (flat
                   .loc[:, ["code", "country", "nationality"]]
                   .drop_duplicates()
                   .rename(index  = str, 
                           columns = {"code": "country_id"}))  
###########################################
df_confidential = (flat_pid
                   .loc[:, ["person_id", "salary"]]
                   .drop_duplicates())  
###########################################
df_gender       = (flat
                   [["gender"]]
                   .drop_duplicates()
                   .reset_index(drop = True)
                   .reset_index()
                   .rename(index  = str, 
                           columns = {"index": "gender_id"}))  
###########################################

df_person       = (flat_pid
                   .loc[:, [
                       "person_id", "first", "last", "age", "height", "weight", 
                       "code", "gender"]]
                   .drop_duplicates()
                   .rename(index = str, 
                           columns = {"code": "country_id"})) 
df_person       = pd.merge(df_person, df_gender, on = "gender").drop(columns = ["gender"])

###########################################


In [145]:

df = (pd.melt(flat_pid, 
              id_vars = ["person_id"],
              value_vars = ['language1', 'language2', 'language3'], 
              value_name = "language_name")
      .query("language_name != ''"))

df = pd.merge(df, df_lang, on = "language_name")
#df = df.drop(columns = ['variable', 'language_name'])

df_person_lang = df
df.sample(10)

Unnamed: 0,person_id,variable,language_name,language_id
1128,439,language3,Transact-SQL,27
461,129,language1,C#,12
96,1205,language1,TypeScript,2
323,1445,language1,Assembly,8
1169,48,language1,Scheme,29
777,1320,language1,Perl,19
1188,1270,language1,Scheme,29
901,1258,language2,JavaScript,21
446,1268,language2,Caml,11
2118,960,language1,Ceylon,54


In [None]:
pd.melt(id_vars=)

In [136]:
df = flat_pid.loc[:, ["person_id", "language1", "language2", "language3"]]

df = (pd.melt(df, 
              id_vars = ["person_id"],
              value_vars = ['language1', 'language2', 'language3'], 
              value_name = "language_name")
      .query("language_name != ''"))

df = pd.merge(df, df_lang, on = "language_name")
df = df.drop(columns = ['variable', 'language_name'])
df.sample(10)

Unnamed: 0,person_id,language_id
435,1277,11
366,1005,9
897,1062,21
1980,1096,50
1298,844,31
1566,1151,38
2075,501,52
1823,139,45
446,1268,11
142,939,3


In [92]:
display(df_person_id.head(3))
display(df_lang.head(3))
display(df_country.head(3))
display(df_confidential.head(3))
display(df_gender.head(3))

Unnamed: 0,person_id,name
0,0,Aaron Alexander
1,1,Aaron Kirby
2,2,Abram Allen


Unnamed: 0,language_id,language_name
0,0,Haskell
1,1,Falcon
2,2,TypeScript


Unnamed: 0,country_id,country,nationality
0,GB,United Kingdom,British
1,SP,Spain,Spanish
2,IT,Italy,Italian


Unnamed: 0,person_id,salary
0,0,151000
1,1,80000
2,2,75000


Unnamed: 0,gender_id,gender
0,0,Male
1,1,Female


Since the "data" folder already contain the faculty.db which is used in lecture, here I change the name as faculty_kuei.db

In [22]:
# Connecting to the database file
con = sqlite3.connect(datadir + "faculty_kuei.db")
cur = con.cursor()

**Design tables**

*Person*

| index | person_id| first | last      | age | height | weight | country_id | gender_id |
|-------|----------|-------|-----------|-----|--------|--------|------------|-----------|
|   0   |    0     | Aaron | Alexander | 54  |  1.7   |  90    |   GB       |     0     |
|   1   |    1     | Aaron | Kirby     | 59  |  1.69  |  43    |   SP       |     0     |
|   2   |    2     | Abram | Allen     | 41  |  1.7   |  44    |   IT       |     0     |


*Language*

|index | language_id | language_name |
|------|-------------|---------------|
|  0   |      1      |     PHP       |
|  1   |      2      |     Clojure   |
|  2   |      3      |     Dylan     |


*Person_Language*

| index | person_id | language_id |
|-------|-----------|-------------|
|   0   |     0     |     20      |
|   1   |    68     |     20      |
|   2   |    80     |     20      |



*Country*  

| index | country_id | country        | nationality |
|-------|------------|----------------|-------------|
|   0   |     GB     | United Kingdom | British     |
|   1   |     SP     | Spain          | Spanish     |
|   2   |     IT     | Italy          | Italian     |

*Confidential*

| index | person_id | salary |
|-------|-----------|--------|
|   0   |    0      | 151000 |
|   1   |    1      |  80000 |
|   2   |    2      |  75000 |

*Gender*

| index | gender_id | gender |
|-------|-----------|--------|
|   0   |     0     | Male   |
|   1   |     1     | Female | 


In [24]:
cur.execute("DROP TABLE IF EXISTS Person;")

<sqlite3.Cursor at 0x7f6971115ce0>

In [25]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[]


In [20]:
cur.execute(
    "CREATE TABLE Person("
    "person_id INTEGER PRIMARY KEY," +
    "person_first varchar(255),"     +
    "person_last  varchar(255),"     +
    "age          INTEGER,"          +
    "height       FLOAT(3)"          +
    ");")

<sqlite3.Cursor at 0x7f6971115a40>

In [26]:
cur.execute("""
    CREATE TABLE Person(
    person_id INTEGER PRIMARY KEY,
    person_first varchar(255),
    person_last  varchar(255),
    age          INTEGER,    
    height       FLOAT(3),
    weight       INTEGER
    );""")

<sqlite3.Cursor at 0x7f6971115ce0>

Check if the table is created

In [27]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Person',)]


-----

In [28]:
cur.execute("""
    SELECT person_id as ID, person_first AS first
    FROM Person 
    """)
print(cur.fetchall())

[]


In [34]:
flat.iloc[0:2, :]

Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
0,Aaron Alexander,Male,54,1.7,90,151000,British,GB,United Kingdom,Haskell,,,Aaron,Alexander
1,Aaron Kirby,Male,59,1.69,43,80000,Spanish,SP,Spain,Falcon,haXe,GNU Octave,Aaron,Kirby


In [35]:
cur.execute("""
    INSERT INTO Person(person_first, person_last, age, height) 
    VALUES 
    ('Aaron', 'Alexander', 54, 1.70),
    ('Aaron', 'Kirby',     59, 1.69);
    """)

<sqlite3.Cursor at 0x7f6971115ce0>

In [51]:
pd.read_sql_query("""SELECT * FROM Person""", con = con)
#df = pd.DataFrame(cur.fetchall())
#display(df)
#cur.fetchall()

Unnamed: 0,person_id,person_first,person_last,age,height
0,1,Aaron,Alexander,54,1.7
1,2,Aaron,Kirby,59,1.69


In [46]:
cur.execute("""
    SELECT person_id AS id, person_first AS first
    FROM Person 
    """)

df = pd.DataFrame(cur.fetchall())
display(df)

Unnamed: 0,0,1
0,1,Aaron
1,2,Aaron


In [43]:
df.to_sql('users', con = con)

In [44]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Person',), ('users',)]


In [56]:
tmp = pd.read_sql_query("""SELECT * FROM Person""", con = con)
tmp.to_sql('users', con = con, if_exists='replace')

cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

pd.read_sql_query("""SELECT * FROM Users""", con = con)

[('Person',), ('users',)]


Unnamed: 0,index,person_id,person_first,person_last,age,height
0,0,1,Aaron,Alexander,54,1.7
1,1,2,Aaron,Kirby,59,1.69


-----

**2**. 25 points

We want to find potential mentors for Abram	Boyer. Find all faculty members who know one or more of the same languages as Christopher Robbins whose salary is at least $50,000 higher than his using SQL statements. Assume that the only information you have is that you need to find mentors meeting the criteria for the faculty member named `Abram Boyer`. In other words, the ONLY hard coded terms in your SQL query are `Abram` and `Boyer` and the salary differential.

You can use the `sql` magic extension or the `sqlite3` driver for this question.