# set environment

In [4]:
### basic tools
import numpy as np
import pandas as pd

### show or plot
from IPython.display import display, HTML

### database
import sqlite3

### directory
datadir = "../data/"
scratch_datadir = "./scratch_data/"

### helper function
head = lambda x: x[:6]
tail = lambda x: x[-6:]

Note: Reference about using pandas

- rename
    - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html
- sql API
    - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html#pandas.DataFrame.to_sql
    - https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_query.html#pandas.read_sql_query

# Read in a data

In [3]:
flat = pd.read_csv(datadir + 'flat.csv', keep_default_na = False)

print(flat.shape)
flat.sample(10)

(1523, 14)


Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
1183,Renato Rose,Male,32,1.76,86,111000,Polish,PO,Poland,,,,Renato,Rose
112,Augustine Dodson,Male,64,1.69,56,91000,Costa Rican,CR,Costa Rica,Scala,Racket,Z shell,Augustine,Dodson
1036,Modesto Ashley,Male,20,1.52,40,97000,Greek,GR,Greece,Kotlin,Tcl,Lua,Modesto,Ashley
295,Daron Charles,Male,36,1.9,90,121000,British,GB,United Kingdom,,,,Daron,Charles
80,Apolonia Nielsen,Female,33,1.53,83,88000,Russian,RU,Russia,Caml,,,Apolonia,Nielsen
470,Ezequiel Dejesus,Male,62,1.9,82,70000,Chinese,CH,China,PHP,,,Ezequiel,Dejesus
1411,Tracy Ramos,Male,48,1.59,52,93000,Mexican,ME,Mexico,PL-SQL,Elixir,,Tracy,Ramos
571,Hayden Daniel,Male,64,1.75,73,123000,Estonian,EE,Estonia,,,,Hayden,Daniel
355,Devora Payne,Female,60,1.99,55,146000,Chinese,HK,Hong Kong,,,,Devora,Payne
1141,Porfirio Dudley,Male,40,1.95,49,76000,British,GB,United Kingdom,Ruby,Smalltalk,Assembly,Porfirio,Dudley


Note that there is not duplication in name!!

In [59]:
np.sum(flat.name.duplicated())

0

# Create a database by connecting to one

In [5]:
# Connecting to the database file
con = sqlite3.connect(scratch_datadir + "faculty.db")
cur = con.cursor()

In [6]:
!!ls ./scratch_data

['faculty.db']

# Start to input the data

In [8]:
cur.execute("DROP TABLE IF EXISTS Person;")
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[]


create a table called Person

In [9]:
cur.execute("""
    CREATE TABLE Person(
    person_id INTEGER PRIMARY KEY,
    person_first varchar(255),
    person_last  varchar(255),
    age          INTEGER,    
    height       FLOAT(3),
    weight       INTEGER
    );""")

<sqlite3.Cursor at 0x7ff1740b4f80>

check if the table is created

In [10]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Person',)]


insert and query the data

In [12]:
flat.iloc[0:2, :]

Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
0,Aaron Alexander,Male,54,1.7,90,151000,British,GB,United Kingdom,Haskell,,,Aaron,Alexander
1,Aaron Kirby,Male,59,1.69,43,80000,Spanish,SP,Spain,Falcon,haXe,GNU Octave,Aaron,Kirby


In [13]:
cur.execute("""
    INSERT INTO Person(person_first, person_last, age, height, weight) 
    VALUES 
    ('Aaron', 'Alexander', 54, 1.70, 90),
    ('Aaron', 'Kirby',     59, 1.69, 43);
    """)

<sqlite3.Cursor at 0x7ff1740b4f80>

In [14]:
cur.execute("""
    SELECT * FROM Person 
    """)
print(cur.fetchall())

[(1, 'Aaron', 'Alexander', 54, 1.7, 90), (2, 'Aaron', 'Kirby', 59, 1.69, 43)]


# Try using the SQL API *read_sql* & *to_sql* in Pandas

In [23]:
df = (flat
      .loc[:, ('first', 'last', 'age', 'height', 'weight')]
      .drop_duplicates()
      .head()
)

display(df)

Unnamed: 0,first,last,age,height,weight
0,Aaron,Alexander,54,1.7,90
1,Aaron,Kirby,59,1.69,43
2,Abram,Allen,41,1.7,44
3,Abram,Boyer,45,1.64,68
4,Adaline,Barry,54,1.87,58


write the dataframe into sql

In [24]:
df.to_sql('Person', con = con, if_exists='replace')

check if the data is input correctly

In [25]:
pd.read_sql_query("""SELECT * FROM Person""", con = con)

Unnamed: 0,index,first,last,age,height,weight
0,0,Aaron,Alexander,54,1.7,90
1,1,Aaron,Kirby,59,1.69,43
2,2,Abram,Allen,41,1.7,44
3,3,Abram,Boyer,45,1.64,68
4,4,Adaline,Barry,54,1.87,58


# Try to relate two tables using foreign key

Note that [the pandas to_sql does not provide primary key setting options](https://www.reddit.com/r/Python/comments/45spup/using_sqlalchemy_and_pandas_to_create_a_database/)
```
Which would have worked perfectly, except for one problem, automap requires the tables to have a primary key. Ok, no problem, I'm sure Pandas to_sql has a way to indicate the primary key... nope. This is where it gets a little hacky:
```

dataset with person and its nationality

In [109]:
df = (flat
      .loc[:, ('first', 'last', 'age', 'height', 'weight', 'code', 'country', 'nationality')]
      .assign(pid = lambda x: x['first'] + x['last'] + x.age.astype(str))
      .drop_duplicates()
      .head(10)
      .rename(index = str, columns = {"code": "cid"})
)

display(df)

Unnamed: 0,first,last,age,height,weight,cid,country,nationality,pid
0,Aaron,Alexander,54,1.7,90,GB,United Kingdom,British,AaronAlexander54
1,Aaron,Kirby,59,1.69,43,SP,Spain,Spanish,AaronKirby59
2,Abram,Allen,41,1.7,44,IT,Italy,Italian,AbramAllen41
3,Abram,Boyer,45,1.64,68,IT,Italy,Italian,AbramBoyer45
4,Adaline,Barry,54,1.87,58,UY,Uruguay,Uruguayan,AdalineBarry54
5,Adam,Lawrence,54,1.7,63,CM,Cambodia,Cambodian,AdamLawrence54
6,Adam,Rush,38,1.87,49,CM,Cameroon,Cameroonian,AdamRush38
7,Adan,Brown,49,1.81,68,BE,Belgium,Belgian,AdanBrown49
8,Adelle,Duffy,27,1.63,59,FR,France,French,AdelleDuffy27
9,Adena,Holland,18,1.59,63,IT,Italy,Italian,AdenaHolland18


create three df for database

In [69]:
df_person  = (df
              .loc[:, ('pid', 'first', 'last', 'age', 'height', 'weight', 'cid')]
              .drop_duplicates())

df_country = (df
              .loc[:, ('cid', 'country', 'nationality')]
              .drop_duplicates())

display(df_person)
display(df_country)

Unnamed: 0,pid,first,last,age,height,weight,cid
0,AaronAlexander54,Aaron,Alexander,54,1.7,90,GB
1,AaronKirby59,Aaron,Kirby,59,1.69,43,SP
2,AbramAllen41,Abram,Allen,41,1.7,44,IT
3,AbramBoyer45,Abram,Boyer,45,1.64,68,IT
4,AdalineBarry54,Adaline,Barry,54,1.87,58,UY
5,AdamLawrence54,Adam,Lawrence,54,1.7,63,CM
6,AdamRush38,Adam,Rush,38,1.87,49,CM
7,AdanBrown49,Adan,Brown,49,1.81,68,BE
8,AdelleDuffy27,Adelle,Duffy,27,1.63,59,FR
9,AdenaHolland18,Adena,Holland,18,1.59,63,IT


Unnamed: 0,cid,country,nationality
0,GB,United Kingdom,British
1,SP,Spain,Spanish
2,IT,Italy,Italian
4,UY,Uruguay,Uruguayan
5,CM,Cambodia,Cambodian
6,CM,Cameroon,Cameroonian
7,BE,Belgium,Belgian
8,FR,France,French


set person and country tables

```
DROP TABLE IF EXISTS Country;
DROP TABLE IF EXISTS Person;

CREATE TABLE Country (
    country_id varcarh(2) PRIMARY KEY,
    country_name varchar(255)
);

CREATE TABLE Person (
    person_id INTEGER PRIMARY KEY,
    person_first varchar(255),
    person_last varchar(255),
    country_id INTEGER NOT NULL,
      FOREIGN KEY (country_id) REFERENCES Country(country_id)
);
```

In [91]:
def query_insert(table, cols, values):
    """Helper function to create insert query from table name (table), column names (cols), and observations (values)
    
    >>> query = query_insert("Person", ("x", "y"), [(1, 2, 'a'), (10, 20, 'b')])
    >>> print(query)
    INSERT INTO Person ('x', 'y')
    VALUES
    (1, 2, 'a'),
    (10, 20, 'b');
    """
    # initialize an insert query
    query = "INSERT INTO {table_name} {list_of_variables}\nVALUES".format(table_name = table, list_of_variables = cols)
    
    # add rows into query
    tmp = map(str, values)
    tmp = ",\n".join(tmp) + ";"
    
    return query + "\n" + tmp

### test the query_insert function
print(query_insert("Person", ("x", "y"), [(1, 2, 'a'), (10, 20, 'b')]))

INSERT INTO Person ('x', 'y')
VALUES
(1, 2, 'a'),
(10, 20, 'b');


In [97]:
flat.loc[0:1, :]

Unnamed: 0,name,gender,age,height,weight,salary,nationality,code,country,language1,language2,language3,first,last
0,Aaron Alexander,Male,54,1.7,90,151000,British,GB,United Kingdom,Haskell,,,Aaron,Alexander
1,Aaron Kirby,Male,59,1.69,43,80000,Spanish,SP,Spain,Falcon,haXe,GNU Octave,Aaron,Kirby


In [118]:
cur.execute("DROP TABLE IF EXISTS Country;")
cur.execute("DROP TABLE IF EXISTS Person;")

query = """
    CREATE TABLE Country (
        country_id          varchar(2) PRIMARY KEY,
        country_name        varchar(255),
        country_nationality varchar(255)
);"""

cur.execute(query)

query = """
    CREATE TABLE Person(
        person_id    varchar(255) PRIMARY KEY,
        person_first varchar(255),
        person_last  varchar(255),
        
        age          INTEGER,    
        height       FLOAT(3),
        weight       INTEGER,
        
        country_id INTEGER NOT NULL,
            FOREIGN KEY (country_id) REFERENCES Country(country_id)
);"""

cur.execute(query)

cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[('Country',), ('Person',)]


In [112]:
df.iloc[0:2, :]

Unnamed: 0,first,last,age,height,weight,cid,country,nationality,pid
0,Aaron,Alexander,54,1.7,90,GB,United Kingdom,British,AaronAlexander54
1,Aaron,Kirby,59,1.69,43,SP,Spain,Spanish,AaronKirby59


In [119]:
query = query_insert(
    "Country", 
    ("country_id", "country_name", "country_nationality"), 
    [("GB", "United Kingdom", "British"), 
     ("SP", "Spanish",        "Spain")])

cur.execute(query)
print(query)

INSERT INTO Country ('country_id', 'country_name', 'country_nationality')
VALUES
('GB', 'United Kingdom', 'British'),
('SP', 'Spanish', 'Spain');


In [120]:
query = query_insert(
    "Person", 
    ("person_id",         "person_first", "person_last", "age", "height", "weight", "country_id"),
    [("AaronAlexander54", "Aaron",        "Alexander",   54,    1.70,     90,       "GB"),
     ("AaronKirby59",     "Aaron",        "Kirby",       59,    1.69,     43,       "SP")])

cur.execute(query)
print(query)

INSERT INTO Person ('person_id', 'person_first', 'person_last', 'age', 'height', 'weight', 'country_id')
VALUES
('AaronAlexander54', 'Aaron', 'Alexander', 54, 1.7, 90, 'GB'),
('AaronKirby59', 'Aaron', 'Kirby', 59, 1.69, 43, 'SP');


query

In [122]:
cur.execute("""
    SELECT * FROM Person
    """)
cur.fetchall()

[('AaronAlexander54', 'Aaron', 'Alexander', 54, 1.7, 90, 'GB'),
 ('AaronKirby59', 'Aaron', 'Kirby', 59, 1.69, 43, 'SP')]

In [123]:
pd.read_sql_query("""SELECT * FROM Person""", con = con)

Unnamed: 0,person_id,person_first,person_last,age,height,weight,country_id
0,AaronAlexander54,Aaron,Alexander,54,1.7,90,GB
1,AaronKirby59,Aaron,Kirby,59,1.69,43,SP


query both together

# Linker