In [1]:
# advanced_dataframes_exercises

In [2]:
import numpy as np
import pandas as pd

In [3]:
from env import host, user, password

In [8]:
url = f'mysql+pymysql://{user}:{password}@{host}/employees'


In [9]:
pd.read_sql('SELECT * FROM employees LIMIT 5 OFFSET 50', url)


Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10051,1953-07-28,Hidefumi,Caine,M,1992-10-15
1,10052,1961-02-26,Heping,Nitsch,M,1988-05-21
2,10053,1954-09-13,Sanjiv,Zschoche,F,1986-02-04
3,10054,1957-04-04,Mayumi,Schueller,M,1995-03-13
4,10055,1956-06-06,Georgy,Dredge,M,1992-04-27


In [10]:
sql = '''
SELECT
    emp_no,
    first_name,
    last_name
FROM employees
WHERE gender = 'F'
LIMIT 100
'''

In [11]:
pd.read_sql(sql,url)

Unnamed: 0,emp_no,first_name,last_name
0,10002,Bezalel,Simmel
1,10006,Anneke,Preusig
2,10007,Tzvetan,Zielinski
3,10009,Sumant,Peac
4,10010,Duangkaew,Piveteau
...,...,...,...
95,10245,Ramalingam,Gente
96,10247,Heon,Riefers
97,10248,Frederique,Tempesti
98,10252,Shirish,Wegerle


In [12]:
query = '''
SELECT
    t.title as title,
    d.dept_name as dept_name
FROM titles t
JOIN dept_emp USING (emp_no)
JOIN departments d USING (dept_no)
LIMIT 100
'''

title_dept = pd.read_sql(query, url)
title_dept.head()


Unnamed: 0,title,dept_name
0,Staff,Customer Service
1,Senior Staff,Customer Service
2,Staff,Customer Service
3,Senior Staff,Customer Service
4,Staff,Customer Service


In [13]:
def get_db_url(user, host, password, db_name):
    return f'mysql+pymysql://{user}:{password}@{host}/{db_name}'

In [19]:
connection_string = get_db_url(user, host, password, 'employees')

In [20]:
pd.read_sql(query, connection_string)


Unnamed: 0,title,dept_name
0,Staff,Customer Service
1,Senior Staff,Customer Service
2,Staff,Customer Service
3,Senior Staff,Customer Service
4,Staff,Customer Service
...,...,...
95,Senior Staff,Customer Service
96,Staff,Customer Service
97,Senior Staff,Customer Service
98,Senior Staff,Customer Service


In [21]:
# If I intentionally make an error in the connection string (in this case, 
# I misspelled the database name), I get this error when I run the sql query:
# OperationalError: (pymysql.err.OperationalError) (1044, "Access denied for 
# user 'pagel_2179'@'%' to database 'employes'")

connection_string = get_db_url(user, host, password, 'employes')
pd.read_sql(query, connection_string)

In [24]:
# If I intentionally make an error in the SQL query (in this case,
# I removed the comma after title in the SELECT statement), I get this error:
# ProgrammingError: (pymysql.err.ProgrammingError) (1064, "You have an error 
# in your SQL syntax; ...
# The error looks very similar to the error that mysql would give if I 
# ran it in MySQLWorkBench

connection_string = get_db_url(user, host, password, 'employees')
error_query = '''
SELECT
    t.title as title
    d.dept_name as dept_name
FROM titles t
JOIN dept_emp USING (emp_no)
JOIN departments d USING (dept_no)
LIMIT 100
'''
pd.read_sql(error_query, connection_string)

In [25]:
# Read the employees and titles tables into two separate DataFrames.

employees_db_connect_string = get_db_url(user, host, password, 'employees')
employees_query = '''
    SELECT *
    FROM employees
'''
titles_query = '''
    SELECT *
    FROM titles
'''


In [26]:
employees_df = pd.read_sql(employees_query, employees_db_connect_string)

In [28]:
titles_df = pd.read_sql(titles_query, employees_db_connect_string)

In [27]:
employees_df

Unnamed: 0,emp_no,birth_date,first_name,last_name,gender,hire_date
0,10001,1953-09-02,Georgi,Facello,M,1986-06-26
1,10002,1964-06-02,Bezalel,Simmel,F,1985-11-21
2,10003,1959-12-03,Parto,Bamford,M,1986-08-28
3,10004,1954-05-01,Chirstian,Koblick,M,1986-12-01
4,10005,1955-01-21,Kyoichi,Maliniak,M,1989-09-12
...,...,...,...,...,...,...
300019,499995,1958-09-24,Dekang,Lichtner,F,1993-01-12
300020,499996,1953-03-07,Zito,Baaz,M,1990-09-27
300021,499997,1961-08-03,Berhard,Lenart,M,1986-04-21
300022,499998,1956-09-05,Patricia,Breugel,M,1993-10-13


In [29]:
titles_df

Unnamed: 0,emp_no,title,from_date,to_date
0,10001,Senior Engineer,1986-06-26,9999-01-01
1,10002,Staff,1996-08-03,9999-01-01
2,10003,Senior Engineer,1995-12-03,9999-01-01
3,10004,Engineer,1986-12-01,1995-12-01
4,10004,Senior Engineer,1995-12-01,9999-01-01
...,...,...,...,...
443303,499997,Engineer,1987-08-30,1992-08-29
443304,499997,Senior Engineer,1992-08-29,9999-01-01
443305,499998,Senior Staff,1998-12-27,9999-01-01
443306,499998,Staff,1993-12-27,1998-12-27


In [32]:
# How many rows and columns do you have in each DataFrame? Is that what you expected?

# employees_df had 300024 rows x 6 columns
# titles_df had 443308 rows x 4 columns
employees_df.shape
titles_df.shape

(443308, 4)

In [36]:
# Display the summary statistics for each DataFrame.

employees_df.info()
#employees_df.describe()
titles_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300024 entries, 0 to 300023
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   emp_no      300024 non-null  int64 
 1   birth_date  300024 non-null  object
 2   first_name  300024 non-null  object
 3   last_name   300024 non-null  object
 4   gender      300024 non-null  object
 5   hire_date   300024 non-null  object
dtypes: int64(1), object(5)
memory usage: 13.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 443308 entries, 0 to 443307
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   emp_no     443308 non-null  int64 
 1   title      443308 non-null  object
 2   from_date  443308 non-null  object
 3   to_date    443308 non-null  object
dtypes: int64(1), object(3)
memory usage: 13.5+ MB


In [38]:
# How many unique titles are in the titles DataFrame?

print("The number of unique titles in the titles DataFrame = ")
titles_df['title'].nunique()

The number of unique titles in the titles DataFrame = 


7

In [43]:
# What is the oldest date in the to_date column?

print("The oldest date in the to_date columns are: ")
print(titles_df['to_date'].min())

The oldest date in the to_date columns are: 
1985-03-01


In [44]:
# What is the most recent date in the to_date column?

print("The most recent date in the to_date columns are: ")
print(titles_df['to_date'].max())



The most recent date in the to_date columns are: 
9999-01-01


In [80]:
# The most recent date in the to_date column that is not 9999-01-01 ?

#print(type(titles_df['to_date'][1]))
test_date = titles_df['to_date'][1]
test_date
# test_string = test_date.strftime("%Y-%m-%d")
# test_string

datetime.date(9999, 1, 1)