## Scrape data from Lep website

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver as wire_webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException

import pandas as pd
import time
from datetime import datetime, date
import dbm

In [2]:
# ChromeDriver service pointing at the local chromedriver executable
s = Service('C://Users//hantt3//chromedriver_win32//chromedriver.exe')

# Chrome options: both flags tell Chrome to ignore SSL / certificate errors
options = Options()
for chrome_flag in ('--ignore-ssl-errors=yes', '--ignore-certificate-errors'):
    options.add_argument(chrome_flag)

# Selenium Wire Chrome driver built from the service and options above
driver = wire_webdriver.Chrome(service=s, options=options)


In [21]:
def lepProductInfo(link, max_pages=8, wait_seconds=5):
    '''Scrape product titles and sale prices from a paginated category page.

    Opens ``link`` with the notebook-level Selenium ``driver``, collects the text
    of every product-title and sale-price element on the current page, then
    follows the "next page" button. Scraping stops when the button is disabled,
    the button is missing, the click is intercepted, or ``max_pages`` pages have
    been scraped.

    Parameters
    ----------
    link : str
        URL of the category listing to open.
    max_pages : int or None, default 8
        Maximum number of pages to scrape. The default matches the number of
        pages the previous hard-coded counter allowed; ``None`` removes the cap.
    wait_seconds : float, default 5
        Seconds to sleep after clicking "next" before reading the new page.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_title`` and ``price``, both raw element text.
        Titles and prices are paired by position.
    '''
    driver.get(link)  # open the URL using the web driver
    product_name, price = [], []
    page = 0

    while max_pages is None or page < max_pages:
        page += 1

        title_elements = driver.find_elements(By.CSS_SELECTOR, '.item_info-left__2V7dL .item_title__2r6lQ')
        product_name += [element.text for element in title_elements]

        price_elements = driver.find_elements(By.CSS_SELECTOR, '.item_sale-price__3bTSG')
        price += [element.text for element in price_elements]

        # Page cap reached: stop before clicking so no page is loaded and then ignored.
        if max_pages is not None and page >= max_pages:
            break

        try:
            next_pagination = driver.find_element(By.CSS_SELECTOR, '.pagniation_btn-next__3_ivR')

            # A disabled button means this was the last page.
            # get_attribute may return None, hence the `or ""`.
            if "disabled" in (next_pagination.get_attribute("class") or ""):
                break

            next_pagination.click()
        except NoSuchElementException:
            # No pagination control at all (e.g. a single-page category):
            # keep what has been scraped instead of raising.
            print("No next page button found; stopping.")
            break
        except ElementClickInterceptedException:
            print("ElementClickInterceptedException!")
            break

        print("Clicked on button next page!")
        time.sleep(wait_seconds)

    # zip() pairs by position and truncates to the shorter list, so a length
    # mismatch means some titles and prices may be misaligned.
    if len(product_name) != len(price):
        print(f"Warning: scraped {len(product_name)} titles but {len(price)} prices; "
              "rows are paired by position and may be misaligned.")

    lep_data = pd.DataFrame(list(zip(product_name, price)), columns=['product_title', 'price'])
    return lep_data

# Category listing to scrape; the result is the raw product table used below
url = 'https://lep.vn/category/dam-i.104?'
df = lepProductInfo(link=url)

Clicked on button next page!
Clicked on button next page!
Clicked on button next page!
Clicked on button next page!
Clicked on button next page!
Clicked on button next page!
Clicked on button next page!
Clicked on button next page!


In [22]:
df.tail()

Unnamed: 0,product_title,price
155,Áo khoác dạ dáng lửng 1AK047DE,1.250.000 đ
156,Váy 2 dây ngực nhún 1VA02071XT,750.000 đ
157,Váy nhún chân bèo 3VA1314BE,795.000 đ
158,Set blazer tulip1VA2173HO,1.150.000 đ
159,Mira Dress 6VA251HO,750.000 đ


From the product title, we can extract the product ID, which will act as the primary key of the products table. Let's extract it.

In [23]:
# Pull the product code out of each title: the first run of 6 or more
# uppercase ASCII letters / digits. Titles with no such run get a missing value.
df['product_id'] = df['product_title'].str.extract(r'([A-Z0-9]{6,})', expand=False)

In [24]:
df.head()

Unnamed: 0,product_title,price,product_id
0,Váy nhún ngực xẻ đùi 1VA02010XL,750.000 đ,1VA02010XL
1,Váy wrap bút chì 1VA02002XL,695.000 đ,1VA02002XL
2,Váycổ vuông hở bụng 1VA02050XL,695.000 đ,1VA02050XL
3,váy hoa nhí cúc trai 1VA01811BE,795.000 đ,1VA01811BE
4,Áo yếm bèo xòe 1AO0178CA,495.000 đ,1AO0178CA


In [25]:
# Prices are scraped as text such as "750.000 đ", where the dots are thousands
# separators. Strip every non-digit character, then convert to float.
df['price'] = df['price'].str.replace(r'[^0-9]', '', regex=True).astype(float)

In [26]:
df.head()

Unnamed: 0,product_title,price,product_id
0,Váy nhún ngực xẻ đùi 1VA02010XL,750000.0,1VA02010XL
1,Váy wrap bút chì 1VA02002XL,695000.0,1VA02002XL
2,Váycổ vuông hở bụng 1VA02050XL,695000.0,1VA02050XL
3,váy hoa nhí cúc trai 1VA01811BE,795000.0,1VA01811BE
4,Áo yếm bèo xòe 1AO0178CA,495000.0,1AO0178CA


In [27]:
# Check the uniqueness of the product_id column, because it will be the primary key of the products table in the database.

In [28]:
df['product_id'].duplicated().sum()

5

In [29]:
# Keep one row per product_id: the first occurrence is kept (pandas default)
# and later rows with the same id are dropped.
df_unique = df.drop_duplicates(subset='product_id')

In [30]:
df_unique['product_id'].duplicated().sum()

0

In [31]:
len(df_unique)

155

In [36]:
type(df_unique)

pandas.core.frame.DataFrame

In [37]:
df_unique.head()

Unnamed: 0,product_title,price,product_id
0,Váy nhún ngực xẻ đùi 1VA02010XL,750000.0,1VA02010XL
1,Váy wrap bút chì 1VA02002XL,695000.0,1VA02002XL
2,Váycổ vuông hở bụng 1VA02050XL,695000.0,1VA02050XL
3,váy hoa nhí cúc trai 1VA01811BE,795000.0,1VA01811BE
4,Áo yếm bèo xòe 1AO0178CA,495000.0,1AO0178CA


In [40]:
print(len(df_unique.index))

155


In [41]:
# Renumber the rows 0..n-1; dropping duplicates left gaps in the index.
# drop=True discards the old index instead of keeping it as a column.
df_unique = df_unique.reset_index(drop=True)

In [46]:
df_unique.to_csv('lep_product.csv') # save the de-duplicated products for later use; the row index is written as the first CSV column

## Generate data

Use Python to generate random data about employees, customers, payments.

In [15]:
import numpy as np
from faker import Faker
import pandas as pd
import random

In [16]:

# Create Faker object
fake = Faker()

# Set random seeds for reproducibility. Faker draws from its own shared
# generator, so it has to be seeded separately from the stdlib `random` module.
Faker.seed(123)
random.seed(123)

# Generate random employee data: one tuple per employee, in `columns` order
employee_data = []
for i in range(200):
    employee_id = fake.unique.random_number(digits=6)  # unique within this Faker instance
    first_name = fake.first_name()
    last_name = fake.last_name()
    full_name = f"{first_name} {last_name}"
    birthday = fake.date_of_birth(minimum_age=18, maximum_age=50)
    # first name + first letter of the last name; not guaranteed to be unique
    email = f"{first_name.lower()}{last_name.lower()[0]}@gmail.com"
    phone_number = fake.phone_number()
    # gender is drawn independently of the generated first name
    gender = random.choice(['Male', 'Female'])
    department = random.choice(['IT', 'Sale', 'Finance', 'HR'])
    marital_status = random.choice(['Single', 'Married'])
    salary = random.randint(30000, 100000)
    hire_date = fake.date_between(start_date='-3y', end_date='today')

    employee_data.append((employee_id, first_name, last_name, full_name, birthday, email, phone_number, gender, department,
                          marital_status, salary, hire_date))

# Create DataFrame
columns = ['employee_id', 'first_name', 'last_name', 'full_name', 'birthday', 'email', 'employee_phone_number', 'gender', 'department',
           'marital_status', 'salary', 'hire_date']
employee = pd.DataFrame(employee_data, columns=columns)


In [17]:
employee.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,birthday,email,employee_phone_number,gender,department,marital_status,salary,hire_date
0,560826,Cathy,Webb,Cathy Webb,1993-05-10,cathyw@gmail.com,001-714-489-0170x7510,Male,Finance,Single,83377,2020-09-17
1,646855,Isabella,Valenzuela,Isabella Valenzuela,1982-12-16,isabellav@gmail.com,(690)730-4104x5519,Female,IT,Single,79692,2023-04-28
2,832469,Margaret,Dennis,Margaret Dennis,1980-07-21,margaretd@gmail.com,(597)862-0752x72930,Female,Finance,Single,50927,2021-12-06
3,298776,William,Graham,William Graham,1991-08-10,williamg@gmail.com,(226)371-9683x9613,Male,Finance,Married,62134,2021-08-31
4,68629,Mark,Wood,Mark Wood,1973-11-24,markw@gmail.com,384-599-2390x497,Male,IT,Married,41483,2021-06-02


In [43]:
# Save the generated employees to CSV; the row index is written as the first column
employee.to_csv('lep_employee.csv')

## Generating Customer Data

In [18]:

from faker import Faker
import random
import pandas as pd

# Create Faker object
fake = Faker()

# Set random seeds for reproducibility. Faker draws from its own shared
# generator, so it has to be seeded separately from the stdlib `random` module.
Faker.seed(123)
random.seed(123)

# Generate random customer data: one tuple per customer, in `columns` order
customer_data = []
for i in range(1000):
    customer_id = fake.unique.random_number(digits=7)  # unique within this Faker instance
    first_name = fake.first_name()
    last_name = fake.last_name()
    full_name = f"{first_name} {last_name}"
    birthday = fake.date_of_birth(minimum_age=18, maximum_age=80)
    # first name + first letter of the last name; not guaranteed to be unique
    email = f"{first_name.lower()}{last_name.lower()[0]}@gmail.com"
    # gender is drawn independently of the generated first name
    gender = random.choice(['Male', 'Female'])
    country = fake.country()
    city = fake.city()
    street = fake.street_address()

    customer_data.append((customer_id, first_name, last_name, full_name, birthday, email, gender, country, city, street))

# Create DataFrame
columns = ['customer_id', 'first_name', 'last_name', 'full_name', 'birthday', 'email', 'gender', 'country', 'city', 'street']
customer = pd.DataFrame(customer_data, columns=columns)

In [19]:
customer.head()

Unnamed: 0,customer_id,first_name,last_name,full_name,birthday,email,gender,country,city,street
0,8837775,Adrian,Woods,Adrian Woods,1992-12-24,adrianw@gmail.com,Male,Lesotho,Josephbury,9559 Rodriguez Haven Apt. 014
1,8345899,Kristen,Patrick,Kristen Patrick,1970-04-09,kristenp@gmail.com,Female,Italy,West Angelaside,2779 David Knoll Suite 911
2,3762240,David,Murphy,David Murphy,1959-05-19,davidm@gmail.com,Male,Portugal,Laurenfort,899 Curtis Station
3,2547658,Charles,Brown,Charles Brown,1998-02-27,charlesb@gmail.com,Female,Botswana,South Huntermouth,523 Mathew Road
4,5324229,Kelsey,Knight,Kelsey Knight,1961-02-03,kelseyk@gmail.com,Female,Niger,North Latoyachester,711 Bianca Drives


In [44]:
# Save the generated customers to CSV; the row index is written as the first column
customer.to_csv('lep_customer.csv')

##  Generating Payments Data

In [42]:
import random
import pandas as pd
from faker import Faker

# Set random seeds for reproducibility. Faker draws from its own shared
# generator, so it has to be seeded separately from the stdlib `random` module.
random.seed(123)
Faker.seed(123)

# Create Faker object
fake = Faker()

# Draw foreign keys from plain lists. random.choice(seq) evaluates seq[i] with
# an integer position; on a pandas Series that is a label lookup, which only
# works when the index happens to be exactly 0..n-1.
customer_ids = customer['customer_id'].tolist()
employee_ids = employee['employee_id'].tolist()
product_ids = df_unique['product_id'].tolist()

# Generate random payment data: one tuple per payment, in `columns` order
payment_data = []
for i in range(2000):
    payment_id = fake.unique.random_number(digits=9)  # unique within this Faker instance
    payment_date = fake.date_between(start_date='-3y', end_date='today')
    customer_id = random.choice(customer_ids)
    employee_id = random.choice(employee_ids)
    product_id = random.choice(product_ids)

    payment_data.append((payment_id, payment_date, customer_id, employee_id, product_id))

# Create DataFrame
columns = ['payment_id', 'payment_date', 'customer_id', 'employee_id', 'product_id']
payment = pd.DataFrame(payment_data, columns=columns)


In [45]:
# Save the generated payments to CSV; the row index is written as the first column
payment.to_csv("lep_payment.csv")

## Manipulate data

In [3]:
import numpy as np
import pandas as pd

import pandasql as ps

def sql(query: str):
    """Run a SQL query string through pandasql and return its result.

    The query is passed unchanged to ``ps.sqldf`` with no explicit environment,
    so table names in the query (e.g. ``payment``, ``product``) are presumably
    resolved by pandasql from the calling namespace — confirm against the
    pandasql documentation.
    """
    return ps.sqldf(query)

In [2]:
pip install pandasql

Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py): started
  Building wheel for pandasql (setup.py): finished with status 'done'
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26800 sha256=da304354c387f9e1e865273dbbf5fec5e088643e192f13c3412c28bafb0919c3
  Stored in directory: c:\users\hantt3\appdata\local\pip\cache\wheels\4c\a4\ce\ad31f7d9f6bfce32741955de12e76e3a6e7181c20321db3c66
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3
Note: you may need to restart the kernel to use updated packages.




In [10]:
# Reload the scraped products; index_col=0 uses the first CSV column as the row index
product = pd.read_csv("lep_product.csv", index_col=0)

In [11]:
product.head()

Unnamed: 0,product_title,price,product_id
0,Váy nhún ngực xẻ đùi 1VA02010XL,750000.0,1VA02010XL
1,Váy wrap bút chì 1VA02002XL,695000.0,1VA02002XL
2,Váycổ vuông hở bụng 1VA02050XL,695000.0,1VA02050XL
3,váy hoa nhí cúc trai 1VA01811BE,795000.0,1VA01811BE
4,Áo yếm bèo xòe 1AO0178CA,495000.0,1AO0178CA


In [13]:
# Reload the generated employees; index_col=0 uses the first CSV column as the row index
employee = pd.read_csv('lep_employee.csv',index_col=0)

In [14]:
employee.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,birthday,email,employee_phone_number,gender,department,marital_status,salary,hire_date
0,560826,Cathy,Webb,Cathy Webb,1993-05-10,cathyw@gmail.com,001-714-489-0170x7510,Male,Finance,Single,83377,2020-09-17
1,646855,Isabella,Valenzuela,Isabella Valenzuela,1982-12-16,isabellav@gmail.com,(690)730-4104x5519,Female,IT,Single,79692,2023-04-28
2,832469,Margaret,Dennis,Margaret Dennis,1980-07-21,margaretd@gmail.com,(597)862-0752x72930,Female,Finance,Single,50927,2021-12-06
3,298776,William,Graham,William Graham,1991-08-10,williamg@gmail.com,(226)371-9683x9613,Male,Finance,Married,62134,2021-08-31
4,68629,Mark,Wood,Mark Wood,1973-11-24,markw@gmail.com,384-599-2390x497,Male,IT,Married,41483,2021-06-02


In [15]:
# Reload the generated customers; index_col=0 uses the first CSV column as the row index
customer = pd.read_csv('lep_customer.csv', index_col=0)

In [19]:
customer.head()

Unnamed: 0,customer_id,first_name,last_name,full_name,birthday,email,gender,country,city,street
0,8837775,Adrian,Woods,Adrian Woods,1992-12-24,adrianw@gmail.com,Male,Lesotho,Josephbury,9559 Rodriguez Haven Apt. 014
1,8345899,Kristen,Patrick,Kristen Patrick,1970-04-09,kristenp@gmail.com,Female,Italy,West Angelaside,2779 David Knoll Suite 911
2,3762240,David,Murphy,David Murphy,1959-05-19,davidm@gmail.com,Male,Portugal,Laurenfort,899 Curtis Station
3,2547658,Charles,Brown,Charles Brown,1998-02-27,charlesb@gmail.com,Female,Botswana,South Huntermouth,523 Mathew Road
4,5324229,Kelsey,Knight,Kelsey Knight,1961-02-03,kelseyk@gmail.com,Female,Niger,North Latoyachester,711 Bianca Drives


In [16]:
# Reload the generated payments; index_col=0 uses the first CSV column as the row index
payment = pd.read_csv('lep_payment.csv', index_col = 0)

In [17]:
# Adding the price column to the table payment.
# The inner join keeps only payments whose product_id also appears in `product`.
# NOTE(review): `payment` is overwritten with the joined result, so re-running
# this cell joins a table that already has a price column — reload `payment`
# from the CSV before re-running.
query = '''
select p1.*, p2.price as price
from payment p1
inner join product p2
on p1.product_id = p2.product_id
'''

payment = sql(query)

In [18]:
payment.head()

Unnamed: 0,payment_id,payment_date,customer_id,employee_id,product_id,price
0,389032931,2023-03-23,8946605,511819,1VA01927XT,695000.0
1,139073443,2022-10-06,3082219,811126,1VA02121XT,850000.0
2,957908613,2021-08-03,3016036,452330,1VA01855TI,695000.0
3,918427211,2023-03-16,2669326,147602,1VA01837D,750000.0
4,268262738,2022-03-09,7284948,242020,1VA02038HO,850000.0


## Normalizing the Data

### Customer table

In [29]:
# Lookup table of distinct countries, in order of first appearance in `customer`,
# with a surrogate integer key 0..n-1
countries_df = customer[['country']].drop_duplicates().reset_index(drop=True)
countries_df['country_id'] = range(len(countries_df))
countries_df.head()

Unnamed: 0,country,country_id
0,Lesotho,0
1,Italy,1
2,Portugal,2
3,Botswana,3
4,Niger,4


In [33]:
# Look up each customer's country_id by country name. Series.map keeps the
# result aligned with `customer` row by row. A SQL JOIN without ORDER BY gives
# no guarantee that its rows come back in customer order, so assigning its
# output by position could attach ids to the wrong customers.
country_lookup = countries_df.set_index('country')['country_id']
country_ids = customer['country'].map(country_lookup)

# Every customer must resolve to a country; country_id is NOT NULL in the database
assert country_ids.notna().all(), "some customers have a country missing from countries_df"

# Connecting countries to customers by adding the foreign key: country_id
customer['country_id'] = country_ids

In [35]:
customer.tail()

Unnamed: 0,customer_id,first_name,last_name,full_name,birthday,email,gender,country,city,street,country_id
995,6830701,Phillip,Duran,Phillip Duran,1943-10-22,phillipd@gmail.com,Female,Turkmenistan,Charlestown,532 Morrison Ford Apt. 576,164
996,5045277,David,Rodriguez,David Rodriguez,1979-10-20,davidr@gmail.com,Female,Latvia,Lake Kimberlyport,8460 Mcfarland Village Apt. 545,78
997,1928291,Mary,Hernandez,Mary Hernandez,1946-06-25,maryh@gmail.com,Female,Bahrain,South Heatherborough,76444 Travis Street,35
998,12679,Robert,Lee,Robert Lee,1956-03-18,robertl@gmail.com,Male,Mauritius,Jimtown,956 Michelle Ways Suite 368,151
999,8624653,Leslie,Schaefer,Leslie Schaefer,1970-04-03,leslies@gmail.com,Female,Guam,Castillohaven,139 Jessica Roads Apt. 030,69


In [36]:
customer[customer['country']=='Bahrain']

Unnamed: 0,customer_id,first_name,last_name,full_name,birthday,email,gender,country,city,street,country_id
39,6099454,Michael,Allison,Michael Allison,1953-04-23,michaela@gmail.com,Female,Bahrain,West Johnathan,27289 Jennifer Circles,35
200,5123497,Jerry,Jordan,Jerry Jordan,1992-11-08,jerryj@gmail.com,Male,Bahrain,Jarvisport,885 William Springs,35
997,1928291,Mary,Hernandez,Mary Hernandez,1946-06-25,maryh@gmail.com,Female,Bahrain,South Heatherborough,76444 Travis Street,35


In [37]:
# Dropping the column country: the name now lives in countries_df and
# customer keeps only the country_id foreign key
customer = customer.drop(['country'],axis=1)

The gender column in the customer table is functionally dependent on the primary key, and no other non-key attributes depend on gender, so creating a separate table for it is not necessary.

### Employee Table

In [38]:
# Lookup table of distinct departments, in order of first appearance in `employee`,
# with a surrogate integer key 0..n-1
department_df = employee[['department']].drop_duplicates().reset_index(drop=True)
department_df['department_id'] = range(len(department_df))
department_df.head()

Unnamed: 0,department,department_id
0,Finance,0
1,IT,1
2,Sale,2
3,HR,3


In [43]:
# Look up each employee's department_id by department name. Series.map keeps
# the result aligned with `employee` row by row. A SQL JOIN without ORDER BY
# gives no guarantee that its rows come back in employee order, so assigning
# its output by position could attach ids to the wrong employees.
department_lookup = department_df.set_index('department')['department_id']
department_ids = employee['department'].map(department_lookup)

# Every employee must resolve to a department; department_id is NOT NULL in the database
assert department_ids.notna().all(), "some employees have a department missing from department_df"

# Connecting departments to employees by adding the foreign key: department_id
employee['department_id'] = department_ids

In [44]:
employee.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,birthday,email,employee_phone_number,gender,department,marital_status,salary,hire_date,department_id
0,560826,Cathy,Webb,Cathy Webb,1993-05-10,cathyw@gmail.com,001-714-489-0170x7510,Male,Finance,Single,83377,2020-09-17,0
1,646855,Isabella,Valenzuela,Isabella Valenzuela,1982-12-16,isabellav@gmail.com,(690)730-4104x5519,Female,IT,Single,79692,2023-04-28,1
2,832469,Margaret,Dennis,Margaret Dennis,1980-07-21,margaretd@gmail.com,(597)862-0752x72930,Female,Finance,Single,50927,2021-12-06,0
3,298776,William,Graham,William Graham,1991-08-10,williamg@gmail.com,(226)371-9683x9613,Male,Finance,Married,62134,2021-08-31,0
4,68629,Mark,Wood,Mark Wood,1973-11-24,markw@gmail.com,384-599-2390x497,Male,IT,Married,41483,2021-06-02,1


In [45]:
# Dropping the column department: the name now lives in department_df and
# employee keeps only the department_id foreign key
employee = employee.drop('department',axis = 1)

Now that the tables are normalized, I will connect to the database and load the data according to my design. The tool I used as my database is MySQL.

## Loading the data into RDBMS

### Connecting Python to MySQL

In [47]:
import os
from getpass import getpass

import pymysql

# Connecting to the local MySQL server (the lep_store schema is selected in a later cell).
# The password is read from the MYSQL_PASSWORD environment variable, with an
# interactive prompt as fallback, so no credential is stored in the notebook.
# PyMySQL expects the password as a string.
# utf8mb4 keeps the Vietnamese product titles intact on insert.
connection = pymysql.connect(host='localhost',
                             port=3306,
                             user='root',
                             password=os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: '),
                             charset='utf8mb4')
# Creating a Cursor object
cursor = connection.cursor()

### Creating a new Schema

In [49]:
# Recreate the lep_store schema from scratch.
# WARNING: this drops the existing lep_store schema and everything in it.
cursor.execute('''
drop schema if exists lep_store;
''')

cursor.execute('''
create schema lep_store;
''')

# Use the new schema
cursor.execute('''
use lep_store;
''')

0

### Generating empty tables

#### countries_df

In [59]:
# Start from a clean countries table
cursor.execute('''
DROP TABLE IF EXISTS countries;
''')

# Lookup table for countries_df: country name plus integer primary key.
# customers.country_id references this table.
cursor.execute('''
CREATE TABLE countries (
    country VARCHAR(100) NOT NULL,
    country_id INT PRIMARY KEY
    );
''')

0

#### department_df

In [63]:
# Start from a clean departments table
cursor.execute('''
DROP TABLE IF EXISTS departments;
''')

# Lookup table for department_df: integer primary key plus department name.
# employees.department_id references this table.
cursor.execute('''
CREATE TABLE departments (
    department_id INT PRIMARY KEY,
    department VARCHAR(100) NOT NULL
    );
''')

0

#### product

In [77]:
# Start from a clean products table
cursor.execute('''
DROP TABLE IF EXISTS products;
''')

# Scraped products, keyed by the product code extracted from the title.
# price is DECIMAL rather than FLOAT: FLOAT is an approximate type and is not
# suited to monetary amounts.
cursor.execute('''
CREATE TABLE products (
    product_id VARCHAR(100) NOT NULL PRIMARY KEY,
    name VARCHAR(100) NOT NULL,
    price DECIMAL(12, 2) NOT NULL
    );
''')

0

#### customer

In [80]:
# Start from a clean customers table
cursor.execute('''
DROP TABLE IF EXISTS customers;
''')

# Customers after normalization: the country name is replaced by country_id,
# a foreign key into countries. The countries table must already exist.
cursor.execute('''
CREATE TABLE customers (
    customer_id INT PRIMARY KEY NOT NULL,
    first_name VARCHAR(100) NOT NULL,
    last_name VARCHAR(100) NOT NULL,
    full_name VARCHAR(100) NOT NULL,
    birthday date NOT NULL,
    email VARCHAR(100) NOT NULL,
    gender VARCHAR(10) NOT NULL,
    city VARCHAR(100) NOT NULL,
    street VARCHAR(100) NOT NULL,
    country_id INT NOT NULL,
    
    FOREIGN KEY (country_id) REFERENCES countries (country_id)
);
''')

0

#### Employee

In [64]:
# Start from a clean employees table
cursor.execute('''
DROP TABLE IF EXISTS employees;
''')

# Employees after normalization: the department name is replaced by
# department_id, a foreign key into departments. The departments table must
# already exist. The phone number is the only nullable column.
cursor.execute('''
CREATE TABLE employees (
    employee_id INT PRIMARY KEY NOT NULL,
    first_name VARCHAR(100) NOT NULL,
    last_name VARCHAR(100) NOT NULL,
    full_name VARCHAR(100) NOT NULL,
    email VARCHAR(100) NOT NULL,
    birthday date NOT NULL,
    employee_phone_number VARCHAR(30),
    gender VARCHAR (10) NOT NULL,
    department_id INT NOT NULL,
    marital_status VARCHAR (15) NOT NULL,
    salary INT NOT NULL,
    hire_date date NOT NULL,

    
    FOREIGN KEY (department_id) REFERENCES departments(department_id)
);
''')

0

#### Payment


In [87]:
# Start from a clean payments table
cursor.execute('''
DROP TABLE IF EXISTS payments;
''')

# One row per payment. customer_id, employee_id and product_id are declared as
# foreign keys so the database enforces that every payment points at existing
# rows; the customers, employees and products tables must already exist.
# price is DECIMAL rather than FLOAT: FLOAT is an approximate type and is not
# suited to monetary amounts.
cursor.execute('''
CREATE TABLE payments (
    payment_id INT NOT NULL PRIMARY KEY,
    date DATE NOT NULL,
    customer_id INT NOT NULL,
    employee_id INT NOT NULL,
    product_id VARCHAR(100) NOT NULL,
    price DECIMAL(12, 2) NOT NULL,

    FOREIGN KEY (customer_id) REFERENCES customers (customer_id),
    FOREIGN KEY (employee_id) REFERENCES employees (employee_id),
    FOREIGN KEY (product_id) REFERENCES products (product_id)
    );
''')

0

### Populate the tables

In [69]:
# Convert the DataFrame rows into plain tuples; the columns are selected
# explicitly so the values always line up with the INSERT column list
records = [tuple(row) for row in countries_df[['country', 'country_id']].to_numpy()]

# Parameterized query: the driver escapes every value, so country names
# containing quotes are inserted safely, unlike SQL text built with str.format
query = "INSERT INTO countries (country, country_id) VALUES (%s, %s)"

# Execute the bulk insert query with the list of tuples
cursor.executemany(query, records)

# Commit the transaction
connection.commit()

In [None]:
# Other methods: Convert the DataFrame into a list of tuples
#records = [tuple(row) for row in countries_df.to_numpy()]

# Create the parameterized query
#query = "INSERT INTO countries (country, country_id) VALUES (%s, %s)"

# Execute the bulk insert query with the list of tuples
#cursor.executemany(query, records)

# Commit the transaction
#connection.commit()


In [71]:
# Rows of department_df as plain tuples, in the frame's column order
records = list(map(tuple, department_df.to_numpy()))

# Parameterized INSERT; the placeholders follow the column list one-to-one
query = "INSERT INTO departments (department, department_id) VALUES (%s, %s)"

# Bulk insert, then commit the transaction
cursor.executemany(query, records)
connection.commit()

In [78]:
# Rows of the product frame as plain tuples, in the frame's column order
records = list(map(tuple, product.to_numpy()))

# Parameterized INSERT; the placeholders follow the column list one-to-one
query = "INSERT INTO products (name, price, product_id) VALUES (%s, %s, %s)"

# Bulk insert, then commit the transaction
cursor.executemany(query, records)
connection.commit()


In [83]:
# Rows of the customer frame as plain tuples, in the frame's column order
records = list(map(tuple, customer.to_numpy()))

# Parameterized INSERT; the placeholders follow the column list one-to-one
query = '''
INSERT INTO customers (customer_id, first_name, last_name, full_name, birthday, email, gender, city, street, country_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
'''

# Bulk insert, then commit the transaction
cursor.executemany(query, records)
connection.commit()


In [84]:
employee.head()

Unnamed: 0,employee_id,first_name,last_name,full_name,birthday,email,employee_phone_number,gender,marital_status,salary,hire_date,department_id
0,560826,Cathy,Webb,Cathy Webb,1993-05-10,cathyw@gmail.com,001-714-489-0170x7510,Male,Single,83377,2020-09-17,0
1,646855,Isabella,Valenzuela,Isabella Valenzuela,1982-12-16,isabellav@gmail.com,(690)730-4104x5519,Female,Single,79692,2023-04-28,1
2,832469,Margaret,Dennis,Margaret Dennis,1980-07-21,margaretd@gmail.com,(597)862-0752x72930,Female,Single,50927,2021-12-06,0
3,298776,William,Graham,William Graham,1991-08-10,williamg@gmail.com,(226)371-9683x9613,Male,Married,62134,2021-08-31,0
4,68629,Mark,Wood,Mark Wood,1973-11-24,markw@gmail.com,384-599-2390x497,Male,Married,41483,2021-06-02,1


In [85]:
# Rows of the employee frame as plain tuples, in the frame's column order
records = list(map(tuple, employee.to_numpy()))

# Parameterized INSERT; the placeholders follow the column list one-to-one
query = '''
INSERT INTO employees (employee_id, first_name, last_name, full_name, birthday, email, employee_phone_number, gender, marital_status, salary, hire_date, department_id)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
'''

# Bulk insert, then commit the transaction
cursor.executemany(query, records)
connection.commit()


In [86]:
payment.head()

Unnamed: 0,payment_id,payment_date,customer_id,employee_id,product_id,price
0,389032931,2023-03-23,8946605,511819,1VA01927XT,695000.0
1,139073443,2022-10-06,3082219,811126,1VA02121XT,850000.0
2,957908613,2021-08-03,3016036,452330,1VA01855TI,695000.0
3,918427211,2023-03-16,2669326,147602,1VA01837D,750000.0
4,268262738,2022-03-09,7284948,242020,1VA02038HO,850000.0


In [89]:
# Rows of the payment frame as plain tuples, in the frame's column order
records = list(map(tuple, payment.to_numpy()))

# Parameterized INSERT; the placeholders follow the column list one-to-one
query = '''
INSERT INTO payments (payment_id, date, customer_id, employee_id, product_id, price)
VALUES (%s, %s, %s, %s, %s, %s)
'''

# Bulk insert, then commit the transaction
cursor.executemany(query, records)
connection.commit()