# Setup Cloud SQL - SQL Alchemy
---

Install [pymysql](https://docs.sqlalchemy.org/en/14/dialects/mysql.html#module-sqlalchemy.dialects.mysql.pymysql)

In [181]:
# %pip installs into the environment of the running kernel.
%pip install pymysql



Authenticate using ***application default credentials***

In [None]:
# Shell command: starts gcloud's application-default login.
!gcloud auth application-default login

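When connecting over the instance's public IP, add an authorized network to the Cloud SQL instance. `0.0.0.0/0` allows every IPv4 address, so prefer a narrower range outside of quick tests.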



### Code

In [182]:
# import create_engine from sql alchemy
from sqlalchemy import create_engine
from sqlalchemy.engine.url import URL

# import pandas
import pandas as pd

# import drive
from google.colab import drive

Mount gdrive folder to get SSL files

In [183]:
# Mount Google Drive at /content/drive so files stored there can be opened by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Build connection without SSL

In [None]:
# The password is read at run time (environment variable first, interactive
# prompt as fallback) so it is never stored in the notebook. The value that
# used to be written here has been exposed and should be rotated.
import os
from getpass import getpass

db_password = os.environ.get("CLOUD_SQL_PASSWORD") or getpass("Cloud SQL password: ")

engine = create_engine(
  URL.create(
    drivername="mysql+pymysql",
    username="root",
    password=db_password,
    host="34.72.191.193",
    port=3306,
    database="dummy_db_2"
  )
)

# Opens the connection used by the query cells below.
conn = engine.connect()

Build connection with SSL

In [184]:
# The password is read at run time (environment variable first, interactive
# prompt as fallback) so it is never stored in the notebook. The value that
# used to be written here has been exposed and should be rotated.
import os
from getpass import getpass

db_password = os.environ.get("CLOUD_SQL_PASSWORD") or getpass("Cloud SQL password: ")

path_to_ssl = '/content/drive/MyDrive/Colab Notebooks/creds/SSL'

# Certificate files are handed to the driver through connect_args.
ssl_files={
    "ssl_ca": f"{path_to_ssl}/server-ca.pem",
    "ssl_cert": f"{path_to_ssl}/client-cert.pem",
    "ssl_key": f"{path_to_ssl}/client-key.pem"
}

engine = create_engine(
  URL.create(
    drivername="mysql+pymysql",
    username="root",
    password=db_password,
    host="34.72.191.193",
    port=3306,
    database="dummy_db_2"
  ),
  connect_args=ssl_files
)

# Replaces any connection object created by an earlier cell.
conn = engine.connect()

# Create Table
---

To create table *(this query doesn't return a result)*

In [185]:
# DDL for the `post` table. IF NOT EXISTS lets this cell be re-run without
# failing once the table is already there.
query = """
CREATE TABLE IF NOT EXISTS post (
  id INT(3) PRIMARY KEY AUTO_INCREMENT,
  company_name VARCHAR(255),
  skills VARCHAR(255),
  more_info VARCHAR(255),
  publish_date VARCHAR(255),
  location VARCHAR(255),
  min_yoe INT(2),
  max_yoe INT(2)
)
"""

# Executed for its side effect only; no rows are fetched from query_result.
query_result = conn.execute(query)

# Web Scraping Using Beautiful Soup
---

Mount gdrive folder

In [186]:
# Same import and mount as in the setup section; presumably repeated so the
# scraping section can be run on its own.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Change directory to the working directory

In [187]:
# Later cells use relative paths (posts/<page>.txt), so the working directory matters.
%cd /content/drive/MyDrive/Colab Notebooks/github/web-scraping-to-cloud-sql

/content/drive/MyDrive/Colab Notebooks/github/web-scraping-to-cloud-sql


Import Libraries

In [188]:
# import beautiful soup
from bs4 import BeautifulSoup

# import requests
import requests

# import time
import time

Define the `fetch_html` function, which fetches the HTML text of one results page for a given page number

In [189]:
def fetch_html(page, timeout=30):
  """Download one page of the TimesJobs search results for the keyword "python".

  Args:
    page: Page number, appended to the URL as the `sequence` query parameter.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    The response body as text.

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.Timeout: If no response arrives within `timeout` seconds.
  """
  base_url = 'https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=python'
  page_param = f'&sequence={page}'

  # Without a timeout a stalled connection can block the notebook indefinitely.
  response = requests.get(f'{base_url}{page_param}', timeout=timeout)
  # Fail loudly instead of handing an error page to the parser.
  response.raise_for_status()

  return response.text

Define the `parse_html` function, which parses HTML text using [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) with Python's built-in `html.parser`

In [190]:
def parse_html(html_text):
  """Return a BeautifulSoup tree built from `html_text` with the 'html.parser' backend."""
  return BeautifulSoup(html_text, 'html.parser')

Get user input

In [191]:
# Ask how many result pages to fetch; the reply is kept as the raw string typed.
prompt_text = 'Input how many pages you want to scrape?'
print(prompt_text)
page_number = input('> ')

Input how many pages you want to scrape?
> 4


Define the `find_jobs()` function, which scrapes the **company**, **skills**, **publish date**, and **more info** link of each job post

In [192]:
def find_jobs(soup):
  """Collect company, skills, publish date and detail link for every job card.

  Args:
    soup: Parsed page offering `find_all`; every card it returns must offer
      `find` and a `header.h2.a` link carrying an 'href'.

  Returns:
    A list with one dict per card, keyed by 'company', 'skills',
    'publish_date' and 'more_info'.
  """
  job_cards = soup.find_all('li', class_ = 'clearfix job-bx wht-shd-bx')

  def card_to_dict(card):
    # Text fields are stripped of surrounding whitespace; the link is taken
    # from the anchor inside the card's header.
    return {
      'company': card.find('h3', class_ = 'joblist-comp-name').text.replace(' ', ' ').strip(),
      'skills': card.find('span', class_ = 'srp-skills').text.strip(),
      'publish_date': card.find('span', class_ = 'sim-posted').text.strip(),
      'more_info': card.header.h2.a['href'],
    }

  return [card_to_dict(card) for card in job_cards]

Define the `find_loct_and_yoe` function, which scrapes the **location** and the minimum/maximum **years of experience** of each job post

In [193]:
def find_loct_and_yoe(soup):
  """Extract location and years-of-experience range for every job card.

  Args:
    soup: Parsed page offering `find_all`; each matched `ul` is expected to
      hold the experience text in its first `li` and the location in its second.

  Returns:
    A list with one dict per matched `ul`, keyed by 'min_yoe', 'max_yoe' and
    'location'. 'min_yoe'/'max_yoe' are ints, or None when the experience text
    holds no "<number> - <number>" range. 'location' is the second line of the
    location text, or None when it is missing.
  """
  import re

  # First "<digits> - <digits>" pair found in the experience text.
  yoe_range = re.compile(r'(\d+)\s*-\s*(\d+)')

  list_dict_loct_yoe = []

  for detail_list in soup.find_all('ul', class_="top-jd-dtl"):
    items = detail_list.find_all('li')

    # Whole numbers are captured, so "10 - 15 yrs" yields 10 and 15; keeping a
    # single character on each side of the dash would yield 0 and 1.
    match = yoe_range.search(items[0].text) if items else None
    if match:
      min_year, max_year = int(match.group(1)), int(match.group(2))
    else:
      min_year = max_year = None

    # The location is the second line of the second <li>, when both exist.
    loct = None
    if len(items) > 1:
      loc_lines = items[1].text.strip().split('\n')
      if len(loc_lines) > 1:
        loct = loc_lines[1]

    # One entry is always appended so results stay aligned by index with the
    # list produced for the same page by the job-card scraper.
    list_dict_loct_yoe.append({
      'min_yoe': min_year,
      'max_yoe': max_year,
      'location': loct,
    })

  return list_dict_loct_yoe

Define the `combine_fields()` function, which merges the **location** and **yoe** fields into the matching **job** entries

In [194]:
def combine_fields(jobs, loct_and_yoe):
  """Merge each location/experience dict into the job dict at the same index.

  The dicts inside `jobs` are updated in place and the same list object is
  returned. Indexing `loct_and_yoe` raises IndexError when it is shorter
  than `jobs`.
  """
  for position, job in enumerate(jobs):
    job.update(loct_and_yoe[position])

  return jobs

Define `write_to_file` function that writes jobs into `.txt` file for every page

In [195]:
def write_to_file(jobs, page):
  """Append a plain-text summary of `jobs` to posts/<page>.txt.

  Args:
    jobs: Iterable of dicts; the keys 'company', 'skills', 'more_info',
      'publish_date', 'location', 'min_yoe' and 'max_yoe' are read with
      `.get`, so a missing key is written as "None".
    page: Value used as the file name (posts/<page>.txt).

  The file is opened in append mode, so calling this again for the same page
  adds to the existing content rather than replacing it.
  """
  import os

  # open() does not create missing directories.
  os.makedirs("posts", exist_ok=True)

  # Explicit encoding so non-ASCII text is written the same way on every platform.
  with open(f"posts/{page}.txt", "a", encoding="utf-8") as f:
    for job in jobs:
      f.write(f"company name: {job.get('company')} \n")
      f.write(f"skills: {job.get('skills')} \n")
      f.write(f"more info: {job.get('more_info')} \n")
      f.write(f"publish date: {job.get('publish_date')} \n")
      f.write(f"location: {job.get('location')} \n")
      f.write(f"min_yoe: {job.get('min_yoe')} \n")
      f.write(f"max_yoe: {job.get('max_yoe')} \n")
      f.write("--------------------------------------- \n")

  print(f"file saved: {page}")

Main Code

In [196]:
# Scrape pages 1..page_number: fetch, parse, extract both field groups, merge
# them, save the page to its own file and accumulate every post in result_posts.
result_posts = []

for page in range(1, int(page_number) + 1):
  html_text = fetch_html(page)
  soup = parse_html(html_text)
  jobs = find_jobs(soup)
  loct_and_yoe = find_loct_and_yoe(soup)
  posts = combine_fields(jobs, loct_and_yoe)
  write_to_file(posts, page)

  result_posts.extend(posts)

file saved: 1
file saved: 2
file saved: 3
file saved: 4


# Insert data to db
---

In [201]:
# Parameterised INSERT: the values are bound by the driver instead of being
# formatted into the SQL text.
# NOTE(review): passing a plain string to execute() is the SQLAlchemy 1.x
# calling style; confirm the installed version before upgrading.
query="""
  INSERT INTO dummy_db_2.post (company_name, skills, more_info, publish_date, location, min_yoe, max_yoe) 
  VALUES (%s, %s, %s, %s, %s, %s, %s)
"""

num_of_rows_added = 0
for post in result_posts:
  row_values = (post['company'], post['skills'], post['more_info'], post['publish_date'], post['location'], post['min_yoe'], post['max_yoe'])
  # Named insert_result rather than `id`: a global called `id` would shadow
  # the builtin id() for every later cell of the notebook.
  insert_result = conn.execute(query, row_values)
  num_of_rows_added += insert_result.rowcount

print(f'success inserted {num_of_rows_added} rows')

success inserted 100 rows


To show a table

Fetch methods:


1.   `.fetchall()`: to return all rows of query result
2.   `.fetchmany(size)`: to return number of rows specified by **size**
3.   `.fetchone()`: to return only the 1st row of query result



In [199]:
# Column names: first field of every row returned by SHOW COLUMNS.
query_get_cols = "SHOW COLUMNS FROM dummy_db_2.post"
column_names = conn.execute(query_get_cols).fetchall()
columns = [field_info[0] for field_info in column_names]

# Table content: every row, fetched in one go.
query_get_data = "select * from dummy_db_2.post"
query_result = conn.execute(query_get_data)
data = query_result.fetchall()

# Leaving the frame as the last expression renders it as the cell output.
df = pd.DataFrame(data=data, columns=columns)
df

Unnamed: 0,id,company_name,skills,more_info,publish_date,location,min_yoe,max_yoe
0,1,Pure Tech Codex Private Limited,"rest , python , database , django , de...",https://www.timesjobs.com/job-detail/python-pu...,Posted 1 day ago,Pune,2,3
1,2,Surya Informatics Solutions Pvt. Ltd.,"python , web technologies , linux , mobi...",https://www.timesjobs.com/job-detail/python-su...,Posted few days ago,Chennai,0,3
2,3,Gemini Solutions,"python , mobile , svn , nosql , python...",https://www.timesjobs.com/job-detail/qa-python...,Posted few days ago,Gurgaon,4,7
3,4,Electrobrain modern technologies pvt ltd\r\n ...,"CSS , HTML , Java , Sql , c ++ , Oracle",https://www.timesjobs.com/job-detail/python-de...,Posted a month ago,"Bhagalpur, Muzaffarpur, Patna",2,5
4,5,GT SOLUSN\r\n (More Jobs),"""Python""",https://www.timesjobs.com/job-detail/python-de...,Posted few days ago,"Pune, Jaipur",3,8
...,...,...,...,...,...,...,...,...
95,96,ari global solutions,"udp , c , python , exception handling ...",https://www.timesjobs.com/job-detail/c-with-py...,Posted few days ago,Hyderabad/Secunderabad,2,5
96,97,APPLYCUP HIRING SOLUTIONS LLP,"python , html5 , svn , storage , javas...",https://www.timesjobs.com/job-detail/python-de...,Posted 2 days ago,Ahmedabad,2,6
97,98,RGF Professional,"c , rest api , sql , java , teamcity...",https://www.timesjobs.com/job-detail/ee-python...,Posted 2 days ago,Gurgaon,6,9
98,99,APPLYCUP HIRING SOLUTIONS LLP,"security compliance , html5 , storage , ...",https://www.timesjobs.com/job-detail/python-de...,Posted 2 days ago,Surat,3,8


To truncate table *(this query doesn't return a result)*

In [200]:
# Destructive: removes every row from dummy_db_2.post. Run only to reset the table.
query = "TRUNCATE TABLE dummy_db_2.post"

# Executed for its side effect only; no rows are fetched from query_result.
query_result = conn.execute(query)