In [1]:
from flask import Flask,render_template,request,redirect,send_file
import urllib.request as req
from bs4 import BeautifulSoup
import csv

LIMIT=50

def extract_indeed_pages(url):
    """Return the number of result pages advertised by a search page.

    Fetches ``url``, looks up the ``div.pagination`` element and reads the
    numeric labels of the links inside it. The final link is skipped, as in
    the original implementation (presumably it is the "next" arrow; confirm
    against the live markup).

    Args:
        url: Search URL to fetch.

    Returns:
        int: The number on the last numeric pagination link, or ``1`` when
        the page has no pagination block or no numeric links, so callers
        still scrape the first page.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with req.urlopen(url) as result:
        soup = BeautifulSoup(result, "html.parser")

    pagination = soup.find("div", {"class": "pagination"})
    if pagination is None:
        # Few enough results that no pagination is rendered.
        return 1

    pages = []
    for link in pagination.find_all("a")[:-1]:
        try:
            pages.append(int(link.string))
        except (TypeError, ValueError):
            # Link without a plain numeric label; skip it.
            continue

    return pages[-1] if pages else 1

def extract_job(html):
    """Build a job record from a single search-result card.

    Args:
        html: A BeautifulSoup tag for one result card. It must carry a
            ``data-jk`` attribute and contain an ``a.jobtitle`` element with
            a ``title`` attribute and a ``div.recJobLoc`` element with a
            ``data-rc-loc`` attribute.

    Returns:
        dict: ``title``, ``company``, ``location`` and ``link`` entries.
        ``company`` is ``None`` when the card has no ``span.company``.
    """
    title = html.find("a", {"class": "jobtitle"})["title"]
    loc = html.find("div", {"class": "recJobLoc"})["data-rc-loc"]

    company = html.find("span", {"class": "company"})
    if company is not None:
        # ``.string`` is None when the span holds more than one child (for
        # example whitespace around a nested <a>), which previously produced
        # the literal text "None". ``get_text`` flattens nested markup.
        company = company.get_text(" ", strip=True)

    job_id = html["data-jk"]
    link = f"https://kr.indeed.com/%EC%B1%84%EC%9A%A9%EB%B3%B4%EA%B8%B0?jk={job_id}"
    return {'title': title, 'company': company, 'location': loc,
            "link": link}

def extract_indeed_jobs(last_page,url):
    """Scrape every result page and collect the job records found on them.

    Args:
        last_page: Number of pages to fetch; pages ``0 .. last_page - 1``
            are requested.
        url: Base search URL. ``&start=<offset>`` is appended for each page,
            where the offset is ``page * LIMIT``.

    Returns:
        list[dict]: One record per ``div[data-tn-component=organicJob]``
        element, as produced by ``extract_job``.
    """
    jobs = []
    for page in range(last_page):
        print(f"scarapping indeed page {page}")
        # Parse inside the ``with`` so the response is closed as soon as the
        # document has been read, rather than leaking one handle per page.
        with req.urlopen(f"{url}&start={page*LIMIT}") as result:
            soup = BeautifulSoup(result, "html.parser")
        cards = soup.find_all("div", {"data-tn-component": "organicJob"})
        jobs.extend(extract_job(card) for card in cards)

    return jobs

def get_indeed_jobs(word):
    """Scrape all Indeed (kr) job listings matching a search word.

    Args:
        word: Free-text search term. It is percent-encoded before being
            placed in the query string, so spaces, non-ASCII text and
            characters such as ``&`` or ``+`` are sent as part of the term.

    Returns:
        list[dict]: Job records as returned by ``extract_indeed_jobs``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    search_url = (
        f"https://kr.indeed.com/%EC%B7%A8%EC%97%85?q={quote_plus(word)}&limit={LIMIT}"
    )
    last_page = extract_indeed_pages(search_url)
    return extract_indeed_jobs(last_page, search_url)

def save_to_file(jobs,word):
    """Write job records to ``<word>_jobs.csv`` in the working directory.

    The file is written as UTF-8 with a BOM (``utf-8-sig``) and has the
    header ``title, company, location, link``.

    Args:
        jobs: Iterable of dicts with ``title``, ``company``, ``location``
            and ``link`` keys. Missing keys are written as empty cells.
        word: Search word used as the file-name prefix.

    Returns:
        None. Prints ``done!`` once the file has been written.
    """
    columns = ["title", "company", "location", "link"]
    # ``newline=""`` lets the csv module control line endings (otherwise
    # blank rows appear on Windows); ``with`` guarantees the data is flushed
    # and the handle closed before the caller reads the file.
    with open(f"{word}_jobs.csv", encoding='utf-8-sig', mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        for job in jobs:
            # Select by key so cells always line up with the header,
            # whatever order the dict was built in.
            writer.writerow([job.get(column) for column in columns])
    return print("done!")

In [None]:
# Flask application object; "SuperScrapper" is passed as its import name.
app=Flask("SuperScrapper")
# In-memory cache of scrape results, keyed by the lower-cased search word.
# It is a plain dict, so its contents are lost when the process stops.
db={}

@app.route("/")
def home():
    """Render the ``home.html`` template for the site root."""
    landing_page = render_template("home.html")
    return landing_page
@app.route("/report")
def report():
    """Show the scraped jobs for the ``word`` query parameter.

    Redirects to ``/`` when ``word`` is missing or empty. Otherwise the word
    is lower-cased, looked up in the in-memory ``db`` cache, scraped via
    ``get_indeed_jobs`` on a cache miss, and rendered with ``report.html``.
    """
    word = request.args.get('word')
    if not word:
        return redirect("/")

    word = word.lower()
    # Membership test rather than truthiness: a search that legitimately
    # returned zero jobs is cached too, instead of re-scraping every time.
    if word not in db:
        db[word] = get_indeed_jobs(word)
    jobs = db[word]

    return render_template("report.html",searchingBy=word,resultNumber=len(jobs),jobs=jobs)

@app.route("/export")
def export():
    """Send the cached jobs for ``word`` as a CSV attachment.

    Redirects to ``/`` when ``word`` is missing, contains a path separator,
    has no cached (non-empty) result in ``db``, or when writing or sending
    the file fails.
    """
    word = request.args.get('word')
    if not word:
        return redirect("/")

    word = word.lower()
    # ``word`` becomes part of a file name; refuse path separators so a
    # request cannot write or read outside the working directory.
    if "/" in word or "\\" in word:
        return redirect("/")

    jobs = db.get(word)
    if not jobs:
        return redirect("/")

    try:
        save_to_file(jobs,word)
        return send_file(f"{word}_jobs.csv",as_attachment=True)
    except Exception:
        # Keep the best-effort redirect, but record the failure and do not
        # swallow KeyboardInterrupt/SystemExit the way a bare ``except`` did.
        app.logger.exception("export failed for %r", word)
        return redirect("/")
    
# Start Flask's built-in server bound to 127.0.0.1. The call blocks, so this
# cell keeps running until the server is interrupted.
app.run(host="127.0.0.1")

 * Serving Flask app "SuperScrapper" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [24/Jul/2020 15:15:18] "[37mGET / HTTP/1.1[0m" 200 -


scarapping indeed page 0
scarapping indeed page 1
scarapping indeed page 2
scarapping indeed page 3
scarapping indeed page 4
scarapping indeed page 5
scarapping indeed page 6


127.0.0.1 - - [24/Jul/2020 15:15:34] "[37mGET /report?word=go HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jul/2020 15:15:35] "[37mGET /export?word=go HTTP/1.1[0m" 200 -


done!
scarapping indeed page 0
scarapping indeed page 1
scarapping indeed page 2
scarapping indeed page 3
scarapping indeed page 4
scarapping indeed page 5
scarapping indeed page 6
scarapping indeed page 7
scarapping indeed page 8
scarapping indeed page 9
scarapping indeed page 10
scarapping indeed page 11
scarapping indeed page 12
scarapping indeed page 13
scarapping indeed page 14
scarapping indeed page 15
scarapping indeed page 16


127.0.0.1 - - [24/Jul/2020 15:18:36] "[37mGET /report?word=python HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jul/2020 15:21:03] "[37mGET /export?word=python HTTP/1.1[0m" 200 -


done!


127.0.0.1 - - [24/Jul/2020 15:21:40] "[37mGET / HTTP/1.1[0m" 200 -


scarapping indeed page 0
scarapping indeed page 1
scarapping indeed page 2
scarapping indeed page 3
scarapping indeed page 4
scarapping indeed page 5
scarapping indeed page 6
scarapping indeed page 7
scarapping indeed page 8
scarapping indeed page 9
scarapping indeed page 10
scarapping indeed page 11
scarapping indeed page 12
scarapping indeed page 13
scarapping indeed page 14
scarapping indeed page 15
scarapping indeed page 16
scarapping indeed page 17
scarapping indeed page 18
scarapping indeed page 19


127.0.0.1 - - [24/Jul/2020 15:22:21] "[37mGET /report?word=c HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jul/2020 15:23:29] "[37mGET /export?word=c HTTP/1.1[0m" 200 -


done!


127.0.0.1 - - [24/Jul/2020 15:24:48] "[37mGET /report?word=python HTTP/1.1[0m" 200 -


scarapping indeed page 0
scarapping indeed page 1


127.0.0.1 - - [24/Jul/2020 15:24:56] "[37mGET /report?word=vue HTTP/1.1[0m" 200 -
127.0.0.1 - - [24/Jul/2020 15:25:19] "[37mGET /report?word=go HTTP/1.1[0m" 200 -


scarapping indeed page 0
scarapping indeed page 1
scarapping indeed page 2
scarapping indeed page 3
scarapping indeed page 4
scarapping indeed page 5
scarapping indeed page 6
scarapping indeed page 7
scarapping indeed page 8
scarapping indeed page 9
scarapping indeed page 10
scarapping indeed page 11
scarapping indeed page 12
scarapping indeed page 13
scarapping indeed page 14
scarapping indeed page 15
scarapping indeed page 16
scarapping indeed page 17
scarapping indeed page 18
scarapping indeed page 19


127.0.0.1 - - [24/Jul/2020 15:26:08] "[37mGET /report?word=web HTTP/1.1[0m" 200 -
