# Project 4 - Web Scraping & Logistic Regression

## Web Scraping Notebook

The following is my code for web scraping indeed.com for data science job postings

In [1]:
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
import re

pd.set_option('display.max_rows', None, 'display.max_columns', None)

In [2]:
# method to get location from each result
def extract_location_from_result(result):
    try:
        return result.find(class_="location").text
    except:
        return None

In [3]:
# method to get company from each result
def extract_company_from_result(result):
    try:
        return result.find(class_="company").text
    except:
        return None

In [4]:
# method to get job title from each result
def extract_job_from_result(result):
    return result.find(class_="jobtitle").text

In [5]:
# method to get salary from each result
def extract_salary_from_result(result):
    try:
        return result.find('td', class_='snip').find('nobr').text
    except:
        return None

In [6]:
# method to get summary from each result
def extract_summary_from_result(result):
    try:
        return result.find('span', class_='summary').text
    except:
        return None

In [7]:
# scraping code
max_results_per_city = 1000

results = []

for city in set(['New+York', 'Chicago', 'San+Francisco', 'Austin', 'Atlanta','San+Jose','Los+Angeles','Seattle',
                 'Boston','Dallas','Salt+Lake+City','Arlington','Minneapolis','San+Diego','Denver','Portland',
                 'Orlando','Charlotte','Houston','Philadelphia']):
    for start in range(0, max_results_per_city, 10):
        
        url="http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l="+city+"&start="+str(start)
        response = requests.get(url)
        page=response.content
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        x=soup.findAll('div', class_='result', id=re.compile("p_"))
        
        for i in x:
            listx=[]
            listx.append(extract_job_from_result(i))
            listx.append(extract_company_from_result(i))
            listx.append(city)
            listx.append(extract_salary_from_result(i))
            listx.append(extract_summary_from_result(i))
            results.append(listx)

In [8]:
# creating dataframe
df = pd.DataFrame(results, columns = ['JobTitle', 'Company', 'Location', 'Salary','Summary'])
df.shape

(17791, 5)

In [10]:
df.head()

Unnamed: 0,JobTitle,Company,Location,Salary,Summary
0,\nSr Data Scientist\n,\n\n\n Wells Fargo\n,Charlotte,,"\nRight now, we are looking for a tenured data..."
1,\nResearch Analyst\n,\n\n\n Carolinas HealthCare System\n,Charlotte,,\nConducts follow up contacts to collect data ...
2,\nBiostatistics/Bioinformatics Research Scient...,\n\n\n UNC Charlotte\n,Charlotte,"$60,000 - $80,000 a year",\nGenetics/genomics data analysis (esp. Enjoy ...
3,\nDigital Marketing (Data Scientist)\n,"\n\n Kennedy Unlimited Inc, Professional St...",Charlotte,"$100,000 - $130,000 a year",\nResponsibilities of the Digital Marketing (D...
4,\nSenior Quantitative Finance Analyst\n,\n\n iiTS\n,Charlotte,"$140,000 a year","\nAssessing conceptual foundations of a model,..."


In [11]:
# dropping entries for duplicate job postings
df.drop_duplicates(inplace=True)

In [12]:
df.shape

(3960, 5)

In [13]:
# creating csv file for the dataframe that was created from the scrape
df.to_csv('project_4_data.csv',encoding='utf-8')