# Dice.com Web Scraper 

Logical Diagram

<img src='Dice Scraping Logical Diagram.PNG'>

## Imports, Functions, and Databases

**Imports**

In [138]:
import requests
import datetime
import re
import string
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import pickle
import time

In [139]:
scraper_api_key = <Enter your own key here>

**Functions**

In [140]:
"""
Takes: raw_response
"""
def get_job_listing(raw_response: str) -> str:
    try:
        offset = 15
        job_title_index = raw_response.text.find('"job_title"')
        end_job_listing_index = raw_response.text[job_title_index+offset:].find(',')
        return raw_response.text[job_title_index+offset : job_title_index+offset+end_job_listing_index-1]
    except:
        print('Error in get_job_listing')
        return

def get_original_date_posted(raw_response: str) -> str:
    try:
        offset = 16
        date_posted_index = raw_response.text.find('"datePosted"')
        end_date_posted_index = raw_response.text[date_posted_index+offset:].find(',')
        return raw_response.text[date_posted_index+offset : date_posted_index+offset+end_date_posted_index-1]
    except:
        print('Error in get_original_date_posted')
        return

def get_skills(raw_response: str) -> str:
    try:
        offset = 13
        skills_index = raw_response.text.find('"skills"')
        end_skills_index = raw_response.text[skills_index+offset:].find(']')
        return raw_response.text[skills_index+offset : skills_index+offset+end_skills_index-1]
    except:
        print('Error in get_skills')
        return
    
def get_company_name(raw_response: str) -> str:
    try:
        offset = 17
        index = raw_response.text.find('"companyName"')
        end_index = raw_response.text[index+offset:].find(',')
        return raw_response.text[index+offset : index+offset+end_index-1]
    except:
        print('Error in get_company_name')
        return

def get_job_city(raw_response: str) -> str:
    try:
        offset = 14
        index = raw_response.text.find('"jobCity"')
        end_index = raw_response.text[index+offset:].find(']')
        return raw_response.text[index+offset : index+offset+end_index-1]
    except:
        print('Error in get_job_city')
        return    

def get_job_region(raw_response: str) -> str:
    try:
        offset = 16
        index = raw_response.text.find('"jobRegion"')
        end_index = raw_response.text[index+offset:].find(']')
        return raw_response.text[index+offset : index+offset+end_index-1]
    except:
        print('Error in get_job_region')
        return  

def get_job_postal_code(raw_response: str) -> str:
    try:
        offset = 20
        index = raw_response.text.find('"jobPostalCode"')
        end_index = raw_response.text[index+offset:].find(']')
        return raw_response.text[index+offset : index+offset+end_index-1]
    except:
        print('Error in get_job_postal_code')
        return  
    
"""
Takes: unfiltered_response
"""
def get_intext_job_title(text: str) -> str:
    try: 
        end_index = text.find('-') 
        return text[0:end_index-1]
    except:
        print('Error in get_intext_job_title')
        return    
    
def get_intext_company_name(text: str) -> str:
    try:
        start_index = text.find('-')
        end_index = 0
        for char_index in range(len(text)):
            curr_index = char_index + start_index + 1
            if text[curr_index] == '-':
                end_index = curr_index
                break
        return text[start_index+2:end_index-1]
    except:
        print('Error in get_intext_company_name')
        return

def get_intext_date_posted(text: str) -> str:
    try:
        today_date = datetime.date.today()
        if ('hours ago' in text) | ('hour ago' in text):
            return str(today_date)
        if 'weeks ago' in text:
            start_index = text.find('weeks ago')
            ### If posting is made more than 9 weeks ago
            if text[start_index-3].isdigit():
                num_weeks_ago = int(text[start_index-3:start_index-1])
            else:
                num_weeks_ago = int(text[start_index-2])
            threeWeeks = datetime.timedelta(weeks = num_weeks_ago)
            return str(today_date - threeWeeks)
    except:
        print('Error in get_intext_date_posted')
        return

def get_intext_location(text: str) -> str:
    try:
        end_index = text.find('| Dice.com')
        comp_name = get_intext_company_name(text)
        len_company_name = len(comp_name)
        start_index = text.find(comp_name) + len_company_name
        for char in text[start_index:end_index]:
            if char == '-':
                break
            start_index += 1
        return text[start_index+2:end_index-1]
    except:
        print('Error in get_intext_location')
        return    

"""
Takes: uncased_unfiltered_response_nocommas
"""

def get_education_level(text: str) -> str:
    try:
        education_levels_found = []

        phd_spellings = ['pdh','doctorate']
        masters_spellings = ['masters','ms','ma','ms/ma','ma/ms']
        bachelor_spellings = ['bachelor','bachelors','bachelor\'s','bs','ba','bs/ba','ba/bs']
        associate_spellings = ['associates','associate\'s']

        # Check for Phd's
        for phd in phd_spellings:
            if phd in text:
                education_levels_found.append('PhD')
                break

        # Check for masters
        for masters in masters_spellings:
            if masters in text:
                education_levels_found.append('Masters')
                break

        # Check for bachelors
        for bachelor in bachelor_spellings:
            if bachelor in text:
                education_levels_found.append("Bachelor's")
                break

        # Check for associates
        for associate in associate_spellings:
            if associate in text:
                education_levels_found.append("Associate's")

        if len(education_levels_found) == 0:
            return 'NA'
        if len(education_levels_found) == 1:
            return education_levels_found[0]
        else: ### If len(education_levels_found) > 1
            ### Return a range
            return str(education_levels_found[-1]) + ' - ' + str(education_levels_found[0])
    except:
        print('Error in get_education_level')
        return           


"""
General purpose method
"""
def get_word_occurences(text: str, text_file: str):
    try:
        textfile = open(text_file,'r') ### Open the list
        raw_text = textfile.read() ### Read into a string
        textfile.close() ### Close the file
        raw_text = raw_text.lower() ### Lowercase all the strings
        raw_text_list = raw_text.split('\n') ### Convert string into list
        ### If a programming language is in the text, add it to a dictionary
        raw_set = set()
        for word in raw_text_list:
            if word in raw_set:
                continue
            else:
                if text.count(' '+word+' ') > 0:
                    raw_set.add(word)
        try:
            raw_set.remove('')
        except:
            pass

        return str(raw_set).replace('{','').replace('}','').replace('\'','') if len(raw_set) > 0 else None
    except:
        print('Error in get_word_occurences')
        return
        
    
def get_intext_skills(text: str, text_file: str):
    return get_word_occurences(text, text_file)

def get_coding_languages(text: str, text_file: str):
    return get_word_occurences(text, text_file)

def get_technologies(text: str, text_file: str):
    return get_word_occurences(text, text_file)

def get_methodologies(text: str, text_file: str):
    return get_word_occurences(text, text_file)

def get_operating_systems(text: str, text_file: str):
    return get_word_occurences(text, text_file)

def get_remote(text: str) -> bool:
    return True if 'remote' in text else False

def get_years_experience(text: str) -> str:
    try:
        years_list = []
        indexes_of_occurrence = [m.start() for m in re.finditer('years', text)]
        for index in indexes_of_occurrence:
            #print(text[index-5:index])
            for char in text[index-5:index]:
                if char.isdigit():
                    years_list.append(int(char))

        if len(years_list) == 0:
            return ''
        if len(years_list) == 1:
            return str(years_list[0])
        if len(years_list) > 1:
            return str(min(years_list)) + ' - ' + str(max(years_list))
    except:
        print("Error in get_years_experience")
    
def get_all_attributes(raw_response, unfiltered_response, uncased_unfilter_response_nocomma):
    results = []
    ### Raw Response
    results.append(get_job_listing(raw_response))
    results.append(get_original_date_posted(raw_response))
    results.append(get_skills(raw_response))
    results.append(get_company_name(raw_response))
    results.append(get_job_city(raw_response))
    results.append(get_job_region(raw_response))
    results.append(get_job_postal_code(raw_response))
    
    ### Unfiltered Response
    results.append(get_intext_job_title(unfiltered_response))
    results.append(get_intext_company_name(unfiltered_response))
    results.append(get_intext_date_posted(unfiltered_response))
    results.append(get_intext_location(unfiltered_response))
    
    ### Uncased Unfiltered
    results.append(str(get_education_level(uncased_filtered_response_nocomma)))
    results.append(get_intext_skills(uncased_filtered_response_nocomma, './both_skills_list.txt'))
    results.append(get_coding_languages(uncased_filtered_response_nocomma, './coding_languages.txt'))
    results.append(get_technologies(uncased_filtered_response_nocomma, './technologies.txt'))
    results.append(get_methodologies(uncased_filtered_response_nocomma, './methodologies.txt'))
    results.append(get_operating_systems(uncased_filtered_response_nocomma, './operating_systems.txt'))
    results.append(str(get_remote(uncased_filtered_response_nocomma)))
    results.append(str(get_years_experience(uncased_filtered_response_nocomma)))
    
    ### Add the date of processing
    now = datetime.datetime.now() # current date and time
    year = now.strftime("%Y")
    month = now.strftime("%m")
    day = now.strftime("%d")
    date = month+day+year
    results.append(str(date))
    
    ### Salary (still need to implement method)
    results.append(None)
    
    return results

def get_job_links_from_links(clean_links):
    only_job_links = []
    for link in clean_links:
        result = re.search("^\/jobs\/detail", str(link))
        if result is not None:
            whole_link = "https://www.dice.com"+link
            only_job_links.append(whole_link)
    return only_job_links

def add_links_to_master_set(set_of_links, file_location):
    ### Open the existing set of links
    file = open(file_location,'rb')
    existing_links = pickle.load(file)
    file.close()
    
    len_b4 =len(existing_links)
    
    ### Add the existing set with the new one
    existing_links.update(set_of_links)
    print("Number of links added: {}".format(len(existing_links) - len_b4))
    
    ### Write back to pickle
    file = open(file_location,'wb')
    pickle.dump(existing_links, file)
    file.close()
    
    return True

def print_existing_links(file_location):
    file = open(file_location,'rb')
    existing_links = pickle.load(file)
    file.close()
    [print(link) for link in existing_links]
    
def get_existing_links(file_location):
    file = open(file_location,'rb')
    existing_links = pickle.load(file)
    file.close()
    return exisiting_links

def get_responses(URL):
    try:
        ### Unique Key
        payload = {'api_key': scraper_api_key, 
                   'url': URL}
        ### Make request
        raw_response = requests.get('http://api.scraperapi.com', params=payload)

        ### If bad request, exit
        if raw_response == 200:
            return

        ### Remove the html
        unfiltered_response = ' '.join(str(BeautifulSoup(raw_response.text.
                                                         replace(">","> ").
                                                         replace("<"," <").
                                                         replace("\n","").
                                                         replace("\t",""), 'lxml').text).split())

        ### Clean the response more
        end_characters = 'Save Create Alert'
        end_index = unfiltered_response.find(end_characters)
        start_characters = '(email@domain.com). Create Alert '
        start_index = unfiltered_response.find(start_characters)
        Cased_filtered_response = unfiltered_response[start_index+len(start_characters):end_index]
        uncased_filtered_response = Cased_filtered_response.lower()
        uncased_filtered_response_nocomma = uncased_filtered_response.replace(',',' ') ### Replace the comma to help with identifying skills
        return raw_response, unfiltered_response, uncased_filtered_response_nocomma
    except:
        print('Error in get_responses')
        return

### For fixing any date formatting issues    
dates_fix = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
def fix_dates(row):
    try:
        temp_list = row.split('-')
        if temp_list[1] in dates_fix.keys():
            temp_str_1 = dates_fix.get(temp_list[1])
            temp_str_2 = temp_list[0]
            return str(temp_str_1) + ' - ' + str(temp_str_2)
    except:
        return row
    
### For debugging    
def check_if_bad_link(URL):
    return True if df_failed_urls[df_failed_urls['URL'] == URL].shape[0] > 0 else False
    
def add_bad_link(URL):
    if df_failed_urls[df_failed_urls['URL'] == URL].shape[0] > 0:
        new_val = df_failed_urls[df_failed_urls['URL'] == URL]['attemps'] + 1
        if int(new_val) > 3:
            index_of_url = df_failed_urls[df_failed_urls['URL'] == URL].index
            df_failed_urls.drop(index_of_url, inplace=True)
            df_failed_urls.reset_index(inplace=True, drop=True)
            print('URL dropped: {}'.format(URL))
            return
        df_failed_urls.loc[df_failed_urls[df_failed_urls['URL'] == URL].index, 'attemps'] = new_val
    else:
        df_failed_urls.loc[len(df_failed_urls)] = [URL, 1]
        
def remove_link(URL):
    index_of_url = df_failed_urls[df_failed_urls['URL'] == URL].index
    df_failed_urls.drop(index_of_url, inplace=True)
    df_failed_urls.reset_index(inplace=True, drop=True)
    print('Removed:', URL)

**Databases**

In [142]:
df_listings = pd.read_csv('listings_attributes.csv')
df_listings_and_urls = pd.read_csv('listings_and_urls.csv')
df_failed_urls = pd.read_csv('failed_links_attemps.csv')

In [143]:
df_listings.head()

Unnamed: 0,job_listing,original_date_posted,skills,company_name,job_city,job_region,job_postal_code,intext_job_title,intext_company_name,intext_date_posted,...,intext_skills,coding_languages,technologies,methodologies,operating_systems,remote,years_experience,date_of_processing,salary,URL
0,Senior Data Scientist,2021-06-28T22:17:51Z,"Artificial Intelligence, Python, IT, SAS, SQL,...",New York Life Insurance Company,New York,NY,10001,Senior Data Scientist,New York Life Insurance Company,,...,"machine learning, model deployment, sales, tec...","sas, r, processing, spark, lasso, sql, python",,incremental,,False,3 - 5,6292021,,https://www.dice.com/jobs/detail/Senior-Data-S...
1,Data Scientist,2021-06-01T00:32:45Z,"Research, Computer, Programming, Python, Java,...",comScore,Amsterdam,,12010,Data Scientist,comScore,,...,"machine learning, online advertising, clusteri...","scala, r, javascript, source, java, sql, python",,,,False,2,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...
2,Data Scientist,2021-06-09T18:16:51Z,"Data, collect, clean, analyze",University Of Delaware,Newark,DE,19702,Data Scientist,University Of Delaware,6/15/2021,...,"sas, data analysis, algorithms, r, writing, ma...","sas, r, stata, clean, sql, python",,,college,False,3 - 5,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...
3,Principal Data Scientist - Search,2021-04-30T23:30:59Z,"Algorithms, Engineers, Python, Java, Data Mini...",Walmart,Sunnyvale,CA,94086,Principal Data Scientist,Search,6/29/2021,...,"machine learning, cadence, online advertising,...","plus, scala, r, ml, source, java, spark, python",tensorflow,,,False,3 - 7,6292021,,https://www.dice.com/jobs/detail/Principal-Dat...
4,Data Scientist - Entry Level,2021-05-07T00:30:18Z,"Laboratory, Security, Applications, Java, Pyth...",Lawrence Livermore National Laboratory,Livermore,CA,94550,Data Scientist,Entry Level,,...,"database, machine learning, research and devel...","r, matlab, c, python, java, q, c++, processing",,,linux,False,,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...


In [144]:
df_listings_and_urls.head()

Unnamed: 0,source_url,listing
0,https://www.dice.com/jobs/detail/Senior-Data-S...,Senior Data Scientist - New York Life Insuranc...
1,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist - comScore - Amsterdam | Dice.c...
2,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist - University Of Delaware - Newa...
3,https://www.dice.com/jobs/detail/Principal-Dat...,Principal Data Scientist - Search - Walmart - ...
4,https://www.dice.com/jobs/detail/Data-Scientis...,Data Scientist - Entry Level - Lawrence Liverm...


In [154]:
df_failed_urls.head()

Unnamed: 0,URL,attemps


## Query Dice.com

*Getting the first inital links for a '__Data Scientist__'*

In [123]:
URL = 'https://www.dice.com/jobs/q-Data+Scientist-jobs'
### Unique Key
payload = {'api_key': scraper_api_key, 
           'url': URL}
### Make request
r = requests.get('http://api.scraperapi.com', params=payload)

### Clean up the raw text
soup = BeautifulSoup(r.text, 'lxml')

### Get all the links
clean_links = []
for link in soup.find_all('a'):
    current_link = link.get('href')
    if current_link not in clean_links:
        clean_links.append(current_link)

## Getting the total number of pages w/ links

In [124]:
job_lengths_list = []
for link in clean_links:
    result = re.search("^\/jobs\/.*p=", str(link)) # Starts with /jobs/, has some characters, then has p= in it
    if result is not None:
        whole_link = "https://www.dice.com"+link
        job_lengths_list.append(whole_link)
        
start_index = job_lengths_list[-1].find("=")
number_of_pages = int(job_lengths_list[-1][start_index+1:])
print('There are {} pages with links'.format(number_of_pages))

1672
There are 1672 pages with links


## Query each page to get all the links

*Note, this may return a very lage number of pages. If you do not want to use up thousands of requests at once, add a __break__*

In [160]:
### Create a set of links to run
set_of_links = set()
### get the URL's that have already been used
used_URLs = set(df_listings_and_urls['source_url'])
failed_URLs = df_failed_urls['URL'].tolist()

new_links_found = 0
offset = 270

for i in range(number_of_pages-offset):
    URL = 'https://www.dice.com/jobs/q-Data+Scientist-jobs?p=' + str(i+offset)
    
    ## Unique Key
    payload = {'api_key': scraper_api_key, 
               'url': URL}
    ### Make request
    r = requests.get('http://api.scraperapi.com', params=payload)

    ### Clean up the raw text
    soup = BeautifulSoup(r.text, 'lxml')

    ### Get all the links
    clean_links = []
    for link in soup.find_all('a'):
        current_link = link.get('href')
        if current_link not in clean_links:
            clean_links.append(current_link)
            
    ### Filter the links more
    all_links_to_scrape = get_job_links_from_links(clean_links)
    
    """ 
    ####################################################
    Check to make sure the link hasn't been used already
    ####################################################
    """   
    for link in all_links_to_scrape:
        
        ### Check to make sure the link doesn't already exist in the database
        if link in used_URLs:
            print('Link already used:', link)
            continue
            
        ### Check to make sure the link isn't already a bad link   
        if link in failed_URLs:
            print('Link is marked as a bad link already', link)
            continue
            
        ### Add each link to a set to prevent redudancy
        set_of_links.add(link)
        new_links_found += 1
        
    """
    Add an optional break down here
    """
    if i == 5: ### Will capture around 120 links
        break
    else: 
        continue
        
print('{} new links found!'.format(new_links_found))
print('{} links in set_of_links'.format(len(set_of_links)))

120 new links found!
116 links in set_of_links


## Run the web scraper over each link

In [161]:
failed_links = []
failed_listing_counter = 0
num_links_added = 0

for job_link in list(set_of_links):    
    try:
        ### Get the listing
        raw_response, unfiltered_response, uncased_filtered_response_nocomma = get_responses(job_link)
        if '404 Not Found' in unfiltered_response:
            ### Add the bad response to list of links tried
            add_bad_link(job_link)
            print("404 Not Found")
                  
            failed_listing_coutner += 1
            if failed_listing_counter == 3:
                print("Cooling down for 10 seconds before continuing...")
                ### Should allow for a proxy to be reset (theoretically)
                time.sleep(10)
                failed_listing_coutner = 0
            continue
            
        ### Get the attributes
        results = get_all_attributes(raw_response, unfiltered_response, uncased_filtered_response_nocomma)
        ### Add url to results
        results += [job_link]
        ### Add the attributes to the databases
        df_listings.loc[len(df_listings)] = results
        ### Add the unfiltered_response to a seperate db
        df_listings_and_urls.loc[len(df_listings_and_urls)] = [job_link, unfiltered_response]
        
        num_links_added += 1
    except:
        print('Bad Listing')
        continue
        
### Save the results
df_listings.to_csv('listings_attributes.csv', index=False)
df_listings_and_urls.to_csv('listings_and_urls.csv', index=False)
df_failed_urls.to_csv('failed_links_attemps.csv', index=False)

### Save to pickles
file = open('listings_attributes.pkl','wb')
pickle.dump(df_listings, file)
file.close()

file = open('raw_listings_and_urls.pkl','wb')
pickle.dump(df_listings_and_urls, file)
file.close()

file = open('failed_links_attemps.pkl','wb')
pickle.dump(df_failed_urls, file)
file.close()

print('{} links added!'.format(num_links_added))

116 links added!


In [162]:
df_listings

Unnamed: 0,job_listing,original_date_posted,skills,company_name,job_city,job_region,job_postal_code,intext_job_title,intext_company_name,intext_date_posted,...,intext_skills,coding_languages,technologies,methodologies,operating_systems,remote,years_experience,date_of_processing,salary,URL
0,Senior Data Scientist,2021-06-28T22:17:51Z,"Artificial Intelligence, Python, IT, SAS, SQL,...",New York Life Insurance Company,New York,NY,10001,Senior Data Scientist,New York Life Insurance Company,,...,"machine learning, model deployment, sales, tec...","sas, r, processing, spark, lasso, sql, python",,incremental,,False,3 - 5,6292021,,https://www.dice.com/jobs/detail/Senior-Data-S...
1,Data Scientist,2021-06-01T00:32:45Z,"Research, Computer, Programming, Python, Java,...",comScore,Amsterdam,,12010,Data Scientist,comScore,,...,"machine learning, online advertising, clusteri...","scala, r, javascript, source, java, sql, python",,,,False,2,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...
2,Data Scientist,2021-06-09T18:16:51Z,"Data, collect, clean, analyze",University Of Delaware,Newark,DE,19702,Data Scientist,University Of Delaware,6/15/2021,...,"sas, data analysis, algorithms, r, writing, ma...","sas, r, stata, clean, sql, python",,,college,False,3 - 5,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...
3,Principal Data Scientist - Search,2021-04-30T23:30:59Z,"Algorithms, Engineers, Python, Java, Data Mini...",Walmart,Sunnyvale,CA,94086,Principal Data Scientist,Search,6/29/2021,...,"machine learning, cadence, online advertising,...","plus, scala, r, ml, source, java, spark, python",tensorflow,,,False,3 - 7,6292021,,https://www.dice.com/jobs/detail/Principal-Dat...
4,Data Scientist - Entry Level,2021-05-07T00:30:18Z,"Laboratory, Security, Applications, Java, Pyth...",Lawrence Livermore National Laboratory,Livermore,CA,94550,Data Scientist,Entry Level,,...,"database, machine learning, research and devel...","r, matlab, c, python, java, q, c++, processing",,,linux,False,,6292021,,https://www.dice.com/jobs/detail/Data-Scientis...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
628,Talend Developer,2021-06-03T18:32:35Z,"Apache HBase, Apache Hadoop, Apache Hive, Apac...",Wipro Ltd.,West Lake Hills,TX,78746,Talend Developer,Wipro Ltd.,2021-06-09,...,"oracle, apache hive, hdfs, pl/sql, talend, pyt...","oracle, pl/sql, spark, sql, python",,,hive,False,6,06302021,,https://www.dice.com/jobs/detail/Talend-Develo...
629,Principal Machine Learning Engineer,2021-06-11T22:05:05Z,"Analytics, Apache HTTP Server, Apache Lucene, ...",Inspire Recruitment Inc.,Austin,TX,73301,Principal Machine Learning Engineer,Inspire Recruitment Inc.,2021-06-16,...,"web services, computer science, java, wins, da...","java, pipelines, rest, python, spark, sql, ml,...",,,vision,False,0 - 2,06302021,,https://www.dice.com/jobs/detail/Principal-Mac...
630,ETL/Hadoop Consultant,2021-06-28T17:15:37Z,"ETL Data Engineer, Hadoop, Teradata DB2 Oracle...",Cyma Systems Inc,Atlanta,GA,30301,ETL/Hadoop Consultant,Cyma Systems Inc,,...,"oracle, batch processing, root cause analysis,...","oracle, source, db2, sr, shell, spark, sql, pr...",,,"hive, root",False,3 - 8,06302021,,https://www.dice.com/jobs/detail/ETL%26%2347Ha...
631,AIP & MIP Architect,2021-06-13T19:05:57Z,"Architecture, Best practices, Compliance, Cons...",Wipro Ltd.,Foster City,CA,94404,AIP & MIP Architect,Wipro Ltd.,2021-06-16,...,"microsoft windows azure, saas, help desk, info...",,,,,False,0 - 5,06302021,,https://www.dice.com/jobs/detail/AIP-%26-MIP-A...


## Run the failed links again

In [131]:
failed_URLs = df_failed_urls['URL'].tolist()

failed_links = []
failed_listing_counter = 0

for job_link in list(failed_URLs):    
    try:
        ### Get the listing
        raw_response, unfiltered_response, uncased_filtered_response_nocomma = get_responses(job_link)
        if '404 Not Found' in unfiltered_response:
            ### Add the bad response to list of links tried
            add_bad_link(job_link)
            print("404 Not Found", job_link)
                  
            failed_listing_coutner += 1
            if failed_listing_counter == 3:
                print("Cooling down for 10 seconds before continuing...")
                time.sleep(10)
                failed_listing_coutner = 0
            continue
            
        ### Get the attributes
        results = get_all_attributes(raw_response, unfiltered_response, uncased_filtered_response_nocomma)
        ### Add url to results
        results += [job_link]
        ### Add the attributes to the databases
        df_listings.loc[len(df_listings)] = results
        ### Add the unfiltered_response to a seperate db
        df_listings_and_urls.loc[len(df_listings_and_urls)] = [job_link, unfiltered_response]
        print('Link has been added!', job_link)
        remove_link(job_link)
    except:
        print('Bad Listing')
        continue
        
### Save the results
df_listings.to_csv('listings_attributes.csv', index=False)
df_listings_and_urls.to_csv('listings_and_urls.csv', index=False)
df_failed_urls.to_csv('failed_links_attemps.csv', index=False)

### Save to pickles
file = open('listings_attributes.pkl','wb')
pickle.dump(df_listings, file)
file.close()

file = open('raw_listings_and_urls.pkl','wb')
pickle.dump(df_listings_and_urls, file)
file.close()

file = open('failed_links_attemps.pkl','wb')
pickle.dump(df_failed_urls, file)
file.close()

Link has been added! https://www.dice.com/jobs/detail/Junior-Natural-Language-Processing-Data-Scientist-Leidos-Reston-VA-20170/SCNCAPI2/R%26%234500059393
Link has been added! https://www.dice.com/jobs/detail/Google-Cloud-Platform-Senior-Cloud-Engineer-Softpath-System%2C-LLC.---/softpath/7019333
