In [135]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [136]:
# Load the raw scrape; the `content` column holds the HTML snippets parsed below.
upwork = pd.read_csv("../datasets/raw_upwork_dataScience.csv")
upwork.head()

Unnamed: 0,content
0,"<div class=""col-12 cfe-ui-job-details-content""..."
1,"<div class=""col-12 cfe-ui-job-details-content""..."
2,"<div class=""col-12 cfe-ui-job-details-content""..."
3,"<div class=""col-12 cfe-ui-job-details-content""..."
4,"<div class=""col-12 cfe-ui-job-details-content""..."


In [137]:
upwork.shape

(880, 1)

In [138]:
# NOTE(review): `new_upwork` is not referenced again in the visible notebook —
# looks like a leftover; confirm before removing.
new_upwork = pd.DataFrame()

In [139]:
# Parse every raw HTML snippet once; later cells query these parsed trees.
soup = [
    BeautifulSoup(raw_html, "html.parser")
    for raw_html in upwork['content'].values
]

## Titles and Descriptions

In [140]:
# Read the heading and the description text of every parsed posting.
titles = []
descriptions = []
for posting in soup:
    heading = posting.find("h1", {"class": "my-0 mr-10 display-rebrand"})
    body = posting.find("div", {"class": "job-description break mb-0"})
    titles.append(heading.text)
    descriptions.append(body.text)

# One entry per posting in each list.
print(len(titles), len(descriptions), sep="\n")
    

880
880


## Skills

In [141]:
# One list of skill names per posting, taken from the text of its skill badges.
skills_list = [
    [
        badge.text
        for badge in posting.find_all(
            "span",
            {"class": "cfe-ui-job-skill up-skill-badge disabled m-0-left m-0-top m-xs-bottom"},
        )
    ]
    for posting in soup
]

In [142]:
len(skills_list)

880

In [143]:
skills_list[8]

['Business Intelligence',
 'Data Analytics',
 'Data Interpretation',
 'Marketing Analytics',
 'Operations Analytics',
 'Product Analytics',
 'Human Resources Analytics',
 'Sales Analytics',
 'Google Analytics',
 'Data Analysis']

In [144]:
import re

def treat_string(text):
    """Reduce a scraped label to its digits and dots.

    Every character other than 0-9 and "." is removed, so whitespace,
    newlines, "$", "/hr" and any other letters or symbols all disappear.

    Args:
        text: Raw label text, e.g. "$20.00 /hr".

    Returns:
        The remaining digits and dots as a string ("" when there are none).
    """
    # Whitespace, "$" and "/hr" contain neither digits nor dots, so this one
    # substitution already does everything the earlier separate passes did.
    return re.sub("[^0-9.]", "", text)

## About the client

In [145]:
def get_client_informations(data_qa, jobs=None):
    """Collect one "about the client" field from every parsed job posting.

    For each posting, the first ``<li>`` whose ``data-qa`` attribute equals
    ``data_qa`` is located and the text of its first ``<strong>`` is read.

    Args:
        data_qa: Value of the ``data-qa`` attribute to look for,
            e.g. ``'client-location'``.
        jobs: Optional iterable of parsed postings (objects exposing
            ``select_one``). Defaults to the notebook-level ``soup`` list.

    Returns:
        A list of ``(position, text)`` tuples, where ``position`` is the
        index of the posting in ``jobs``. Postings that lack the ``<li>`` or
        whose ``<li>`` has no ``<strong>`` are left out.
    """
    if jobs is None:
        jobs = soup

    # Quote the attribute value so it does not have to be a bare CSS identifier.
    selector = f'li[data-qa="{data_qa}"]'

    info_list = []
    for i, job in enumerate(jobs):
        item = job.select_one(selector)
        if item is None:
            continue
        strong = item.find("strong")
        if strong is None:
            # Item present but without a highlighted value: skip instead of
            # failing on `None.text`.
            continue
        info_list.append((i, strong.text))

    return info_list

#### Countries

In [146]:
# (job index, text) pairs read from each posting's 'client-location' item.
countries = get_client_informations('client-location')

#### Jobs Posted

In [147]:
# (job index, text) pairs read from each posting's 'client-job-posting-stats' item.
jobs_posted = get_client_informations('client-job-posting-stats')
# Keep only the digits and dots of each value; the job index is left untouched.
jobs_posted = [(i, treat_string(value)) for (i, value) in jobs_posted]

#### Average paid per hour - Ex: \$ 8.00 /hr

In [148]:
# (job index, digits) pairs for postings that show the 'client-hourly-rate' label.
paid_hour_paided = []
for position, posting in enumerate(soup):
    rate_tag = posting.select_one("strong[data-qa='client-hourly-rate']")
    if rate_tag is None:
        continue  # this posting shows no hourly rate
    paid_hour_paided.append((position, treat_string(rate_tag.text)))

paid_hour_paided[0]

(0, '20.00')

In [149]:
paid_hour_paided[20]

(34, '40.49')

#### Total Spent - Ex: \$ 45K (or) \$ 6.2K

In [150]:
def _spend_in_thousands(text):
    """Express a client-spend label in thousands of dollars, as a string.

    The unit suffix directly after the number decides the scale: no suffix
    means plain dollars, "K" thousands, "M" millions, "B" billions. Stripping
    the suffix (as `treat_string` does) would leave "$818" and "$818K"
    indistinguishable in a column that is labelled in K$.

    NOTE(review): assumes labels look like "$6.2K", "$818" or "$1M" with an
    upper-case suffix glued to the number — confirm against the raw HTML.

    Args:
        text: Raw label text.

    Returns:
        The amount in thousands of dollars in plain decimal notation
        (e.g. "6.2", "0.818", "1000"). Text without a recognisable number
        falls back to `treat_string(text)`.
    """
    from decimal import Decimal  # local import keeps this cell self-contained

    match = re.search(r"([0-9][0-9,]*(?:\.[0-9]+)?)([KMB]?)", text)
    if match is None:
        return treat_string(text)

    number, suffix = match.groups()
    # Power-of-ten shift that converts each unit into thousands of dollars.
    shift = {"": -3, "K": 0, "M": 3, "B": 6}[suffix]
    amount = Decimal(number.replace(",", "")).scaleb(shift)
    return format(amount, "f")


# (job index, amount in K$) pairs for postings that show the 'client-spend' label.
total_spent = []

for i, job in enumerate(soup):
    info = job.select_one("strong[data-qa='client-spend']")
    if info is not None:
        total_spent.append((i, _spend_in_thousands(info.text)))

total_spent[0]

(0, '6.2')

In [151]:
total_spent[160]

(229, '5.6')

In [152]:
# Keep the 'cfe-ui-job-about-client-visitor' <ul> of every posting that has one.
client_features = []
for posting in soup:
    about_client = posting.select_one('ul.cfe-ui-job-about-client-visitor')
    if about_client is None:
        continue
    client_features.append(about_client)

len(client_features)

880

#### Domain

In [153]:
def treat_domain_string(text):
    """Split an "A & B" style label into its trimmed parts.

    Newlines are turned into spaces, the text is cut at every "&" and each
    piece is stripped of surrounding whitespace.

    Args:
        text: Raw label text, e.g. "Sales & Marketing".

    Returns:
        A list with one string per "&"-separated part ([""] for empty input).
    """
    # Stripping each piece also takes care of whitespace around the whole label.
    pieces = text.replace("\n", " ").split("&")
    return [piece.strip() for piece in pieces]

In [154]:
# (job index, list of parts) pairs for postings that show the
# 'client-company-profile-industry' label.
domains = []
for position, posting in enumerate(soup):
    industry_tag = posting.select_one("strong[data-qa='client-company-profile-industry']")
    if industry_tag is None:
        continue  # no industry shown for this posting
    domains.append((position, treat_domain_string(industry_tag.text)))

domains[0]

(1, ['Sales', 'Marketing'])

In [155]:
len(domains)

341

In [156]:
len(domains)

341

In [157]:
# Position in `client_features` -> text of every <strong> inside that section.
client_texts = {
    position: [strong_tag.text for strong_tag in section.find_all("strong")]
    for position, section in enumerate(client_features)
}

In [158]:
# Same keys as `client_texts`, with every text reduced to digits and dots.
new_cft = {
    position: [treat_string(raw_text) for raw_text in raw_texts]
    for position, raw_texts in client_texts.items()
}

In [159]:
new_cft[1]

['', '80', '43', '8.38', '']

## Job Features

In [160]:
# Keep the 'cfe-ui-job-features' <ul> of every posting that has one.
job_features_elements = []
for posting in soup:
    features_list = posting.select_one('ul.cfe-ui-job-features')
    if features_list is None:
        continue
    job_features_elements.append(features_list)

len(job_features_elements)

880

#### Prices

In [161]:
def hour_string(text):
    """Split an hourly price label into its "-"-separated parts.

    Surrounding whitespace is trimmed and every "$" removed before the text
    is cut at each "-". The individual parts are not trimmed.

    Args:
        text: Raw label text, e.g. "$40.00-$75.00".

    Returns:
        A list of strings, e.g. ['40.00', '75.00'].
    """
    return text.strip().replace("$", "").split("-")

def fixed_string(text):
    """Reduce a fixed-price label to its digits and dots.

    Args:
        text: Raw label text, e.g. " $300.00 ".

    Returns:
        The digits and dots of ``text`` as a string ("" when there are none).
    """
    # Whitespace and "$" are neither digits nor dots, so the substitution
    # alone already removes them; no separate strip/replace pass is needed.
    return re.sub("[^0-9.]", "", text)

In [162]:
fixed_prices, hour_prices = [], []

for i, job in enumerate(job_features_elements):
    # The first <strong> whose text contains "$" is taken as the price label.
    info = job.find(lambda tag: tag.name == 'strong' and '$' in tag.text)
    if info is None:
        continue

    if info.find('span') is not None:
        # A label with nested <span>s is handled as an hourly range.
        # `.text` already concatenates the text of nested tags, so the spans
        # are left in place: unwrapping them altered the shared parse tree,
        # and a second run of this cell then sent the same label down the
        # fixed-price branch.
        hour_prices.append((i, hour_string(info.text)))
    else:
        fixed_prices.append((i, fixed_string(info.text)))
        

In [163]:
hour_prices[20]

(45, ['40.00', '75.00'])

#### Experience

In [164]:
# (index in job_features_elements, text) pairs read from the <strong> that
# shares a parent with the data-cy='expertise' marker.
experiences = []

for i, job in enumerate(job_features_elements):
    marker = job.select_one("div[data-cy='expertise']")
    if marker is None:
        # Check the marker itself: calling find_parent() on a missing marker
        # would raise before any later None check could run.
        continue
    parent = marker.find_parent()
    strong = parent.find("strong") if parent is not None else None
    if strong is not None:
        experiences.append((i, strong.text))

In [165]:
experiences[0]

(0, 'Expert')

#### Local Job

In [166]:
# (index in job_features_elements, text) pairs read from the <strong> that
# shares a parent with the data-cy='local' marker.
local_jobs = []

for i, job in enumerate(job_features_elements):
    marker = job.select_one("div[data-cy='local']")
    if marker is None:
        # Check the marker itself: calling find_parent() on a missing marker
        # would raise before any later None check could run.
        continue
    parent = marker.find_parent()
    strong = parent.find("strong") if parent is not None else None
    if strong is not None:
        local_jobs.append((i, strong.text))

In [167]:
local_jobs[300]

(300, 'Remote Job')

## Creating DataFrame

#### Some treatment

In [186]:
# Characters removed from titles and descriptions. Compiled once and reused.
# NOTE(review): the U+24C2-U+1F251 and U+10000-U+10FFFF ranges are very broad;
# together they also cover CJK ideographs and every other code point above
# U+24C2, not only emoji. Kept as-is to preserve what gets removed — confirm
# that this is intended.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
                            u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
                            u"\U00002702-\U000027B0"  # dingbats
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u"\U00010000-\U0010ffff"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u200d"  # zero-width joiner
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\ufe0f"  # variation selector-16
                            u"\u3030"
                            "]+", flags=re.UNICODE)


def treat_title_desc(text):
    """Clean a job title or description.

    Newlines become spaces (so words on adjacent lines are not glued
    together), the characters matched by ``_EMOJI_PATTERN`` are removed, and
    leading/trailing whitespace is trimmed last so nothing is left dangling
    after a removed trailing emoji.

    Args:
        text: Raw title or description text.

    Returns:
        The cleaned text.
    """
    text = text.replace("\n", " ")
    text = _EMOJI_PATTERN.sub(r'', text)
    return text.strip()

# Clean every title and description in place of the raw text.
titles = list(map(treat_title_desc, titles))
descriptions = list(map(treat_title_desc, descriptions))

In [169]:
# Number of postings; derived from the parsed list so it cannot drift from
# the data when the input file changes.
size = len(soup)

In [170]:
def complete_list(info_list, total=None):
    """Expand sparse ``(index, value)`` pairs into one value per job.

    Args:
        info_list: Iterable of ``(index, value)`` pairs. It is only read,
            never modified.
        total: Length of the result. Defaults to the notebook-level ``size``.

    Returns:
        A list of length ``total`` whose element ``i`` is the value paired
        with index ``i``, or ``None`` when no pair carries that index. If an
        index occurs more than once the last pair wins; pairs whose index is
        outside ``range(total)`` are ignored.
    """
    if total is None:
        total = size

    # A dict gives constant-time lookups and leaves the caller's list untouched.
    value_by_index = dict(info_list)
    return [value_by_index.get(i) for i in range(total)]

In [171]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
countries = complete_list(countries)
len(countries)

880

In [172]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
jobs_posted = complete_list(jobs_posted)
len(jobs_posted)

880

In [173]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
paid_hour_paided = complete_list(paid_hour_paided)
len(paid_hour_paided)

880

In [174]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
total_spent = complete_list(total_spent)
len(total_spent)

880

In [175]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
domains = complete_list(domains)

In [176]:
len(domains)

880

In [177]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
fixed_prices = complete_list(fixed_prices)
len(fixed_prices)

880

In [178]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
hour_prices = complete_list(hour_prices)
len(hour_prices)

880

In [179]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
local_jobs = complete_list(local_jobs)
len(local_jobs)

880

In [180]:
# Turn the (job index, value) pairs into one value per job (None where missing).
# The name is rebound to the flat list, so re-running this cell without
# re-running the extraction cell would feed flat values back into complete_list.
experiences = complete_list(experiences)
len(experiences)

880

In [187]:
# Column name -> per-job values, in the order the columns should appear.
# Note: "Client Porpouses" carries the `jobs_posted` values.
final_columns = {
    "Title": titles,
    "Description": descriptions,
    "Skills": skills_list,
    "Source Country": countries,
    "Client Porpouses": jobs_posted,
    "Average Hour Paid ($)": paid_hour_paided,
    "Total Spent (K$)": total_spent,
    "Domain of Job": domains,
    "Fixed Price ($)": fixed_prices,
    "Price per Hour ($)": hour_prices,
    "Work on": local_jobs,
    "Experience Required": experiences,
}

# Rebinds `upwork`: from here on it is the cleaned table, not the raw scrape.
upwork = pd.DataFrame(final_columns)

upwork.head()

Unnamed: 0,Title,Description,Skills,Source Country,Client Porpouses,Average Hour Paid ($),Total Spent (K$),Domain of Job,Fixed Price ($),Price per Hour ($),Work on,Experience Required
0,Webcrawler to collect data of all escaperooms ...,For a specific project we need as much informa...,"[Data Scraping, Data Mining, Data Extraction, ...",Netherlands,53,20.0,6.2,,300.0,,Remote Job,Expert
1,Build a Browse AI Robot to Multiple Sites to E...,Im looking to use the Browse AI software to ex...,"[Artificial Intelligence, Data Scraping, Data ...",United States,80,8.38,43.0,"[Sales, Marketing]",,,Remote Job,Intermediate
2,"Google Analytics, GA4 Migration, and Set-up",We are seeking an experienced Google Analytics...,"[Marketing Analytics, Operations Analytics, Go...",United States,15,46.82,6.4,,,"[60.00, 80.00]",Remote Job,Expert
3,Google Trends API - Python Notebook,Build a Google Collab Notebook that queries th...,"[API, Python, Jupyter, Google Trends]",Australia,117,16.7,393.0,,200.0,,Remote Job,Intermediate
4,Data analysis in excel,I have a spreadsheet of raw data from a labor ...,"[Microsoft Excel, Data Analysis]",United States,4,17.68,818.0,,,"[5.00, 20.00]",Remote Job,Entry level


In [188]:
# Write the cleaned table next to the raw file; the DataFrame index is not written.
upwork.to_csv("../datasets/upwork_dataScience.csv", index=False)