In [1]:
#Import libraries and packages
from bs4 import BeautifulSoup 
import requests
import random
import numpy as np
import pandas as pd
import re

In [2]:
#This code searches the available jobs according to the search keywords
search_keywords = ["python","programming", "data science", "machine learning", "artificial intelligence", "web development"]
max_page_number = 10

#initialize dictionary to contain the scrapped data
job_dict = {}
job_dict["company"] = []
job_dict["location"] = []
job_dict["skills"] = []
job_dict["position"] = []
job_dict["experience"] = []

#loop over the search keywords
for keyword in search_keywords:
    url = f"https://www.timesjobs.com/candidate/job-search.html?from=submit&actualTxtKeywords=python&searchBy=0&rdoOperator=OR&searchType=personalizedSearch&luceneResultSize=150&postWeek=60&txtKeywords={keyword}&pDate=I&sequence=1&startPage=1"
    url_data = requests.get(url).text
    soup = BeautifulSoup(url_data, "html.parser")
    jobs = soup.find_all("li", class_ = "clearfix job-bx wht-shd-bx")
    for job in jobs:
        job_dict["company"].append(job.find("h3", class_ = "joblist-comp-name").text.strip())
        job_dict["location"].append(job.find("span").text.strip())
        job_dict["skills"].append(job.find("span", class_ = "srp-skills").text.strip())
        job_dict["position"].append(job.find("h2").text.strip())
        exp = job.find("ul", class_= "top-jd-dtl clearfix")
        job_dict["experience"].append(re.findall("[0-9].*", exp.find("li").text.strip()))

In [42]:
#save data into pandas dataframe
df = pd.DataFrame(job_dict)

In [43]:
#Split skills and add every skill in a column
df_skills = df.skills.str.split(",", expand = True)
df1 = pd.concat([df, df_skills], axis = 1)
df1 = df1.drop("skills", axis = 1)

In [44]:
#Convert the experience column to include only the range of required years
df1.experience = df1.experience.astype("str")
df1.experience = df1.experience.str.extract(r"([0-9].*[0-9])")

In [45]:
#Separate experience to min and max required experience
df1["Experience_min"] = df1.experience.str[0]
df1["Experience_max"] = df1.experience.str[4]

In [46]:
#Preview dataframe
df1.head()

Unnamed: 0,company,location,position,experience,0,1,2,3,4,5,...,21,22,23,24,25,26,27,28,Experience_min,Experience_max
0,Pure Tech Codex Private Limited,Pune,Python,2 - 3,rest,python,database,django,debugging,mongodb,...,,,,,,,,,2,3
1,Surya Informatics Solutions Pvt. Ltd.,Chennai,Python,0 - 3,python,web technologies,linux,mobile,mysql,angularjs,...,,,,,,,,,0,3
2,RESEARCH DEVELOPERS\r\n (More Jobs),(More Jobs),"Python Developer , Python Programmer",0 - 3,python,research,python programmer,Machine Learning,Pattern Recognition,Image Processing,...,,,,,,,,,0,3
3,Perfios Software,Bengaluru / Bangalore,Python Developer,5 - 8,python,java,scala,,,,...,,,,,,,,,5,8
4,Triadss Tech Solutions,Chennai,Python Developer,0 - 3,python,django,html5,javascript,,,...,,,,,,,,,0,3


In [47]:
print(f"Number of duplicated values is: {df1[df1.duplicated()].shape[0]}")

Number of duplicated values is: 79


In [48]:
#Remove any duplicated rows (highly likely to have duplicates because we used several keywords)
df1.drop_duplicates(inplace = True)
df1.head()

Unnamed: 0,company,location,position,experience,0,1,2,3,4,5,...,21,22,23,24,25,26,27,28,Experience_min,Experience_max
0,Pure Tech Codex Private Limited,Pune,Python,2 - 3,rest,python,database,django,debugging,mongodb,...,,,,,,,,,2,3
1,Surya Informatics Solutions Pvt. Ltd.,Chennai,Python,0 - 3,python,web technologies,linux,mobile,mysql,angularjs,...,,,,,,,,,0,3
2,RESEARCH DEVELOPERS\r\n (More Jobs),(More Jobs),"Python Developer , Python Programmer",0 - 3,python,research,python programmer,Machine Learning,Pattern Recognition,Image Processing,...,,,,,,,,,0,3
3,Perfios Software,Bengaluru / Bangalore,Python Developer,5 - 8,python,java,scala,,,,...,,,,,,,,,5,8
4,Triadss Tech Solutions,Chennai,Python Developer,0 - 3,python,django,html5,javascript,,,...,,,,,,,,,0,3


In [63]:
#Modify the dataframe column names
new_names = []
for name in df1.columns:
    if type(name) == int:
        new_names.append(f"skill_{name+1}")
    else:
        new_names.append(name)
        
#Reorder the dataframe to have the min and max experience before the skills
new_names = new_names[:4] + new_names[-2:] + new_names[4:-3]

df_updated = df1[new_names]

In [70]:
df_updated.head()

Unnamed: 0,company,location,position,experience,Experience_min,Experience_max,skill_1,skill_2,skill_3,skill_4,...,skill_19,skill_20,skill_21,skill_22,skill_23,skill_24,skill_25,skill_26,skill_27,skill_28
0,Pure Tech Codex Private Limited,Pune,Python,2 - 3,2,3,rest,python,database,django,...,,,,,,,,,,
1,Surya Informatics Solutions Pvt. Ltd.,Chennai,Python,0 - 3,0,3,python,web technologies,linux,mobile,...,,,,,,,,,,
2,RESEARCH DEVELOPERS\r\n (More Jobs),(More Jobs),"Python Developer , Python Programmer",0 - 3,0,3,python,research,python programmer,Machine Learning,...,,,,,,,,,,
3,Perfios Software,Bengaluru / Bangalore,Python Developer,5 - 8,5,8,python,java,scala,,...,,,,,,,,,,
4,Triadss Tech Solutions,Chennai,Python Developer,0 - 3,0,3,python,django,html5,javascript,...,,,,,,,,,,


In [71]:
#Export data to csv file
df_updated.to_csv("jobsdata.csv")