In [1]:
import pandas as pd

In [2]:
df = pd.read_excel('data_jobs_salary_all.xlsx')

In [3]:
df.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,"Bennington, NE",via ZipRecruiter,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,Sudan,year,128050.0,,Cox Communications,"['sql', 'python', 'aws', 'pyspark', 'tableau',..."
1,Data Engineer,Data Engineer - MA,"Mesa, AZ",via Indeed,Full-time,False,Georgia,2023-03-13 12:51:23,True,True,United States,year,140000.0,,Worldgate LLC,"['sql', 'nosql', 'java', 'python', 'kafka', 's..."
2,Senior Data Analyst,Supervisory Information Technology Specialist ...,"Alexandria, VA",via ZipRecruiter,Full-time,False,"New York, United States",2023-07-05 07:03:38,True,False,United States,year,156000.0,,National Technical Information Service,
3,Machine Learning Engineer,Machine Learning Research Scientist,"Pittsburgh, PA",via Ai-Jobs.net,Full-time,False,"Illinois, United States",2023-04-13 16:05:41,False,True,United States,year,140000.0,,Bosch Group,"['pytorch', 'tensorflow']"
4,Data Scientist,"Data Scientist, AWS","Irving, TX",via Snagajob,Full-time and Part-time,False,"Texas, United States",2023-10-15 06:02:51,False,False,United States,hour,,39.795002,"Presidio, Inc.","['python', 'r', 'sql', 'c', 'aws', 'gcp', 'big..."


In [4]:
df.shape

(32672, 16)

### Unique `job_title_short`

In [5]:
job_titles = ( pd.Series(df.job_title_short.unique(), name = 'job_title')
              .sort_values()
              .reset_index(drop = True)
)


- The `autoincrement` values for the tables in SQL start from 1.
- The default index for the `pandas.Series` start from 0.
- By default, the mapping of the `job_title_id` values in the `job_postings` dataframe will start from 0 and will raise errors when inserting the data into the database.

Solution: to add 1 to the indexes of the series so they start from 0. 

In [6]:
job_titles.index = job_titles.index + 1

In [7]:
job_titles

1              Business Analyst
2                Cloud Engineer
3                  Data Analyst
4                 Data Engineer
5                Data Scientist
6     Machine Learning Engineer
7           Senior Data Analyst
8          Senior Data Engineer
9         Senior Data Scientist
10            Software Engineer
Name: job_title, dtype: object

In [8]:
job_titles.to_csv('./csv/job_titles.csv', index = False)

### Unique `company_name`

In [9]:
# df.company_name.nunique()

In [10]:
df.company_name = df.company_name.str.replace('\u200b', '').str.strip()

In [11]:
df.company_name.sort_values()

22939            #twiceasnice Recruiting
22429            #twiceasnice Recruiting
32014            #twiceasnice Recruiting
29447            #twiceasnice Recruiting
29123    (THE VANGUARD GROUP/MALVERN,PA)
                      ...               
22652                         zooplus SE
10615                                ztp
3245                               ЛАНИТ
18882           Технологическая компания
1759                           𝐎𝐩𝐞𝐧𝐬𝐭𝐚𝐟𝐟
Name: company_name, Length: 32672, dtype: object

In [12]:
df.loc[df.company_name.str.startswith('0'), 'company_name']

15279    0nward Select
16632    0nward Select
Name: company_name, dtype: object

`0nward select` has a zero as initial O

In [13]:
df.loc[df.company_name == '0nward Select', 'company_name'] = 'Onward Select'

*Openstaff* has a weird font and won't be recognized as simple string

In [14]:
df.company_name.sort_values().tail(5)

22652                  zooplus SE
10615                         ztp
3245                        ЛАНИТ
18882    Технологическая компания
1759                    𝐎𝐩𝐞𝐧𝐬𝐭𝐚𝐟𝐟
Name: company_name, dtype: object

In [15]:
df.loc[1759, 'company_name'] = 'Openstaff'

Get a text file with all the values for `company_name`

In [16]:
with open('df_companies.txt', 'w') as f:
    for value in df.company_name:
        f.write(f'{value}\n')

Process the file using the script `clean_companies.py` and then read the output file

In [17]:
import clean_companies as clean

In [18]:
# Process the file
clean_companies = clean.process_company_names('df_companies.txt', 'cleaned_companies.txt')

Original number of companies: 32672
Unique normalized companies: 9136


In [19]:
clean_companies

['Cox Communications',
 'Worldgate',
 'National Technical Information Service',
 'Bosch',
 'Presidio',
 'Lmi Consulting',
 'Multiplan',
 'Sports Info',
 'Arsenault',
 'TikTok',
 'Pinterest',
 'Riot Games',
 'Northwest Software',
 'TikTok',
 'TikTok',
 'Forsyth Barnes',
 'Emw Staffing',
 'Volt Technical Resources',
 'Us Army Corps Of Engineers',
 'Capital One',
 'Jobs Near Me',
 'Digirocks',
 'Sofi',
 'Gravity It Resources',
 'Harnham',
 'American Family Mutual Insurance',
 'TikTok',
 'Insight Global',
 'Abbvie',
 'Unitedhealth',
 'Citigroup',
 'Lumen',
 'Eliassen',
 'Motion Recruitment',
 'New York University',
 'Lmi Consulting',
 'Cox Communications',
 'Insight Global',
 'Idc',
 'Office Of General Counsel',
 'Northrop Grumman',
 'TikTok',
 'Kpmg',
 'M Science',
 'Cisco',
 'Federal Reserve System',
 'Kpmg',
 'Cottonwood Financial',
 'Acadia',
 'Aditi Consulting',
 'Cox Communications',
 'Paragon Alpha Hedge Fund Talent Business',
 'Sga',
 'Brooksource',
 'Transtach',
 'Lmi Consulting',

Compare the results

In [20]:
pd.DataFrame({'df' : df.company_name.head(), 'clean' : clean_companies[:5]})

Unnamed: 0,df,clean
0,Cox Communications,Cox Communications
1,Worldgate LLC,Worldgate
2,National Technical Information Service,National Technical Information Service
3,Bosch Group,Bosch
4,"Presidio, Inc.",Presidio


Finally, reassign the clean company names to the dataframe column

In [21]:
df.company_name = clean_companies

In [22]:
df.company_name.isna().sum()

np.int64(0)

Some `company_name` values are empty (they were Co. Inc., Company... and so on)

In [23]:
df.company_name[df.company_name == '']

1038     
2735     
4318     
24027    
25882    
27635    
29441    
31588    
Name: company_name, dtype: object

In [24]:
df.loc[df.company_name == '', 'company_name'] = 'unknown'

Fix some issues with values that are not estrictly duplicated, but will raise an error as duplicates when loading the data into the database.

In [25]:
df.company_name[df.company_name.str.contains('Mondel')]

16953    Mondelēz
28685    Mondelez
Name: company_name, dtype: object

In [26]:
df.company_name[df.company_name.str.contains('Nestl')]

838                     Nestlé
2240                    Nestlé
4120                    Nestlé
5843                    Nestlé
8896                    Nestlé
9238                    Nestlé
10826                   Nestlé
13061                   Nestle
15780                   Nestlé
16330                   Nestlé
17430                   Nestle
17688                   Nestlé
17784    Nestl Purina Pet Care
17821                   Nestle
21907                   Nestlé
23068                   Nestle
25319                   Nestlé
25890                   Nestle
30503                   Nestlé
Name: company_name, dtype: object

In [27]:
df.loc[df.company_name == 'Nestle', 'company_name'] = 'Nestlé'

In [28]:
df.loc[df.company_name.str.contains('Mondel'), 'company_name'] = 'Mondelez'

Get list of unique companies:

In [29]:
companies = pd.Series(df.company_name.unique(), name = 'company')

In [30]:
companies = companies.sort_values().reset_index(drop=True)
companies.index = companies.index + 1
companies.head()

1    1 Point System
2    1001 Absa Bank
3         100Insure
4      10X Genomics
5         16 Points
Name: company, dtype: object

In [31]:
companies.to_csv('./csv/companies.csv', index = False)

### Unique locations

In [32]:
# df.job_country.unique()

In [33]:
countries = pd.Series(df.job_country.unique())

I will add Georgia to the countries table.

In [34]:
countries = pd.concat([countries, pd.Series('Georgia')])
countries = countries.sort_values().reset_index(drop = True)
countries.index = countries.index + 1
countries.name = 'country'

In [35]:
# countries.head()

In [36]:
countries.to_csv('./csv/countries.csv', index = False)

#### Unique `job_location`

In [37]:
# df.job_location.unique()

In [38]:
df.job_location = df.job_location.str.strip().fillna('unknown')

There are a couple of duplicates:  
Vilnius, Vilnius City Municipality, Lithuania  
Vilnius, Vilnius city municipality, Lithuania

In [39]:
df.loc[df.job_location.str.contains('Vilnius'), 'job_location'] = 'Vilnius, Vilnius City Municipality, Lithuania'

In [40]:
job_locations = pd.Series(df.job_location.unique(), name = 'location_name')
job_locations = job_locations.sort_values().reset_index(drop = True)
job_locations.index = job_locations.index + 1
job_locations.head()

1                A Coruña, Spain
2                Aarhus, Denmark
3                Abbott Park, IL
4                      Abell, MD
5    Aberdeen Proving Ground, MD
Name: location_name, dtype: object

In [41]:
job_locations.to_csv('./csv/locations.csv', index = False)

Check if the search_locations are contained either in job_country or job_location

In [42]:
for s in df.search_location.unique():
    if s not in countries.values:
        print(s)

New York, United States
Illinois, United States
Texas, United States
California, United States
Florida, United States


I will set the `search_location` values which contain *United States* (State, United States) to just *United States*

In [43]:
df.loc[df.search_location.str.contains('United States'), 'search_location'] = 'United States'

### Unique `job_schedule_type`

In [44]:
# df.job_schedule_type.unique()

In [45]:
# df.job_schedule_type.head()

In [46]:
df.job_schedule_type = df.job_schedule_type.fillna('NULL')

Convert `job_schedule_type` to list of values so I can explode the dataset later

In [47]:
def convert_schedule_into_list(row):
    if row != 'NULL':
        row = row.replace(' and ', ',').replace(',,', ',').split(',')
        
    # for string with only one schedule, convert to list
    # and return a list
    if isinstance(row, str):
        return [row]
    
    return row

In [48]:
df.job_schedule_type = df.job_schedule_type.apply(convert_schedule_into_list)

In [49]:
all_schedules = []
for i in df.job_schedule_type:
    all_schedules += i
    unique_schedules = list(set(all_schedules))
unique_schedules = [i.strip() for i in unique_schedules if i != 'NULL' ]

schedules = (pd.Series(unique_schedules, name='schedule_type')
             .drop_duplicates()
             .sort_values()
             .reset_index(drop = True)
)
schedules.index = schedules.index + 1

In [50]:
# schedules

In [51]:
schedules.to_csv('./csv/job_schedules.csv', index = False)

### Unique `job_via`

In [52]:
df.job_via = df.job_via.fillna('unknown')

In [53]:
df.job_via = df.job_via.str.replace('via ', '').str.strip()

Check the values of `job_via` so I can find a pattern to clean the data.

In [54]:
# lowercase_job_via = df.job_via.str.lower().drop_duplicates().sort_values(key = lambda x : -x.str.len())

In [55]:
# print(*lowercase_job_via, sep='\n')

In [56]:
# print(*df.job_via.str.lower().drop_duplicates().sort_values(), sep='\n')

Let's check if there are variations of the main job portals:

In [57]:
# famous_job_portals = ['ZipRecruiter', 'Indeed', 'LinkedIn', 'Snagajob', 'Ai-Jobs.net', 
#         'Ladders', 'Dice', 'JobServe', 'Upwork', 'BeBee', 
#         'Built In', 'ProActuary', 'Remote OK', 'Get.It']
# famous_job_portals = [i.lower() for i in famous_job_portals]

In [58]:
# for i in famous_job_portals:
#     for j in df.job_via.str.lower().unique():
#         if i in j:
#             print(j)

There are some variations of the job portals `BeBee`, `Indeed`, `JobServe` and `Built In`

In [59]:
substituions = {
    'geebo' : 'geebo.com',
    'talentify' : 'Talentify',
    'tarta.ai' : 'tarta.ai',
    'linkedin' : 'LinkedIn',
    'bebee' : 'BeBee',
    'jobserve' : 'JobServer',
    'indeed' : 'Indeed',
    'built in' : 'Built In',
    'dice' : 'dice.com'
    }

for key, value in substituions.items():
    df.loc[df.job_via.str.lower().str.contains(key), 'job_via'] = value

In [60]:
df.loc[df.job_via.str.lower().str.contains('informs.org'), 'job_via'] = 'informs.org'

In [61]:
df.job_via = df.job_via.str.replace('Www.', 'www.')

In [62]:
df.job_via[df.job_via.str.endswith('-')]

24345    InternsVilla | Hub Of Internships -
Name: job_via, dtype: object

In [63]:
df.loc[df.job_via == 'InternsVilla | Hub Of Internships -', 'job_via'] = 'InternsVilla | Hub Of Internships'

Create the list of unique `job_via` values

In [64]:
job_via = pd.Series( df.job_via.unique() , name = 'job_via')

In [65]:
job_via = pd.Series(job_via, name = 'job_via').sort_values().reset_index(drop = True)
job_via.index = job_via.index + 1

In [66]:
job_via.to_csv('./csv/job_via.csv', index = False)

### Unique `job_skills`

In [67]:
skills = list(df.job_skills.fillna('NULL').unique())

In [68]:
skills = [i.replace('[', '')
          .replace(']', '')
          .replace("'", '')
          for i in skills if i != 'NULL']

In [69]:
skills_expanded = ', '.join(skills).split(', ')
skills = list(set(skills_expanded))

In [70]:
job_skills = pd.Series(skills, name = 'skill').sort_values().reset_index(drop = True)
job_skills.index = job_skills.index + 1

In [71]:
# job_skills

In [72]:
job_skills.to_csv('./csv/skills.csv', index = False)

Explode the original dataset to generate a row for every skill required in the job posting.

In [73]:
def expand_skills(row):
    if not isinstance(row, float):
        skills_list = (row.replace('[', '')
                    .replace(']', '')
                    .replace("'", '')
        )
        skills_list = skills_list.split(', ')
        return skills_list
    else:
        return row

In [74]:
df.job_skills = df.job_skills.apply(expand_skills)

### Explode the dataset by `job_skills` and `job_schedule_type`

In [75]:
df2 = df.explode('job_schedule_type')
df2 = df2.explode('job_skills')

### Normalize `job_country`

In [76]:
s = pd.Series(countries.index, index = countries.values)
df2.job_country = df2.job_country.map(s)

### Normalize `job_location`

In [77]:
job_locations = job_locations.dropna()

In [78]:
s = pd.Series(job_locations.index, index = job_locations.values)
df2.job_location = df2.job_location.map(s)

### Normalize `job_via`

In [79]:
job_via

1                1840 & Company
2      247 Careers For Freshers
3                    AEC Living
4                    AI Careers
5                   AITechTrend
                 ...           
298               www.hireme.ai
299           www.joblatter.net
300             www.worklis.com
301                          领英
302                      领英(中国)
Name: job_via, Length: 302, dtype: object

In [80]:
s = pd.Series(job_via.index, index = job_via.values)
df2.job_via = df2.job_via.map(s)

In [81]:
df2.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,207,291,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,95,year,128050.0,,Cox Communications,sql
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,207,291,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,95,year,128050.0,,Cox Communications,python
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,207,291,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,95,year,128050.0,,Cox Communications,aws
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,207,291,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,95,year,128050.0,,Cox Communications,pyspark
0,Senior Data Scientist,Senior Data Scientist Data and Analytics Perfo...,207,291,Full-time,False,Sudan,2023-04-24 09:51:15,False,True,95,year,128050.0,,Cox Communications,tableau


### Normalize `job_schedule_type`

In [82]:
s = pd.Series(schedules.index, index = schedules.values)
df2.job_schedule_type = df2.job_schedule_type.map(s)

### Normalize `search_location`

In [83]:
s = pd.Series(countries.index, index = countries.values)
df2.search_location = df2.search_location.map(s)

### Normalize `job_company`

In [84]:
s = pd.Series(companies.index, index = companies.values)
df2.company_name = df2.company_name.map(s)

### Normalize `job_skills`

In [85]:
s = pd.Series(job_skills.index, index = job_skills.values)
df2.job_skills = df2.job_skills.map(s)

### Normalize `job_title_short`

In [86]:
s = pd.Series(job_titles.index, index = job_titles.values)
df2.job_title_short = df2.job_title_short.map(s)

In [87]:
df2.sample(4)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills
23143,3,Sr Data Analyst,131,222,2.0,False,107,2023-08-10 08:01:27,False,False,107,hour,,16.51,8095,82.0
22770,4,Data Engineer - Contract to Hire,86,248,1.0,True,107,2023-08-23 13:08:37,True,False,107,hour,,22.5,8421,146.0
29087,6,"Machine Learning Engineer AI, Analytics & Data",2207,15,2.0,False,96,2023-07-15 09:24:40,False,False,96,year,141000.0,,3490,69.0
4906,4,Big Data Engineer (GCP) - Now Hiring,598,222,2.0,False,39,2023-08-08 14:53:27,False,False,107,hour,,42.195,7378,139.0


In [88]:
df2.dtypes

job_title_short                   int64
job_title                        object
job_location                      int64
job_via                           int64
job_schedule_type               float64
job_work_from_home                 bool
search_location                   int64
job_posted_date          datetime64[ns]
job_no_degree_mention              bool
job_health_insurance               bool
job_country                       int64
salary_rate                      object
salary_year_avg                 float64
salary_hour_avg                 float64
company_name                      int64
job_skills                      float64
dtype: object

In [89]:
df2 = df2.sort_values('job_posted_date').reset_index(drop = True)

In [90]:
df2.rename(columns = {
    'job_title_short' : 'job_title_id',
    'job_title' : 'job_title_full',
    'job_location' : 'job_location_id',
    'job_via' : 'job_via_id',
    'job_schedule_type' : 'schedule_id',
    'search_location' : 'search_location_id',
    'job_location' : 'job_location_id',
    'job_work_from_home' : 'work_from_home',
    'job_no_degree_mention' : 'no_degree_mention',
    'job_health_insurance' : 'health_insurance',
    'job_country' : 'job_country_id',
    'company_name' : 'company_id',
    'job_skills' : 'skill_id'
}, inplace = True)

In [91]:
df2.to_csv('./csv/job_postings.csv', index = False)