In [4]:
import pandas as pd

In [1]:
# Relative path to the root of the LinkedIn dataset.
DATA_DIR = '../../../data/LinkedIn'

# Raw inputs sit under <DATA_DIR>/raw, in one folder per entity type.
JOB_DIR = DATA_DIR + '/raw/job_details'
COMPANY_DIR = DATA_DIR + '/raw/company_details'

In [None]:
import resource

# Limit this process's data segment (RLIMIT_DATA) to 16 GiB, expressed in bytes.
memory_limit = 16 * 1024 * 1024 * 1024

# A process may not request a limit above its current hard limit (setrlimit
# raises ValueError in that case), so clamp to the hard limit when one is set.
_current_soft, _current_hard = resource.getrlimit(resource.RLIMIT_DATA)
if _current_hard != resource.RLIM_INFINITY and _current_hard < memory_limit:
    memory_limit = _current_hard

# Apply the cap as both the soft and the hard limit.
resource.setrlimit(resource.RLIMIT_DATA, (memory_limit, memory_limit))

## Data Descriptions

### company_details/companies.csv

```
company_id: The company ID as defined by LinkedIn
name: Company name
description: Company description
company_size: Company grouping based on number of employees (0 Smallest - 7 Largest)
country: Country of company headquarters.
state: State of company headquarters.
city: City of company headquarters.
zip_code: ZIP code of company's headquarters.
address: Address of company's headquarters
url: Link to company's LinkedIn page
```

### company_details/employee_counts.csv

```
company_id: The company ID
employee_count: Number of employees at company
follower_count: Number of company followers on LinkedIn
time_recorded: Unix time of data collection
```

### Consolidate all job metadata into a single .csv file

In [5]:
# Load the three per-job lookup tables from the raw job_details folder.
job_table_paths = [
    f'{JOB_DIR}/benefits.csv',
    f'{JOB_DIR}/job_industries.csv',
    f'{JOB_DIR}/job_skills.csv',
]
df_job_benefits, df_job_industries, df_job_skills = (
    pd.read_csv(csv_path) for csv_path in job_table_paths
)

In [14]:
# Preview the first rows of the benefits table to check its columns.
df_job_benefits.head()

Unnamed: 0,job_id,inferred,type
0,3690843087,0,Medical insurance
1,3690843087,0,Dental insurance
2,3690843087,0,401(k)
3,3690843087,0,Paid maternity leave
4,3690843087,0,Disability insurance


In [15]:
# Preview the first rows of the job-to-industry table to check its columns.
df_job_industries.head()

Unnamed: 0,job_id,industry_id
0,3378133231,68
1,3497509795,96
2,3690843087,47
3,3691775263,112
4,3691779379,80


In [16]:
# Preview the first rows of the job-to-skill table to check its columns.
df_job_skills.head()

Unnamed: 0,job_id,skill_abr
0,3690843087,ACCT
1,3690843087,FIN
2,3691763971,MGMT
3,3691763971,MNFC
4,3691775263,MGMT


In [20]:
# Inner-join benefits, industries and skills on job_id. Only jobs present in
# all three tables survive, and each job contributes one row per
# (benefit, industry, skill) combination.
df_job_metadata = (
    df_job_benefits
    .merge(df_job_industries, on='job_id', how='inner')
    .merge(df_job_skills, on='job_id', how='inner')
)
df_job_metadata.head()

Unnamed: 0,job_id,inferred,type,industry_id,skill_abr
0,3690843087,0,Medical insurance,47,ACCT
1,3690843087,0,Medical insurance,47,FIN
2,3690843087,0,Dental insurance,47,ACCT
3,3690843087,0,Dental insurance,47,FIN
4,3690843087,0,401(k),47,ACCT


In [24]:
# Write the merged job table under DATA_DIR; the row index is left out of the file.
job_metadata_path = f'{DATA_DIR}/job_metadata.csv'
df_job_metadata.to_csv(job_metadata_path, index=False)

### Consolidate all company metadata into a single .csv file

In [4]:
# Load the four company tables from the raw company_details folder.
company_table_paths = [
    f'{COMPANY_DIR}/companies.csv',
    f'{COMPANY_DIR}/company_industries.csv',
    f'{COMPANY_DIR}/company_specialities.csv',
    f'{COMPANY_DIR}/employee_counts.csv',
]
(
    df_companies,
    df_company_industries,
    df_company_specialties,
    df_company_employee_counts,
) = (pd.read_csv(csv_path) for csv_path in company_table_paths)

In [26]:
# Preview the first rows of the companies table to check its columns.
df_companies.head()

Unnamed: 0,company_id,name,description,company_size,state,country,city,zip_code,address,url
0,1009,IBM,"At IBM, we do more than work. We create. We cr...",7.0,NY,US,"Armonk, New York",10504,International Business Machines Corp.,https://www.linkedin.com/company/ibm
1,1016,GE HealthCare,Every day millions of people feel the impact o...,7.0,0,US,Chicago,0,-,https://www.linkedin.com/company/gehealthcare
2,1021,GE Power,"GE Power, part of GE Vernova, is a world energ...",7.0,NY,US,Schenectady,12345,1 River Road,https://www.linkedin.com/company/gepower
3,1025,Hewlett Packard Enterprise,Official LinkedIn of Hewlett Packard Enterpris...,7.0,Texas,US,Houston,77389,1701 E Mossy Oaks Rd Spring,https://www.linkedin.com/company/hewlett-packa...
4,1028,Oracle,We’re a cloud technology company that provides...,7.0,Texas,US,Austin,78741,2300 Oracle Way,https://www.linkedin.com/company/oracle


In [27]:
# Preview the first rows of the company-to-industry table to check its columns.
df_company_industries.head()

Unnamed: 0,company_id,industry
0,81149246,Higher Education
1,10033339,Information Technology & Services
2,6049228,Accounting
3,2641066,Electrical & Electronic Manufacturing
4,96649998,Marketing & Advertising


In [29]:
# Preview the first rows of the company-to-speciality table; a single
# company_id can appear on several rows, one per speciality.
df_company_specialties.head()

Unnamed: 0,company_id,speciality
0,81149246,Childrens Music Education
1,81149246,Foundational Music Theory
2,81149246,Child Music Lessons
3,81149246,social emotional learning
4,81149246,social emotional development


In [30]:
# Preview the first rows of the employee/follower count table to check its columns.
df_company_employee_counts.head()

Unnamed: 0,company_id,employee_count,follower_count,time_recorded
0,81149246,6,91,1692645000.0
1,10033339,3,187,1692645000.0
2,6049228,20,82,1692645000.0
3,2641066,45,2336,1692645000.0
4,96649998,0,2,1692645000.0


In [9]:
# Number of non-null values in each column of the employee-count table.
df_company_employee_counts.count()

company_id        15907
employee_count    15907
follower_count    15907
time_recorded     15907
dtype: int64

In [None]:
# A plain three-way inner merge on company_id is many-to-many at every step:
# each company yields (#industry rows x #speciality rows x #employee-count rows)
# output rows, which crashed the kernel with an out-of-memory error.
# Reduce every input to its distinct, useful rows before joining so the
# output size stays bounded.

# Repeated (company, industry) / (company, speciality) pairs carry no extra
# information but multiply the merged row count.
company_industries_unique = df_company_industries.drop_duplicates(
    subset=['company_id', 'industry']
)
company_specialties_unique = df_company_specialties.drop_duplicates(
    subset=['company_id', 'speciality']
)

# Keep a single employee-count row per company: the one with the largest
# time_recorded. Rows with a missing timestamp are sorted first so they only
# win when a company has no timestamped row at all.
# NOTE(review): assumes older snapshots are not needed downstream - confirm.
company_latest_counts = (
    df_company_employee_counts
    .sort_values('time_recorded', na_position='first', kind='stable')
    .drop_duplicates(subset='company_id', keep='last')
)

# The result still has one row per (industry, speciality) pair of a company.
# validate='many_to_one' makes pandas raise MergeError if the count table is
# not unique per company_id, so the last join can never multiply rows.
df_company_metadata = (
    company_industries_unique
    .merge(company_specialties_unique, on='company_id', how='inner')
    .merge(company_latest_counts, on='company_id', how='inner', validate='many_to_one')
)
df_company_metadata.head()