In [11]:
# 1. Read positions, keep those where Country is United States and State != Empty
# 2. Read individual user files, keep user_ids that end up processed in step 1
# 3. Read user education files, keep user_ids that end up processed in step 1

In [1]:
import pandas as pd
import numpy as np
import os
import gzip
import pyarrow.parquet as pq
import glob
from itertools import zip_longest
import time
from IPython.display import display, clear_output

In [3]:
%%time

# Read and compile position files, keeping only US positions with a known state.
#
# Result: `individual_position` holds one row per retained position, with a
# 1-based `file` column recording which input file the row came from, and is
# written to individual_position_all.csv in `output_folder`.
#
# Backslashes are escaped explicitly: '\D' and '\L' are not valid escape
# sequences and trigger a SyntaxWarning on recent Python versions.
folder_path = 'D:\\Dropbox\\LinkedInData\\individual_position\\'
output_folder = 'D:\\Dropbox\\LinkedInData\\individual_position\\'
position_columns = ['user_id', 'rcid', 'state', 'startdate', 'enddate', 'location_raw', 'seniority']

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# os.listdir returns names in arbitrary order; sorting keeps the `file` index
# reproducible from one run to the next.
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

# Collect the per-file frames and concatenate once at the end. Concatenating
# inside the loop re-copies everything accumulated so far on every iteration.
position_frames = []
i = 0
for file_name in file_list:
    message = f"Processing iteration {i + 1} out of {len(file_list)}"
    display(message)
    clear_output(wait=True)
    file_path = os.path.join(folder_path, file_name)
    # Only load the columns that are filtered on or kept.
    df = pd.read_parquet(file_path, engine='auto', columns=position_columns + ['country'])
    keep_rows = (df['country'] == 'United States') & (df['state'] != 'empty')
    # .loc + .assign builds a fresh frame, so no SettingWithCopyWarning is raised.
    df = df.loc[keep_rows, position_columns].assign(file=i + 1)
    position_frames.append(df)
    i += 1

if position_frames:
    individual_position = pd.concat(position_frames, ignore_index=True)
else:
    # No input files: keep the expected columns so later cells can still index them.
    individual_position = pd.DataFrame(columns=position_columns + ['file'])

individual_position.to_csv(os.path.join(output_folder, 'individual_position_all.csv'), index=False)
print(f"Processed {i} of", len(file_list), "files")

Processed 2012 of 2012 files
CPU times: total: 1h 42min 40s
Wall time: 4h 3min 55s


KeyboardInterrupt: 

In [23]:
%%time

# Read individual user files, keeping only users that hold a retained position.
#
# NOTE: this cell needs `individual_position` (built by the positions cell) to
# be in memory. It defines `user_ids`, which the education cell reads.
#
# Stage 1 streams the parquet files and writes a chunk CSV every 500 files.
# Stage 2 compiles the chunk CSVs into individual_user_all.csv.

folder_path = 'D:\\Dropbox\\LinkedInData\\individual_user\\'
# Chunk CSVs are written to the same folder they are later compiled from;
# writing them anywhere else means the compile stage never sees them.
output_folder = 'D:\\Dropbox\\LinkedInData\\individual_user\\individual_user_csv\\'
compiled_name = 'individual_user_all.csv'
user_columns = ['user_id', 'firstname', 'lastname', 'f_prob', 'm_prob', 'white_prob', 'black_prob',
                'hispanic_prob', 'native_prob', 'multiple_prob', 'highest_degree', 'sex_predicted',
                'ethnicity_predicted', 'numconnections']

# The array form is used for the per-file membership test; the list form is
# kept under the name `user_ids` because a later cell reads it.
user_id_values = individual_position['user_id'].unique()
user_ids = user_id_values.tolist()

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Sorted so the `file` index is reproducible (os.listdir order is arbitrary).
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

# ---- Stage 1: filter each parquet file and flush a chunk every 500 files ----
user_frames = []
i = 0
for file_name in file_list:
    message = f"Processing iteration {i + 1} out of {len(file_list)}"
    display(message)
    clear_output(wait=True)
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_parquet(file_path, engine='auto', columns=user_columns)
    # Filter immediately so only matching users are ever held in memory.
    df = df.loc[df['user_id'].isin(user_id_values), user_columns].assign(file=i + 1)
    user_frames.append(df)
    i += 1
    if i % 500 == 0:
        individual_user = pd.concat(user_frames, ignore_index=True)
        individual_user.to_csv(os.path.join(output_folder, f'individual_user_{i}.csv'), index=False)
        user_frames = []

# Flush whatever is left after the last full chunk. When the file count is an
# exact multiple of 500 nothing is pending, and the chunk just written must not
# be overwritten with an empty frame.
if user_frames:
    individual_user = pd.concat(user_frames, ignore_index=True)
    individual_user.to_csv(os.path.join(output_folder, f'individual_user_{i}.csv'), index=False)
print(f"Processed {i} of", len(file_list), "files")


# ---- Stage 2: compile all chunk CSVs into one file ----
folder_path = output_folder
# Skip a compiled file left by an earlier run; it also starts with 'individual_user'.
file_list = sorted(
    f for f in os.listdir(folder_path)
    if f.startswith('individual_user') and f != compiled_name
)
chunk_frames = [pd.read_csv(os.path.join(folder_path, file_name)) for file_name in file_list]

if chunk_frames:
    individual_user_all = pd.concat(chunk_frames, ignore_index=True)
else:
    individual_user_all = pd.DataFrame(columns=user_columns + ['file'])

individual_user_all = individual_user_all[individual_user_all['user_id'].isin(user_id_values)]
individual_user_all.to_csv(os.path.join(folder_path, compiled_name), index=False, encoding='utf-8')

# Delete the chunks only once the compiled file is on disk, so an interrupted
# run never loses data.
for file_name in file_list:
    os.remove(os.path.join(folder_path, file_name))

print("Compiled all", len(file_list), "folders")

KeyboardInterrupt: 

In [26]:
%%time

# Read education files and keep, per user, the earliest education start year.
#
# NOTE: this cell needs `user_ids` (defined by the individual-user cell) to be
# in memory.
#
# Result: `individual_user_education` has one row per user_id with columns
# user_id, collegeyear (the minimum start year) and file (1-based index of the
# input file that row came from); it is written to
# individual_user_education_all.csv.
#
# Backslashes are escaped explicitly: '\D' and '\L' are not valid escape sequences.
folder_path = 'D:\\Dropbox\\LinkedInData\\individual_user_education\\'

# Sorted so the `file` index (and tie-breaking between equal years) is
# reproducible; os.listdir returns names in arbitrary order.
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))
if not file_list:
    # Fail with a clear message instead of a KeyError further down.
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Collect per-file frames and concatenate once; concatenating inside the loop
# re-copies the accumulated data on every iteration.
education_frames = []
i = 0
for file_name in file_list:
    message = f"Processing iteration {i + 1} out of {len(file_list)}"
    display(message)
    clear_output(wait=True)
    file_path = os.path.join(folder_path, file_name)
    df = pd.read_parquet(file_path, engine='auto', columns=['user_id', 'startdate'])
    df = (
        df.dropna(subset=['startdate'])
        # Year = first four characters of the date's string form; anything
        # non-numeric is coerced to NA and dropped on the next line.
        .assign(year=lambda d: pd.to_numeric(d['startdate'].astype(str).str[:4], errors='coerce').astype('Int64'))
        .dropna(subset=['year'])
        .loc[:, ['user_id', 'year']]
        .assign(file=i + 1)
    )
    education_frames.append(df)
    i += 1

individual_user_education = pd.concat(education_frames, ignore_index=True)

# Restrict to users retained in the position step, then keep each user's
# earliest-year row (idxmin returns the first row holding the minimum).
individual_user_education = individual_user_education[individual_user_education['user_id'].isin(user_ids)]
min_value_idx = individual_user_education.groupby('user_id')['year'].idxmin()
individual_user_education = individual_user_education.loc[min_value_idx]
individual_user_education = individual_user_education.rename(columns={'year': 'collegeyear'})
individual_user_education.to_csv(os.path.join(folder_path, 'individual_user_education_all.csv'), index=False)

print(f"Processed {i} of", len(file_list), "files")

Processed 258 of 258 files
CPU times: total: 6min 47s
Wall time: 28min 14s


In [2]:
%%time

# Read company reference files, keeping the company name, its rcid, the
# ultimate parent rcid and the NAICS code.
#
# Result: `company_ref` stacks all input files, with a 1-based `file` column
# recording which input file each row came from, and is written to
# company_ref_all.csv.
#
# Backslashes are escaped explicitly: '\D' and '\L' are not valid escape sequences.
folder_path = 'D:\\Dropbox\\LinkedInData\\company_ref\\'
company_columns = ['company', 'rcid', 'ultimate_parent_rcid', 'naics_code']

# Sorted so the `file` index is reproducible (os.listdir order is arbitrary).
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

# Collect per-file frames and concatenate once instead of growing the frame
# inside the loop.
company_frames = []
i = 0
for file_name in file_list:
    message = f"Processing iteration {i + 1} out of {len(file_list)}"
    display(message)
    clear_output(wait=True)
    file_path = os.path.join(folder_path, file_name)
    # Load only the needed columns; re-selecting pins the column order, and
    # .assign returns a fresh frame so no SettingWithCopyWarning is raised.
    df = pd.read_parquet(file_path, engine='auto', columns=company_columns)
    df = df[company_columns].assign(file=i + 1)
    company_frames.append(df)
    i += 1

if company_frames:
    company_ref = pd.concat(company_frames, ignore_index=True)
else:
    # No input files: still produce a frame with the expected columns.
    company_ref = pd.DataFrame(columns=company_columns + ['file'])

company_ref.to_csv(os.path.join(folder_path, 'company_ref_all.csv'), index=False)
print(f"Processed {i} of", len(file_list), "files")

Processed 32 of 32 files
CPU times: total: 1min 33s
Wall time: 2min 1s
