In [186]:
import pandas as pd
import glob
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def csvFolderToDataframe(path):
    """
    Load every ``*.csv`` file directly inside ``path`` into one dataframe.

    Files are read in sorted filename order so that the row order of the
    result (and therefore which row a later ``drop_duplicates(keep='first')``
    retains) does not depend on the order in which the OS lists the folder.

    :param path: folder to search (non-recursively) for ``*.csv`` files
    :return: row-wise concatenation of all files, re-indexed from 0
    :raises ValueError: if the folder contains no ``*.csv`` files
    """
    csv_paths = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not csv_paths:
        # pd.concat([]) would raise a ValueError anyway; raise the same type
        # with a message that actually names the offending folder.
        raise ValueError(f"No CSV files found in {path!r}")

    frames = [pd.read_csv(csv_path, index_col=None, header=0) for csv_path in csv_paths]
    return pd.concat(frames, axis=0, ignore_index=True)

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=1):
    """
    Annotate each row of the left table with its fuzzy matches from the right table.

    The input frames are not modified; the result is a copy of ``df_1`` with
    one extra ``matches`` column.

    :param df_1: the left table to join
    :param df_2: the right table to join
    :param key1: key column of the left table
    :param key2: key column of the right table
    :param threshold: minimum score a candidate must reach to be kept; it is
        compared against the score returned by ``process.extract``
        (higher means more similar), not against an edit distance
    :param limit: the amount of candidates requested per row, these are sorted high to low
    :return: copy of ``df_1`` with a ``matches`` column holding the kept
        candidates joined by ``', '`` (empty string when nothing qualifies).
        With ``limit > 1`` a cell may hold several names, so it is only
        directly usable as an exact-merge key when ``limit=1``.
    """
    # Missing values are not strings that can be scored, so keep them out of
    # the candidate list instead of letting the matcher fail on them.
    choices = df_2[key2].dropna().tolist()

    def _matches_for(value):
        # A missing / non-string key on the left side simply has no match.
        if not isinstance(value, str):
            return ''
        scored = process.extract(value, choices, limit=limit)
        return ', '.join(hit[0] for hit in scored if hit[1] >= threshold)

    # Work on a copy so re-running the calling cell does not see a frame that
    # already carries a stale 'matches' column.
    result = df_1.copy()
    result['matches'] = result[key1].apply(_matches_for)
    return result

In [187]:
def processGetro(source_dir=r'D:\GitHub\crypto-jobs-scraper\node-scraper\data\getro',
                 output_csv=r'D:\GitHub\crypto-jobs-scraper\data\getro\all_jobs.csv'):
    """
    Combine the Getro CSV exports, de-duplicate them and write the result to disk.

    Rows are de-duplicated first on 'Job Link' and then on 'Getro ObjectID',
    keeping the first occurrence each time.

    :param source_dir: folder holding the per-board Getro CSV files
    :param output_csv: destination of the combined CSV; only the columns listed
        in ``header`` are written, without the index
    :return: the de-duplicated dataframe with all of its original columns
    """
    frame = csvFolderToDataframe(source_dir)
    # NOTE(review): drop_duplicates treats missing values as equal, so rows
    # lacking a 'Getro ObjectID' would collapse into one - confirm the ID is always set.
    deduplicated = (frame.drop_duplicates(subset='Job Link')
                         .drop_duplicates(subset='Getro ObjectID'))

    header = ["Company Name", "Job Link", "Job Location", "Job Title", "Salary Range", "Tags", "Posted Before"]
    deduplicated.to_csv(output_csv, columns=header, index=False)
    return deduplicated

getro = processGetro()

In [188]:
def processConsider(source_dir=r'D:\GitHub\crypto-jobs-scraper\node-scraper\data\consider',
                    output_csv=r'D:\GitHub\crypto-jobs-scraper\data\consider\all_jobs.csv'):
    """
    Combine the Consider CSV exports, de-duplicate them and write the result to disk.

    Rows are de-duplicated first on 'Job Link' and then on 'Consider JobID',
    keeping the first occurrence each time.

    :param source_dir: folder holding the per-board Consider CSV files
    :param output_csv: destination of the combined CSV; only the columns listed
        in ``header`` are written, without the index
    :return: the de-duplicated dataframe with all of its original columns
    """
    frame = csvFolderToDataframe(source_dir)
    # NOTE(review): drop_duplicates treats missing values as equal, so rows
    # lacking a 'Consider JobID' would collapse into one - confirm the ID is always set.
    deduplicated = (frame.drop_duplicates(subset='Job Link')
                         .drop_duplicates(subset='Consider JobID'))

    header = ["Company Name", "Job Link", "Job Location", "Job Title", "Salary Range", "Tags", "Posted Before"]
    deduplicated.to_csv(output_csv, columns=header, index=False)
    return deduplicated

consider = processConsider()

In [189]:
# Stack every CSV found in the web3_careers page_data folder into one frame.
web3Career = csvFolderToDataframe(
    r'D:\GitHub\crypto-jobs-scraper\data\web3_careers\page_data'
)

In [190]:
# This source is a single CSV file; its first row is the header and no
# column is used as the index.
cryptoJobsList = pd.read_csv(
    r'D:\GitHub\crypto-jobs-scraper\data\crypto_jobs_list\all_jobs.csv',
    header=0,
    index_col=None,
)

In [191]:
# (rows, columns) of each source frame, printed on a single line.
print(*(source.shape for source in (getro, consider, web3Career, cryptoJobsList)))

(7085, 10) (1584, 10) (17458, 7) (268, 4)


In [192]:
# Columns kept for the combined VC job-board export.
header = [
    "Company Name", "Job Link", "Job Location", "Job Title",
    "Salary Range", "Tags", "Posted Before",
]

# Stack both board sources, then keep the first row seen for each job link.
getroAndConsiderJobs = pd.concat([getro, consider])
vcBoardJobs = getroAndConsiderJobs.drop_duplicates(subset='Job Link')[header]

vcBoardJobs.to_csv(
    r'D:\GitHub\crypto-jobs-scraper\data\vc-job-boards\all_jobs.csv',
    columns=header,
    index=False,
)

In [194]:
# Row-wise stack of all three job sources; original index labels are kept.
concatenatedJobs = pd.concat(
    [vcBoardJobs, web3Career, cryptoJobsList],
    axis=0,
)

In [195]:
# One row per job link across all sources; the first occurrence wins.
allJobs = concatenatedJobs.drop_duplicates(subset=['Job Link'])

In [196]:
# Shapes before and after de-duplicating on the job link.
print(*(jobs.shape for jobs in (concatenatedJobs, allJobs)))

(25781, 7) (25514, 7)


In [197]:
# Top 100 Blockchain Companies (from Crunchbase)
top100Companies = [
    "Coinbase", "CoinDCX", "Animoca Brands", "Polygon", "CertiK",
    "CoinList", "NYDIG", "Axie Infinity", "Terra", "OpenSea",
    "Argent", "LayerZero Labs", "Algorand", "Helium", "Blockdaemon",
    "NEAR Protocol", "DFINITY", "2TM", "Circle", "21Shares",
    "StarkWare Industries", "Celsius Network", "Figment", "0x", "Securitize",
    "Ava Labs", "Brave", "5ire", "Ethereum Foundation", "Blockchain Capital",
    "Abra", "Dapper Labs", "Flipside Crypto", "BlockTower Capital", "Fasset",
    "WonderFi", "Yield Guild Games Southeast Asia", "Yuga Labs", "Bitpanda", "Aave",
    "Crowdz", "Decentraland", "Optimism", "Prime Trust", "Aptos",
    "BitMart", "BlockApps", "Mina", "Drip Capital", "Figure",
    "Community Gaming", "RockX", "Stellar Development Foundation", "bitsCrunch", "Functionland",
    "STEPN", "Ankr", "Boba Network", "Blockmetrix", "Bitwise",
    "dYdX", "RealBlocks", "Rarible", "Enjin", "Okcoin",
    "Venly", "Dfns", "Roll", "Minka", "Recur",
    "KuCoin", "Unstoppable Domains", "TRM Labs", "Spruce", "Chainlink",
    "Vauld", "Huobi", "Harmony", "CoinTracker", "Core Scientific",
    "Arweave", "ConsenSys", "Wirex", "Tangany", "Forte",
    "Mythical Games", "Zenith Chain", "BlockFi", "Storj", "Ancient8",
    "Aleo", "OneOf", "Stacks", "Propy", "Dune Analytics",
    "Hyperithm", "Gauntlet", "Celo", "Chia Network", "Yield Guild Games",
]

# Single-column frame ('name') so the list can be fed to the dataframe-based matching below.
top100CompaniesDf = pd.DataFrame({'name': top100Companies})

In [198]:
# Number of job rows per company name, largest first.
jobsPerCompany = (
    allJobs
    .groupby('Company Name')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [200]:
# Attach to each top-100 name the single best-scoring scraped company name,
# kept only when its score is at least 80.
finalDf = fuzzy_merge(
    df_1=top100CompaniesDf,
    df_2=jobsPerCompany,
    key1='name',
    key2='Company Name',
    threshold=80,
    limit=1,
)

In [201]:
# Bring in the job count of the matched company; rows without a match keep
# empty strings instead of NaN.
joined = pd.merge(
    finalDf,
    jobsPerCompany,
    how='left',
    left_on='matches',
    right_on='Company Name',
).fillna('')

In [202]:
# Persist the matched table next to the notebook, without the index column.
joined.to_csv(path_or_buf='matched_data.csv', index=False)