# Wordrank Algorithm

Basic wordrank algorithm for FOSL dataset.
Currently only works for the English language. For other languages you will need to add/subtract things to the pruning step to fit that language's grammar.

In [1]:
import pandas as pd

In [2]:
# Import sample dataset
# I made the description all lowercase since case sensitivity introduces tons of bugs
df = pd.read_csv("Contacts From Web Confrences.csv")
df['description'] = df['description'].str.lower()
df

Unnamed: 0,firstname,lastname,title,organization,description
0,Bill,Kemp,CEO,United Space Structures,commercialization of space – buildings & struc...
1,Boris,Goldstein,CEO,Grapheneca,graphene nanomaterials – applications in energ...
2,Billy,Barnwell,Founder & CEO,Climate Cure Capital,development of carbon negative technologies & ...
3,Irfan,Ahmad,"Executive Director, Interdisciplinary Initiatives",Grainger College of Engineering,health innovation at the intersection of engin...
4,JA,Colantonio,"Program Manager, Innovation & Entrepreneurship",National Renewable Energy Lab (NREL),new technologies & opportunities to work with ...
5,James,Santore,Director of Marketing & Social Media,Seventy Six Capital,"trends & updates from games, sports & e-sports..."
6,Ali,Khademhosseini,Director,Terasaki Institute for Biomedical Innovation,"combining genomics, proteomics, metabolomics &..."
7,Anna Jamell,Siefken,"Executive Director, Wliton Scott Institute for...",Carnegie Mellon University,breakthrough technologies to accelerate the tr...
8,Nicole,Lazzaro,President,XEODesign,making games that unlock human potential and i...
9,Rhonda,Stevenson,CEO,Orbital Assembly Corp,building gravity in space


In [3]:
# Join all descriptions so they form a block of text. Each description is separated with a space " "
text = " ".join(df.description)
text

'commercialization of space – buildings & structures on the moon graphene nanomaterials – applications in energy storage, healthy buildings, and more development of carbon negative technologies & trading carbon credits health innovation at the intersection of engineering & medicine new technologies & opportunities to work with nrel trends & updates from games, sports & e-sports leading vc investment firm combining genomics, proteomics, metabolomics & other -omics to create true personalized healthcare breakthrough technologies to accelerate the transition to a sustainable, low-carbon energy future making games that unlock human potential and improve quality of life through play building gravity in space transforming and revitalizing industrial commodities development and commercialization of injury preventive athletic footwear, sports equipment and products for health and well-being social commerce – apps & web-based tools to help people to spend their money in accordance with their va

In [4]:
# Turn the block of text into a list of individual words
words = text.split()
words

['commercialization',
 'of',
 'space',
 '–',
 'buildings',
 '&',
 'structures',
 'on',
 'the',
 'moon',
 'graphene',
 'nanomaterials',
 '–',
 'applications',
 'in',
 'energy',
 'storage,',
 'healthy',
 'buildings,',
 'and',
 'more',
 'development',
 'of',
 'carbon',
 'negative',
 'technologies',
 '&',
 'trading',
 'carbon',
 'credits',
 'health',
 'innovation',
 'at',
 'the',
 'intersection',
 'of',
 'engineering',
 '&',
 'medicine',
 'new',
 'technologies',
 '&',
 'opportunities',
 'to',
 'work',
 'with',
 'nrel',
 'trends',
 '&',
 'updates',
 'from',
 'games,',
 'sports',
 '&',
 'e-sports',
 'leading',
 'vc',
 'investment',
 'firm',
 'combining',
 'genomics,',
 'proteomics,',
 'metabolomics',
 '&',
 'other',
 '-omics',
 'to',
 'create',
 'true',
 'personalized',
 'healthcare',
 'breakthrough',
 'technologies',
 'to',
 'accelerate',
 'the',
 'transition',
 'to',
 'a',
 'sustainable,',
 'low-carbon',
 'energy',
 'future',
 'making',
 'games',
 'that',
 'unlock',
 'human',
 'potential',

In [5]:
# Set up stopwords: a common group of words that are not meaningful and should be removed
# I import a bunch of predetermined stopwords from another module. Then I add some of my own
from wordcloud import STOPWORDS
stopwords = set(STOPWORDS)
stopwords.update(['building', 'transforming', 'cost', 'commodities', 'revitalizing',\
                  'commercialization', 'industrial', 'development', 'technologies', 'opportunities',\
                 'innovation', 'new', 'investment', 'low', 'systems', 'needs', 'battle', '–', '&', '-omics', '2020'])

In [6]:
stopwords

{'&',
 '-omics',
 '2020',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'also',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'battle',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'building',
 'but',
 'by',
 'can',
 "can't",
 'cannot',
 'com',
 'commercialization',
 'commodities',
 'cost',
 'could',
 "couldn't",
 'development',
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'else',
 'ever',
 'few',
 'for',
 'from',
 'further',
 'get',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'hence',
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'however',
 'http',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'industrial',
 'innovation',
 'into',
 'investment',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'k',
 "let's",
 'like',
 'low',
 '

In [7]:
# Prune stopwords from our list of words before sorting
words_ps = [x for x in words if x not in stopwords]

In [8]:
words_ps

['space',
 'buildings',
 'structures',
 'moon',
 'graphene',
 'nanomaterials',
 'applications',
 'energy',
 'storage,',
 'healthy',
 'buildings,',
 'carbon',
 'negative',
 'trading',
 'carbon',
 'credits',
 'health',
 'intersection',
 'engineering',
 'medicine',
 'work',
 'nrel',
 'trends',
 'updates',
 'games,',
 'sports',
 'e-sports',
 'leading',
 'vc',
 'firm',
 'combining',
 'genomics,',
 'proteomics,',
 'metabolomics',
 'create',
 'true',
 'personalized',
 'healthcare',
 'breakthrough',
 'accelerate',
 'transition',
 'sustainable,',
 'low-carbon',
 'energy',
 'future',
 'making',
 'games',
 'unlock',
 'human',
 'potential',
 'improve',
 'quality',
 'life',
 'play',
 'gravity',
 'space',
 'injury',
 'preventive',
 'athletic',
 'footwear,',
 'sports',
 'equipment',
 'products',
 'health',
 'well-being',
 'social',
 'commerce',
 'apps',
 'web-based',
 'tools',
 'help',
 'people',
 'spend',
 'money',
 'accordance',
 'values,',
 'beliefs',
 'priorities.',
 'high',
 'tech,',
 'cost,',
 

In [9]:
# Prune and sort words by frequency

# Create dictionary
d = {}
for word in words_ps:
    # Pruning
    # Prune off parentheses [] and ()
    if word.startswith('[') or word.startswith('('):
        word = word[1:]
    if word.endswith(']') or word.endswith(')'):
        word = word[:-1]
        
    # Prune off ending punctuation
    if word.endswith('.') or word.endswith(',') or word.endswith(';') or word.endswith(':'):
        word = word[:-1]
        
    # If word is already in dictionary, increase the value by 1
    # Otherwise add the word to dictionary with value 1
    if word in d:
        d[word] += 1
    else:
        d[word] = 1

# Order dictionary frequency in ascending order
# For this algorithm test I am only using one description group per person, so as the algorithm goes down the list
# it will replace the lower value key with a higher value one
d = dict(sorted(d.items(), key = lambda item: item[1]))

In [10]:
# Done
d

{'structures': 1,
 'moon': 1,
 'graphene': 1,
 'nanomaterials': 1,
 'applications': 1,
 'storage': 1,
 'healthy': 1,
 'negative': 1,
 'trading': 1,
 'credits': 1,
 'intersection': 1,
 'engineering': 1,
 'medicine': 1,
 'work': 1,
 'nrel': 1,
 'trends': 1,
 'updates': 1,
 'e-sports': 1,
 'leading': 1,
 'vc': 1,
 'firm': 1,
 'combining': 1,
 'genomics': 1,
 'proteomics': 1,
 'metabolomics': 1,
 'create': 1,
 'true': 1,
 'personalized': 1,
 'breakthrough': 1,
 'accelerate': 1,
 'transition': 1,
 'sustainable': 1,
 'low-carbon': 1,
 'future': 1,
 'making': 1,
 'unlock': 1,
 'human': 1,
 'potential': 1,
 'improve': 1,
 'quality': 1,
 'play': 1,
 'gravity': 1,
 'injury': 1,
 'preventive': 1,
 'athletic': 1,
 'footwear': 1,
 'equipment': 1,
 'products': 1,
 'well-being': 1,
 'social': 1,
 'commerce': 1,
 'apps': 1,
 'web-based': 1,
 'tools': 1,
 'help': 1,
 'people': 1,
 'spend': 1,
 'money': 1,
 'accordance': 1,
 'values': 1,
 'beliefs': 1,
 'priorities': 1,
 'high': 1,
 'tech': 1,
 'cost': 

# Using Wordrank to Create Groups

In [11]:
# Create description group
# This method is very slow, for large lists it will be unfeasable. Will need to replace with something more
# efficient down the line
for i in range(len(df)):
    for j in range(len(d)):
        df.loc[df['description'].str.contains(list(d.keys())[j]), 'dgroup'] = list(d.keys())[j]

In [12]:
# Sort df by description group, alphabetically
df_sort = df.sort_values('dgroup').reset_index()

# Done
df_sort

Unnamed: 0,index,firstname,lastname,title,organization,description,dgroup
0,15,Jeffrey,Arle,CEO,Intellihat Inc.,wearable technology designed to slow the effec...,aging
1,17,Tim,Brosnihan,Executive Director,MEMS & Sensors Industry Group,opportunities & needs in american semiconducto...,america
2,0,Bill,Kemp,CEO,United Space Structures,commercialization of space – buildings & struc...,buildings
3,2,Billy,Barnwell,Founder & CEO,Climate Cure Capital,development of carbon negative technologies & ...,carbon
4,14,Reyad,Fezzani,CEO,Carbon Collect Inc.,"mechanical trees, carbon collection & reuse, &...",carbon
5,7,Anna Jamell,Siefken,"Executive Director, Wliton Scott Institute for...",Carnegie Mellon University,breakthrough technologies to accelerate the tr...,carbon
6,18,Jeb,Linton,"CTO, Partner Ecosystem & Cognitive Security",IBM Watson Cloud,applied deep learning & decentralized systems ...,challenges
7,1,Boris,Goldstein,CEO,Grapheneca,graphene nanomaterials – applications in energ...,health
8,3,Irfan,Ahmad,"Executive Director, Interdisciplinary Initiatives",Grainger College of Engineering,health innovation at the intersection of engin...,health
9,19,Pamela,Contag,CEO,BioEclipse Therapeutics Inc.,precision immunotherapies to battle unmet heal...,healthcare


In [13]:
# Example: Find people in description group "health"
df[df.dgroup == 'health']

Unnamed: 0,firstname,lastname,title,organization,description,dgroup
1,Boris,Goldstein,CEO,Grapheneca,graphene nanomaterials – applications in energ...,health
3,Irfan,Ahmad,"Executive Director, Interdisciplinary Initiatives",Grainger College of Engineering,health innovation at the intersection of engin...,health
