# Wrangling ViableEDU's LinkedIn data

* Data cleaning and aggregation
* Generate features
* Generate insights


In [1]:
# Libraries
import time
import numpy as np
import pandas as pd
import re
import math as mt
from datetime import datetime
import pickle

In [2]:
# Load all the datasets
# Three CSV files and one pickle are read from the relative Output/ directory.
followersDF = pd.read_csv('Output/followersList.csv')
postsInfo = pd.read_csv('Output/postsInfo.csv')
likesInfo = pd.read_csv('Output/likesInfo.csv')
# Later cells iterate peoplePKL.keys() and read the 'personalInfo', 'jobsInfo',
# 'skillsInfo' and 'eduInfo' entries of each value.
# NOTE(review): unpickling can execute arbitrary code; only load a profiles.pkl
# that was produced by a trusted source.
peoplePKL = pd.read_pickle('Output/profiles.pkl')

In [3]:
# Create the dataframe with personal information.
# One single-row frame is built per person and all of them are concatenated in
# one step: DataFrame.append was removed in pandas 2.0, and appending inside a
# loop copies the whole frame on every iteration.
personalColumns = ['name','jobTitle','company','school','location','connections','summary']
personalRows = []
for person, profile in peoplePKL.items():
    row = pd.DataFrame.from_dict(profile['personalInfo'],orient='index').T
    row.loc[0,'name'] = person  # the profile key is stored as the person's name
    personalRows.append(row)

if personalRows:
    peopleDF = pd.concat(personalRows, ignore_index=True, sort=False)
    # Expected columns come first, followed by any additional scraped fields.
    extraColumns = [col for col in peopleDF.columns if col not in personalColumns]
    peopleDF = peopleDF.reindex(columns=personalColumns + extraColumns)
else:
    # pd.concat raises on an empty list, so an empty input gets an empty frame.
    peopleDF = pd.DataFrame(columns=personalColumns)
peopleDF.head()

Unnamed: 0,name,jobTitle,company,school,location,connections,summary
0,paul gover,"Head of Financial Markets, Europe & Americas a...",Westpac,ICMA Centre,"London, England, United Kingdom",500+ connections,
1,David Holden,Student at The University of Michigan,,University of Michigan,"Atlanta, Georgia, United States",227 connections,
2,"Larkin Ison, Jr.",Business Units Techical Training Manager,Westinghouse Electric Company,,Huntsville-Decatur-Albertville Area,500+ connections,Technical leader with diverse and extensive ex...
3,Brian Lane,Founder & CEO at FixtHub,FixtHub,Lehigh University,New York City Metropolitan Area,500+ connections,
4,Timothy Oberweger,"Vice President, Sr. Business Development Offic...",Stewart Title,Brooklyn Law School,"Fairfield, Connecticut, United States",500+ connections,"In my role as Vice President, Senior Business ..."


### Job features creation

In [4]:
# Helpers
def yearsExp(job):
    """Return the 'Employment Duration' value of one job entry.

    Parameters
    ----------
    job : sequence
        Sequence in which the label 'Employment Duration' is immediately
        followed by its value (presumably a flat list of scraped strings --
        confirm against the scraper output).

    Returns
    -------
    The element following 'Employment Duration', or ``np.nan`` when the label
    or its value is missing, ``job`` is not a usable sequence, or the first
    element contains 'Student Athlete'.
    """
    try:
        i = job.index('Employment Duration')
        if 'Student Athlete' in job[0]:
            # This path used to fall through and return None implicitly; the
            # same missing marker is now returned on every failure path.
            return np.nan
        return job[i+1]
    except (AttributeError, LookupError, TypeError, ValueError):
        # np.nan rather than np.NAN: the upper-case alias was removed in NumPy 2.0.
        return np.nan

def totalExp(yExp):
    """Sum a collection of duration strings and return the total in years.

    Parameters
    ----------
    yExp : iterable
        Duration texts such as '2 yrs 3 mos', '1 yr' or '5 mos'. Entries that
        are not strings (None, NaN, ...) contribute nothing.

    Returns
    -------
    float
        Total experience in years (months / 12), or ``np.nan`` when ``yExp``
        is not iterable.
    """
    try:
        durations = list(yExp)
    except TypeError:
        # np.nan rather than np.NaN: the mixed-case alias was removed in NumPy 2.0.
        return np.nan

    months = 0
    for duration in durations:
        if not isinstance(duration, str):
            continue
        # Years and months are parsed independently. Previously a missing
        # year part raised before the month part was read, so a months-only
        # duration such as '5 mos' was silently dropped.
        yearsFound = re.search(r"(\d+) yr", duration)
        monthsFound = re.search(r"(\d+) mo", duration)
        if yearsFound:
            months += int(yearsFound.group(1)) * 12
        if monthsFound:
            months += int(monthsFound.group(1))
    return months / 12
    
def lastJob(job):
    """Return the 'Title' value of one job entry.

    Parameters
    ----------
    job : sequence
        Sequence in which the label 'Title' is immediately followed by its
        value.

    Returns
    -------
    The element following 'Title', or ``''`` when the label or its value is
    missing, ``job`` is not a usable sequence, or the first element contains
    'Student Athlete'.
    """
    try:
        i = job.index('Title')
        if 'Student Athlete' in job[0]:
            # This path used to return None implicitly; '' is now returned on
            # every failure path.
            return ''
        return job[i+1]
    except (AttributeError, LookupError, TypeError, ValueError):
        return ''

def activeJob(job):
    """Tell whether one job entry is still ongoing.

    Parameters
    ----------
    job : sequence
        Sequence in which the label 'Dates Employed' is immediately followed
        by its value.

    Returns
    -------
    bool
        True when the value following 'Dates Employed' contains 'Present'.
        False otherwise, including when the label or its value is missing,
        ``job`` is not a usable sequence, or the first element contains
        'Student Athlete'.
    """
    try:
        i = job.index('Dates Employed')
        if 'Student Athlete' in job[0]:
            # This path used to return None implicitly; a real boolean is now
            # returned on every path.
            return False
        return 'Present' in job[i+1]
    except (AttributeError, LookupError, TypeError, ValueError):
        return False

def lastCompany(job):
    """Return the 'Company Name' value of one job entry.

    Parameters
    ----------
    job : sequence
        Sequence in which the label 'Company Name' is immediately followed by
        its value.

    Returns
    -------
    The element following 'Company Name', or ``''`` when the label or its
    value is missing, ``job`` is not a usable sequence, or the first element
    contains 'Student Athlete'.
    """
    try:
        i = job.index('Company Name')
        if 'Student Athlete' in job[0]:
            # This path used to return None implicitly; '' is now returned on
            # every failure path.
            return ''
        return job[i+1]
    except (AttributeError, LookupError, TypeError, ValueError):
        return ''

In [5]:
# Years of work experience: sum the employment durations of every job listed
# for a person; any failure while reading the profile falls back to 0.
for person, profile in peoplePKL.items():
    isPerson = peopleDF.name == person
    try:
        durations = [yearsExp(job) for job in profile['jobsInfo'].values()]
        peopleDF.loc[isPerson,'yearsExp'] = totalExp(durations)
    except:  # NOTE(review): bare except kept as-is; it also hides unexpected errors
        peopleDF.loc[isPerson,'yearsExp'] = 0

In [6]:
# Last job title and whether that job is still active, both read from the
# 'job0' entry of each profile; '' is stored for both when it cannot be read.
for person, profile in peoplePKL.items():
    isPerson = peopleDF.name == person
    try:
        latestJob = profile['jobsInfo']['job0']
        peopleDF.loc[isPerson,'currentJobTitle'] = lastJob(latestJob)
        peopleDF.loc[isPerson,'activeJob'] = activeJob(latestJob)
    except:  # NOTE(review): bare except kept as-is; it also hides unexpected errors
        peopleDF.loc[isPerson,'currentJobTitle'] = ''
        peopleDF.loc[isPerson,'activeJob'] = ''

In [7]:
# Number of jobs listed per person and the employer of the 'job0' entry;
# falls back to 0 jobs and an empty employer when the profile cannot be read.
for person, profile in peoplePKL.items():
    isPerson = peopleDF.name == person
    try:
        jobs = profile['jobsInfo']
        peopleDF.loc[isPerson,'numberJobs'] = len(jobs)
        peopleDF.loc[isPerson,'lastEmployer'] = lastCompany(jobs['job0'])
    except:  # NOTE(review): bare except kept as-is; it also hides unexpected errors
        peopleDF.loc[isPerson,'numberJobs'] = 0
        peopleDF.loc[isPerson,'lastEmployer'] = ''

In [8]:
# Preview the frame after adding yearsExp, currentJobTitle, activeJob,
# numberJobs and lastEmployer.
peopleDF.head()

Unnamed: 0,name,jobTitle,company,school,location,connections,summary,yearsExp,currentJobTitle,activeJob,numberJobs,lastEmployer
0,paul gover,"Head of Financial Markets, Europe & Americas a...",Westpac,ICMA Centre,"London, England, United Kingdom",500+ connections,,20.666667,"Head of Financial Markets, Europe & Americas",True,3.0,Westpac
1,David Holden,Student at The University of Michigan,,University of Michigan,"Atlanta, Georgia, United States",227 connections,,0.0,,,0.0,
2,"Larkin Ison, Jr.",Business Units Techical Training Manager,Westinghouse Electric Company,,Huntsville-Decatur-Albertville Area,500+ connections,Technical leader with diverse and extensive ex...,7.083333,Business Units Techical Training Manager,True,6.0,Westinghouse Electric Company
3,Brian Lane,Founder & CEO at FixtHub,FixtHub,Lehigh University,New York City Metropolitan Area,500+ connections,,18.5,,True,5.0,FixtHub
4,Timothy Oberweger,"Vice President, Sr. Business Development Offic...",Stewart Title,Brooklyn Law School,"Fairfield, Connecticut, United States",500+ connections,"In my role as Vice President, Senior Business ...",15.333333,,True,5.0,Stewart Title


### Education and skills features creation

In [9]:
# Helpers
def highestDegree(eduInfo):
    """Classify the highest degree mentioned in a person's education sections.

    Parameters
    ----------
    eduInfo : iterable of iterables of str
        One inner sequence of strings per education entry.

    Returns
    -------
    str
        'PhD', 'Master', 'JD' or 'Bachelor', checked in that order.
        'Bachelor' is the fallback when nothing else matches, including for
        empty input.
    """
    flat = [item for sublist in eduInfo for item in sublist]
    # The previous test `'phd | PHD | Ph' in i` looked for that literal text,
    # so a PhD was never detected. Match real spellings instead (PhD, phd,
    # Ph.D., Ph D, Doctor of Philosophy) without matching words that merely
    # start with "Ph", such as "Physics" or "Philosophy".
    phdPattern = r"\bph\.?\s*d\b|doctor of philosophy"
    if any(re.search(phdPattern, i, re.IGNORECASE) for i in flat):
        return 'PhD'
    if any('Master' in i for i in flat):
        return 'Master'
    if any('JD' in i for i in flat):
        return 'JD'
    return 'Bachelor'

In [10]:
def fields(eduInfo):
    """Return the value that follows 'Field Of Study' in one education entry.

    Returns None when the label or its value is missing, when the value
    contains 'MOOC', or when the entry cannot be read at all. Callers filter
    on ``is not None``.
    """
    try:
        study = eduInfo[eduInfo.index('Field Of Study') + 1]
        return None if 'MOOC' in study else study
    except:  # NOTE(review): bare except kept as-is; every failure yields None
        return None

In [11]:
# Top skills and highest degree per person.
# The two features are computed independently: previously both shared one
# try block, so a failure while computing the degree also erased skills that
# had been read correctly (and a missing skills section blanked the degree).
for person, profile in peoplePKL.items():
    isPerson = peopleDF.name == person
    try:
        topSkills = ','.join(profile['skillsInfo'])
    except (AttributeError, KeyError, TypeError):
        # Section missing or not a sequence of strings.
        topSkills = ''
    try:
        degree = highestDegree(profile['eduInfo'])
    except (AttributeError, KeyError, TypeError):
        # Section missing or not in the expected nested shape.
        degree = ''
    peopleDF.loc[isPerson,'topSkills'] = topSkills
    peopleDF.loc[isPerson,'highestDegree'] = degree

In [12]:
# Fields of study: comma-joined 'Field Of Study' values across a person's
# education entries, skipping entries for which fields() returns None.
for person, profile in peoplePKL.items():
    isPerson = peopleDF.name == person
    try:
        studied = []
        for section in profile['eduInfo']:
            field = fields(section)
            if field is not None:
                studied.append(field)
        peopleDF.loc[isPerson,'fieldsStudy'] = ','.join(studied)
    except:  # NOTE(review): bare except kept as-is; it also hides unexpected errors
        peopleDF.loc[isPerson,'fieldsStudy'] = ''

In [13]:
# Preview the frame after adding topSkills, highestDegree and fieldsStudy.
peopleDF.head()

Unnamed: 0,name,jobTitle,company,school,location,connections,summary,yearsExp,currentJobTitle,activeJob,numberJobs,lastEmployer,topSkills,highestDegree,fieldsStudy
0,paul gover,"Head of Financial Markets, Europe & Americas a...",Westpac,ICMA Centre,"London, England, United Kingdom",500+ connections,,20.666667,"Head of Financial Markets, Europe & Americas",True,3.0,Westpac,"Capital Markets,Trading,Financial Markets",Master,"International Securities, Investment Banking,A..."
1,David Holden,Student at The University of Michigan,,University of Michigan,"Atlanta, Georgia, United States",227 connections,,0.0,,,0.0,,"Mentoring,Microsoft Office,Social Media",Bachelor,"Industrial Engineering,Applied Physics"
2,"Larkin Ison, Jr.",Business Units Techical Training Manager,Westinghouse Electric Company,,Huntsville-Decatur-Albertville Area,500+ connections,Technical leader with diverse and extensive ex...,7.083333,Business Units Techical Training Manager,True,6.0,Westinghouse Electric Company,"Program Management,Business Process Improvemen...",Bachelor,"Business - Management of Technology,Electrical..."
3,Brian Lane,Founder & CEO at FixtHub,FixtHub,Lehigh University,New York City Metropolitan Area,500+ connections,,18.5,,True,5.0,FixtHub,"Fixed Income,Trading,Capital Markets",Bachelor,Business and Economics
4,Timothy Oberweger,"Vice President, Sr. Business Development Offic...",Stewart Title,Brooklyn Law School,"Fairfield, Connecticut, United States",500+ connections,"In my role as Vice President, Senior Business ...",15.333333,,True,5.0,Stewart Title,"Real Estate,Real Estate Transactions,Commercia...",JD,"Law,Political Science, Communications, & Relig..."


In [14]:
# Write the enriched profiles to CSV (UTF-8, without the index column).
peopleDF.to_csv('Output/profiles.csv',index = False, encoding='utf-8')