# Scraping LinkedIn profiles

In [430]:
import os
import requests
import re
import pandas as pd
from requests.exceptions import RequestException
import json
from pandas import json_normalize
import datetime

# Function that scrapes LinkedIn profiles

In [415]:
def scrape_linkedin_profile(linkedin_profile_url: str, timeout: float = 60.0):
    """Fetch a LinkedIn profile through the Proxycurl API and trim it.

    Args:
        linkedin_profile_url: URL of the LinkedIn profile to look up.
        timeout: Seconds to wait for the API before ``requests`` gives up, so
            a stalled connection cannot block the caller forever.

    Returns:
        The decoded JSON payload as a dict, without empty values
        (``[]``, ``""``, ``None``), without the keys listed in
        ``skipped_keys`` and with ``profile_pic_url`` removed from every
        entry under ``"groups"``. The HTTP status is not checked, so an error
        payload sent by the API is returned like any other payload.

    Raises:
        requests.exceptions.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): a live credential is committed in this file. It should be
    # revoked and provided only through the PROXYCURL_API_KEY environment
    # variable; the literal is kept as a fallback so existing runs still work.
    api_key = os.environ.get("PROXYCURL_API_KEY") or "2ZjHkun0Sq6KGa_o-4p7jg"

    response = requests.get(
        "https://nubela.co/proxycurl/api/v2/linkedin",
        params={"url": linkedin_profile_url},
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=timeout,
    )

    skipped_keys = {
        "people_also_viewed",
        "recommendations",
        "similarly_named_profiles",
        "articles",
        "background_cover_image_url",
        "activities",
        "volunteer_work",
    }
    empty_values = ([], "", None)

    profile = {
        key: value
        for key, value in response.json().items()
        if value not in empty_values and key not in skipped_keys
    }

    for group in profile.get("groups") or []:
        # A default is passed so that a group without this key does not
        # raise KeyError.
        group.pop("profile_pic_url", None)

    return profile

# This function manually scrapes a LinkedIn profile, taking the person's URL as argument

In [418]:
def scrape_linkedin_profile2(linkedin_profile_url: str, timeout: float = 60.0):
    """Fetch and trim a LinkedIn profile, returning ``None`` on failure.

    Meant to be applied to a whole column of URLs: a profile that cannot be
    fetched or decoded yields ``None`` for that row instead of aborting the
    run.

    Args:
        linkedin_profile_url: URL of the LinkedIn profile to look up.
        timeout: Seconds to wait for the API before giving up on this profile.

    Returns:
        The decoded JSON payload as a dict, without empty values
        (``[]``, ``""``, ``None``), without the keys listed in
        ``skipped_keys`` and with ``profile_pic_url`` removed from every
        entry under ``"groups"``; or ``None`` when the request fails or the
        body is not valid JSON. The HTTP status is not checked, so an error
        payload sent by the API (e.g. a 404 description) is returned as a
        dict.
    """
    # NOTE(review): a live credential is committed in this file. It should be
    # revoked and provided only through the PROXYCURL_API_KEY environment
    # variable; the literal is kept as a fallback so existing runs still work.
    api_key = os.environ.get("PROXYCURL_API_KEY") or "2ZjHkun0Sq6KGa_o-4p7jg"

    try:
        response = requests.get(
            "https://nubela.co/proxycurl/api/v2/linkedin",
            params={"url": linkedin_profile_url},
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=timeout,
        )
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError is the common base of the JSON decode errors raised by
        # both the standard json module and simplejson, whichever requests
        # happens to use.
        return None

    skipped_keys = {
        "people_also_viewed",
        "recommendations",
        "similarly_named_profiles",
        "articles",
        "background_cover_image_url",
        "activities",
        "volunteer_work",
    }
    empty_values = ([], "", None)

    profile = {
        key: value
        for key, value in payload.items()
        if value not in empty_values and key not in skipped_keys
    }

    for group in profile.get("groups") or []:
        # A default is passed so that a group without this key does not
        # raise KeyError.
        group.pop("profile_pic_url", None)

    return profile



# Scraping and cleaning our dataframes

## First dataframe: 2015-2016 class

In [514]:
# Load the 2015-2016 class CSV and keep only the rows that have a profile
# link (rows without one carry the text 'Link no existente').
df1 = pd.read_csv('/Users/davidfernandez/Desktop/clean/scrapped/2015-2016.csv')
df1 = df1[df1['Link perfil'] != 'Link no existente']

# drop=True discards the old index directly instead of first turning it into
# an 'index' column that then has to be dropped.
df1 = df1.reset_index(drop=True)

'''We open our CSV, containing the full name and profile URL of each student, and we remove all the
rows that don't contain the student's URL, because our Selenium code couldn't scrape it'''

"We open our CSV, containing the full name and URL of the profile of our students, and we remove all those\nrows that don't contain the URL of the student, because our Selenium code couldn't scrap it"

In [515]:
# Run the scraper on every URL of the 'Link perfil' column and store each
# result in the new 'Datos escrapeados' column.
profile_links = df1["Link perfil"]
df1["Datos escrapeados"] = profile_links.apply(scrape_linkedin_profile2)


In [517]:
# Payload that is stored when the person's profile does not exist.
valor_a_buscar = {
    'code': 404,
    'description': 'Person profile does not exist',
    'name': 'Not Found',
}

# Students whose scraped data is exactly that payload could not be scraped ...
filas_filtradas = df1[df1['Datos escrapeados'] == valor_a_buscar]
indices_a_eliminar = filas_filtradas.index

# ... so their rows are removed from the dataframe.
df1 = df1.drop(index=indices_a_eliminar)


In [518]:
# Renumber the rows from 0. drop=True discards the old index directly, so no
# temporary 'index' column has to be created and removed.
df1 = df1.reset_index(drop=True)

In [519]:
# Flatten the scraped dictionaries into a table with one column per key
# found in 'Datos escrapeados'.
scraped_profiles = df1['Datos escrapeados']
normalized_data = pd.json_normalize(scraped_profiles)


In [520]:
# Attach the flattened profile columns to df1, side by side.
df1 = pd.concat(objs=[df1, normalized_data], axis='columns')


In [522]:
# Columns we do not keep: personal details from the CSV, the raw scraped
# payload and the profile fields other than the work experiences.
columnas_a_eliminar = [
    'Nombre',
    'Primer apellido',
    'Segundo apellido',
    'Email universitario',
    'Email personal',
    'Datos escrapeados',
    'public_identifier',
    'profile_pic_url',
    'first_name',
    'last_name',
    'full_name',
    'headline',
    'country',
    'languages',
    'education',
    'occupation',
    'connections',
    'country_full_name',
    'follower_count',
    'summary',
    'state',
    'accomplishment_honors_awards',
    'accomplishment_courses',
    'accomplishment_projects',
    'groups',
    'accomplishment_publications',
    'certifications',
    'accomplishment_organisations',
    'accomplishment_test_scores',
]

# The scraper strips empty fields from every profile, so a field that no
# student in this file filled in never becomes a column. errors='ignore'
# keeps the clean-up from failing with KeyError in that case.
df1 = df1.drop(columns=columnas_a_eliminar, errors='ignore')


In [524]:
# One row per experience: the list stored in 'experiences' is exploded so
# that every student gets as many rows as experiences she/he has.
df1 = df1.explode(column='experiences')

In [525]:
# explode() repeats the index label of a student on each of her/his rows, so
# the rows are renumbered from 0. drop=True discards the old index directly
# instead of creating an 'index' column that must be dropped afterwards.
df1 = df1.reset_index(drop=True)

In [527]:
# Flatten every experience dictionary into its own set of columns ...
experience_records = df1['experiences']
normalized_data1 = pd.json_normalize(experience_records)

# ... and attach those columns to df1, side by side.
df1 = pd.concat(objs=[df1, normalized_data1], axis='columns')


In [529]:
# Experience columns we do not keep, plus the already flattened
# 'experiences' column itself.
cols_a_eliminar = [
    'description',
    'logo_url',
    'ends_at',
    'starts_at',
    'company_linkedin_profile_url',
    'experiences',
]

# NOTE(review): which of these columns exist presumably depends on the
# scraped data (e.g. a plain 'starts_at' column next to 'starts_at.year');
# errors='ignore' avoids a KeyError when one of them is absent - confirm.
df1 = df1.drop(columns=cols_a_eliminar, errors='ignore')

In [553]:
# Continue on an independent copy so that df1 is left as it is.
df2 = df1.copy(deep=True)


In [554]:
# A null in any of the end-date columns means that the person is still
# working there, so it is replaced by the marker 'actualidad'.
#
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is chained assignment, which pandas 2.x deprecates and
# which does not modify df2 when Copy-on-Write is enabled.
for end_date_part in ('ends_at.day', 'ends_at.month', 'ends_at.year'):
    df2[end_date_part] = df2[end_date_part].fillna('actualidad')

In [555]:
# Discard every experience in which any part of the start date is null.
start_date_parts = ['starts_at.day', 'starts_at.month', 'starts_at.year']
df2 = df2.dropna(subset=start_date_parts)

In [557]:

# Start date of every experience with month precision (the day is fixed to
# the 1st), built from the 'starts_at.year' and 'starts_at.month' columns.
df2['Fecha inicio'] = [
    pd.to_datetime(f"{int(year)}-{int(month)}-01")
    for year, month in zip(df2['starts_at.year'], df2['starts_at.month'])
]

# End date built the same way; experiences marked with 'actualidad' (still
# ongoing) keep that marker instead of a date.
df2['Fecha fin'] = [
    'actualidad'
    if year == 'actualidad'
    else pd.to_datetime(f"{int(year)}-{int(month)}-01")
    for year, month in zip(df2['ends_at.year'], df2['ends_at.month'])
]

# Duration of each experience in whole calendar months. Ongoing experiences
# are measured up to the current month. Counting calendar months instead of
# `days // 30` avoids undercounting (e.g. 1 Feb -> 1 Mar is 28 days, which
# `// 30` turned into 0 months).
now = datetime.datetime.now()
duration_in_months = []
for start, end in zip(df2['Fecha inicio'], df2['Fecha fin']):
    finish = now if isinstance(end, str) else end
    duration_in_months.append(
        (finish.year - start.year) * 12 + (finish.month - start.month)
    )
df2['Duración (meses)'] = duration_in_months

# The separate date parts and 'location' are no longer needed.
df2 = df2.drop(
    columns=[
        'location',
        'starts_at.year',
        'starts_at.day',
        'starts_at.month',
        'ends_at.day',
        'ends_at.month',
        'ends_at.year',
    ]
)


In [562]:
# Give some columns Spanish names.
spanish_names = {'company': 'Empresa', 'title': 'Puesto', 'city': 'Ciudad'}
df2 = df2.rename(columns=spanish_names)

In [559]:
# Renumber the rows from 0. drop=True discards the old index directly, so no
# temporary 'index' column has to be created and removed.
df2 = df2.reset_index(drop=True)

In [565]:
# Final column order, chosen to make the table easier to read.
new_order = [
    'Grado',
    'Nombre completo',
    'Link perfil',
    'Puesto',
    'Empresa',
    'Ciudad',
    'Fecha inicio',
    'Fecha fin',
    'Duración (meses)',
]

df2 = df2.loc[:, new_order]


In [568]:
# Save the cleaned experiences of the 2015-2016 class, without the row index.
df2.to_csv(
    path_or_buf='/Users/davidfernandez/Desktop/clean/experience/2015-2016.csv',
    index=False,
)


## Second dataframe: 2016-2017 class