# Read and process LinkedIn data


In [1]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

<IPython.core.display.Javascript object>

## Import libraries


In [2]:
import personal_linkedin_eda.utils.paths as path
import personal_linkedin_eda.utils.preprocess as prep
import pandas as pd
import numpy as np
import nltk

# Make sure the NLTK "stopwords" corpus is available locally; when it is already
# present the call only reports that the package is up to date.
# NOTE(review): presumably needed by prep.remove_stopwords — confirm in that module.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cvillafraz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<IPython.core.display.Javascript object>

## Load data and delete unnecessary columns


In [3]:
# Read the raw CSV exports; every file path is produced by path.data_raw_dir.
# These three files are read with pandas' default header handling.
df_certifications, df_messages, df_queries = (
    pd.read_csv(path.data_raw_dir(csv_name))
    for csv_name in ("Certifications.csv", "messages.csv", "SearchQueries.csv")
)
# Connections.csv is read with header=2: row index 2 (0-based) supplies the
# column names and the rows before it are not loaded as data.
df_connections = pd.read_csv(path.data_raw_dir("Connections.csv"), header=2)

<IPython.core.display.Javascript object>

In [4]:
# Show the column index of every dataset, separated by one blank line.
print(
    "\n\n".join(
        f"{label} columns: {frame.columns}"
        for label, frame in (
            ("Certifications", df_certifications),
            ("Connections", df_connections),
            ("Messages", df_messages),
            ("Queries", df_queries),
        )
    )
)

Certifications columns: Index(['Name', 'Url', 'Authority', 'Started On', 'Finished On',
       'License Number'],
      dtype='object')

Connections columns: Index(['First Name', 'Last Name', 'Email Address', 'Company', 'Position',
       'Connected On'],
      dtype='object')

Messages columns: Index(['CONVERSATION ID', 'CONVERSATION TITLE', 'FROM', 'SENDER PROFILE URL',
       'TO', 'DATE', 'SUBJECT', 'CONTENT', 'FOLDER'],
      dtype='object')

Queries columns: Index(['Time', 'Search Query'], dtype='object')


<IPython.core.display.Javascript object>

In [4]:
# Remove the columns that the rest of the notebook does not use.
#
# The frames are reassigned instead of mutated with inplace=True, and
# errors="ignore" is passed, so the cell can be executed more than once:
# a second run no longer raises KeyError for columns that are already gone.
df_certifications = df_certifications.drop(
    columns=["Url", "License Number"], errors="ignore"
)
df_connections = df_connections.drop(
    columns=["First Name", "Last Name", "Email Address"], errors="ignore"
)
df_messages = df_messages.drop(
    columns=["SENDER PROFILE URL", "FOLDER", "CONVERSATION ID"], errors="ignore"
)

<IPython.core.display.Javascript object>

## Check the structure of the datasets


In [17]:
# Preview the first rows of the certifications table after the column drop.
df_certifications.head()

Unnamed: 0,Name,Authority,Started On,Finished On
0,EF SET English Certificate Plus 73/100 (C2 Pro...,EF Standard English Test (EF SET),Nov 2017,
1,Digital Accessibility: Enabling Participation ...,University of Southampton,Feb 2018,
2,Digital Skills:Digital Marketing,Accenture,Feb 2018,
3,Google Analytics Individual Qualification,Google,May 2018,May 2019
4,Digital Skills: User Experience,Accenture,Sep 2018,


In [18]:
# Preview the first rows of the connections table (company, position, connection date).
df_connections.head()

Unnamed: 0,Company,Position,Connected On
0,ACALConecta,Desarrollo en Ciencia de datos,29 Jul 2021
1,Vest,Financial Content Creator,15 Jul 2021
2,Servicios Alo 24.,Analista de seguros,26 Jun 2021
3,Instituto de la Juventud de la Ciudad de México,Brigadista en Memorias de Tenochtitlan,26 Jun 2021
4,KD Importaciones,Responsable de operaciones,26 Jun 2021


In [15]:
# Preview the first rows of the messages table.
# NOTE(review): the rendered output contains real sender/recipient names and
# message text — clear or redact this cell's output before sharing the notebook.
df_messages.head()

Unnamed: 0,CONVERSATION TITLE,FROM,TO,DATE,SUBJECT,CONTENT
0,Entrevista en EDteam,Thiara Vásquez,LinkedIn Member,2021-11-23 17:58:49 UTC,Entrevista en EDteam,"Hola Ciro, Ya hemos seleccionado a un candidat..."
1,Entrevista en EDteam,LinkedIn Member,Thiara Vásquez,2021-11-16 18:19:22 UTC,Entrevista en EDteam,"Hola Thiara, hay alguna actualización sobre el..."
2,Entrevista en EDteam,LinkedIn Member,Thiara Vásquez,2021-11-08 18:05:56 UTC,Entrevista en EDteam,Que así sea
3,Entrevista en EDteam,Thiara Vásquez,LinkedIn Member,2021-11-08 18:05:02 UTC,,Ahora lo agendo. Nos vemos mañana 💪🏽😁
4,Entrevista en EDteam,LinkedIn Member,Thiara Vásquez,2021-11-08 17:45:32 UTC,Entrevista en EDteam,Esta bien


In [9]:
# Preview the first rows of the search-query table (timestamp and query text).
df_queries.head()

Unnamed: 0,Time,Search Query
0,2020/05/18 02:53:37 UTC,The Soft Dev Team
1,2021/10/26 22:59:31 UTC,data analyst
2,2020/11/28 02:46:50 UTC,test automation
3,2021/10/22 22:36:02 UTC,data scientist
4,2020/11/28 02:47:02 UTC,test automation


## Remove unnecessary/null values


In [6]:
# Report, for every dataset, how many missing values each column holds.
# Each report is followed by a blank line, including the last one.
print(
    "\n\n".join(
        f"{label} null values:\n {frame.isna().sum()}"
        for label, frame in (
            ("Certifications", df_certifications),
            ("Connections", df_connections),
            ("Messages", df_messages),
            ("Queries", df_queries),
        )
    ),
    end="\n\n",
)

Certifications null values:
 Name            0
Authority       0
Started On      0
Finished On    27
dtype: int64

Connections null values:
 Company         4
Position        4
Connected On    0
dtype: int64

Messages null values:
 CONVERSATION TITLE    184
FROM                    3
TO                      3
DATE                    0
SUBJECT               184
CONTENT                 3
dtype: int64

Queries null values:
 Time            0
Search Query    0
dtype: int64



<IPython.core.display.Javascript object>

In [5]:
# Connections: discard every row that has a missing value in any column.
df_connections = df_connections.dropna()
# Messages: only rows without CONTENT are discarded; other columns may stay null.
df_messages = df_messages.dropna(subset=["CONTENT"])

# Remove promo messages from LinkedIn itself.
# - na=False: FROM can be null; without it str.contains returns NaN for those
#   rows and the mask can no longer be negated or used for boolean indexing.
#   Rows with a missing sender are therefore kept (treated as "not promotional").
# - .copy(): makes the filtered frame independent of the original, so later
#   column assignments on df_messages do not raise SettingWithCopyWarning.
df_messages = df_messages[
    ~df_messages["FROM"].str.contains(
        "from linkedin|linkedin premium", case=False, na=False
    )
].copy()

<IPython.core.display.Javascript object>

## Text preprocessing


### Normalize common position names


In [6]:
# Build (boolean mask, normalized label) pairs for the most common job titles.
#
# Each entry of the inner table is a case-insensitive regular expression and the
# label assigned to positions that match it. The order matters to the consumer
# (np.select takes the first matching mask), so e.g. "Founder & CEO" ends up as
# "CEO" because the CEO rule is listed before the Founder rule.
#
# Pattern notes:
# - "[- ]?" accepts the spaced, hyphenated and joined spellings alike
#   ("Full Stack" / "Full-Stack" / "Fullstack", "Back End" / "Back-end" / "Backend").
# - "\bCEO\b" only matches CEO as a whole word; a bare, case-insensitive "CEO"
#   also matched inside unrelated words such as "liceo".
# - na=False makes a missing Position count as "no match" instead of producing
#   NaN in the mask.
position_patterns = [
    (
        df_connections["Position"].str.contains(regex, case=False, na=False),
        label,
    )
    for regex, label in (
        (r"Full[- ]?Stack|Web Developer", "Full Stack Developer"),
        (r"Front[- ]?end", "Front-end Developer"),
        (r"Back[- ]?end|Software", "Software Engineer"),
        (r"Ciencia de datos|Data Scientist", "Data Scientist"),
        (r"\bCEO\b|Chief Executive Officer|Business Owner", "CEO"),
        (r"Founder", "Founder"),
    )
]

<IPython.core.display.Javascript object>

In [7]:
# Split the (mask, label) pairs into the two parallel sequences np.select takes.
position_criteria, position_values = zip(*position_patterns)

# For every row np.select picks the label of the first mask that is True and
# falls back to None when no mask matches; combine_first then fills those None
# entries with the original "Position" value.
df_connections["Position Normalized"] = pd.Series(
    np.select(
        condlist=position_criteria,
        choicelist=position_values,
        default=None,
    ),
    index=df_connections.index,
).combine_first(df_connections["Position"])

<IPython.core.display.Javascript object>

### Remove stopwords and normalize messages and search queries


In [13]:
# Short aliases for the project's text helpers.
remove_stopwords = prep.remove_stopwords
normalize_text = prep.normalize_text

# Message text: normalize first, then strip stopwords.
# assign() returns a new frame rather than writing columns into df_messages.
# df_messages comes from a boolean-mask selection earlier in the notebook, and
# writing columns into such a selection can trigger SettingWithCopyWarning.
df_messages = df_messages.assign(
    CONTENT=df_messages["CONTENT"].apply(
        lambda text: remove_stopwords(normalize_text(text))
    ),
    # SUBJECT may be missing; non-string values are replaced with None.
    SUBJECT=df_messages["SUBJECT"].apply(
        lambda text: remove_stopwords(normalize_text(text))
        if isinstance(text, str)
        else None
    ),
)

# Search queries are only normalized; stopwords are kept.
df_queries["Search Query"] = df_queries["Search Query"].apply(normalize_text)

<IPython.core.display.Javascript object>

## Save data


In [14]:
# Write every cleaned dataset to the location returned by path.data_processed_dir.
# to_csv is called with its defaults, so the DataFrame index is written as the
# first (unnamed) column of each file.
for csv_name, frame in (
    ("certifications_clean.csv", df_certifications),
    ("connections_clean.csv", df_connections),
    ("messages_clean.csv", df_messages),
    ("queries_clean.csv", df_queries),
):
    frame.to_csv(path.data_processed_dir(csv_name))

<IPython.core.display.Javascript object>