# Extracting data from the data engineering networking WhatsApp group

In [17]:
import pandas as pd
import re
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter

# Reading the exported chat .txt file and parsing it into a DataFrame

In [18]:
# Location of the exported WhatsApp chat (plain-text export).
path = "D:/engenharia de dados/files/whats/networking.txt"

# readlines() keeps the trailing "\n" of every line; it is stripped below
# before a continuation line is appended to a message.
with open(path, "r", encoding="utf-8") as f:
    content = f.readlines()

# A line that starts a new message has the shape
#   "<n>/<n>/<yyyy> <HH:MM> - <sender>: <text>"
# The "<sender>: " part is optional, so lines without it still match.
# The date is matched but not captured: only time, sender and text are kept.
default = r"^(?:\d{1,2}\/\d{1,2}\/\d{4}) (\d{2}:\d{2}) - (?:(.*?): )?(.*)$"
message_start = re.compile(default)

data = []
current_message = None  # the row most recently appended to `data`, if any

for line in content:
    match = message_start.match(line)
    if match:
        hour, sender, message = match.groups()
        # Lines with no "<sender>: " prefix are attributed to "SISTEMA".
        current_message = [hour, sender if sender else "SISTEMA", message]
        data.append(current_message)
    elif current_message:
        # Continuation of a multi-line message. The captured first line never
        # includes its line terminator, so strip it here as well; otherwise
        # every continuation would carry an extra "\n" (blank lines between
        # lines and a trailing newline at the end of the message).
        current_message[2] += "\n" + line.rstrip("\n")
    # Lines before the first recognised message are ignored.

df = pd.DataFrame(data, columns=["Hora", "Remetente", "Mensagem"])

print(df.head(10))


    Hora Remetente                                           Mensagem
0  10:09   SISTEMA  As mensagens e ligações são protegidas com a c...
1  19:08   SISTEMA  ‎~ Fábio criou o grupo "Networking Jornada de ...
2  19:08   SISTEMA                             Você foi adicionado(a)
3  19:08   SISTEMA  ‎~ Fábio criou o grupo "Networking Jornada de ...
4  10:09   SISTEMA  ‎+55 61 9404-6249 entrou usando o link de conv...
5  10:15   SISTEMA  ‎+55 51 9387-1111 entrou usando o link de conv...
6  10:22   SISTEMA  ‎+55 79 9670-7393 entrou usando o link de conv...
7  10:22   SISTEMA  ‎+55 61 8265-8022 entrou usando o link de conv...
8  10:24   SISTEMA  ‎+55 62 9932-6482 entrou usando o link de conv...
9  10:26   SISTEMA  ‎+55 42 8824-1767 entrou usando o link de conv...


# Keeping only the messages that contain a LinkedIn link and extracting that link from each one

In [19]:
# Keep only the rows whose message mentions linkedin.com (case-insensitive).
# .copy() so the column assignments below work on an independent frame.
df_linkedin = df[df["Mensagem"].str.contains(r"linkedin\.com", case=False, na=False)].copy()

# Replace each message with the first LinkedIn URL found in it.
# The extraction is case-insensitive to agree with the filter above; with a
# case-sensitive pattern a message such as "Linkedin.com/in/..." would pass
# the filter and then produce a missing link.
df_linkedin["Mensagem"] = df_linkedin["Mensagem"].str.extract(
    r"((?:https?://)?(?:www\.)?linkedin\.com[^\s]+)",
    flags=re.IGNORECASE,
    expand=False
)


def _with_https_scheme(link):
    """Return `link` with a lowercase scheme, adding "https://" when it has none.

    Missing values (no URL could be extracted) are returned unchanged.
    """
    if pd.isnull(link):
        return link
    scheme = re.match(r"https?://", link, flags=re.IGNORECASE)
    if scheme:
        # Normalise e.g. "Https://" so the result never gets a second scheme.
        return scheme.group(0).lower() + link[scheme.end():]
    return "https://" + link


df_linkedin["Mensagem"] = df_linkedin["Mensagem"].apply(_with_https_scheme)

# Reassign instead of inplace=True: same result, no in-place mutation.
df_linkedin = df_linkedin.rename(columns={"Mensagem": "Linkedin"})

print(df_linkedin[["Hora", "Remetente", "Linkedin"]])


      Hora          Remetente  \
168  12:00   +55 71 9308-2437   
169  12:00   +55 93 9138-9543   
170  12:00   +55 31 9219-4008   
176  12:01   +55 81 9998-2162   
178  12:01  +55 11 97020-9285   
..     ...                ...   
611  14:04   +55 85 8808-3993   
613  14:04   +55 48 9925-8353   
614  14:05   +55 82 9933-5023   
616  14:06   +55 88 9704-3295   
617  14:06  +55 27 99938-2967   

                                              Linkedin  
168        https://www.linkedin.com/in/iasmim-horrana/  
169            https://www.linkedin.com/in/max-mitsuya  
170  https://www.linkedin.com/in/jo%C3%A3o-victor-1...  
176  https://www.linkedin.com/in/maria-eduarda-nasc...  
178  https://www.linkedin.com/in/ygor-amaro-114613231/  
..                                                 ...  
611  https://www.linkedin.com/in/israel-chaves-a321...  
613  https://www.linkedin.com/in/valter-perez-50283...  
614  https://www.linkedin.com/in/lethicia-lima-0904...  
616   https://www.linkedin.com/in

# Generating the Excel file, adjusting the column widths, and turning the LinkedIn column into a hyperlink

In [20]:
# Destination workbook. Written with forward slashes, like the input path of
# the parsing cell: the previous raw string also doubled its backslashes, so
# the path contained literal "\\" pairs.
path_excel = "D:/engenharia de dados/files/whats/networking.xlsx"

# Write the data first, then reopen the file with openpyxl to format it.
df_linkedin.to_excel(path_excel, index=False)

wb = load_workbook(path_excel)
ws = wb.active

# Locate the link column from the header row rather than assuming a fixed
# position; fall back to "C" (its position in a Hora/Remetente/Linkedin frame).
link_col = "C"
for header_cell in ws[1]:
    if header_cell.value == "Linkedin":
        link_col = get_column_letter(header_cell.column)
        break

# Turn every non-empty link cell (row 1 is the header) into a hyperlink whose
# visible text is "Linkedin".
for row in range(2, ws.max_row + 1):
    cell = ws[f"{link_col}{row}"]
    url = cell.value
    if url:
        cell.hyperlink = url
        cell.value = "Linkedin"
        cell.style = "Hyperlink"

# Size each column to its longest displayed value plus padding, with a
# minimum width of 10. This runs after the loop above, so the link column is
# measured by its "Linkedin" label, not by the URL.
for col_cells in ws.columns:
    col_letter = get_column_letter(col_cells[0].column)
    max_len = max(
        (len(str(c.value)) for c in col_cells if c.value is not None),
        default=0,
    )
    ws.column_dimensions[col_letter].width = max(10, max_len + 2)

wb.save(path_excel)
