In [6]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import torch

import warnings
warnings.filterwarnings("ignore")

# Load the pretrained 'bert-base-chinese' tokenizer and BertModel once at
# module level. Both names are read as globals by get_bert_embedding, so this
# cell has to run before any embedding is computed.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def clean_text(text):
    """Strip URLs, symbols/emojis and redundant whitespace from a string.

    The steps run in this order:
      1. Remove every token that starts with ``http`` or with a literal
         ``www.`` (up to the next whitespace character).
      2. Drop every character that is not an ASCII letter, an ASCII digit,
         a character in the range U+4E00-U+9FFF, or whitespace.
      3. Collapse each whitespace run into one space and trim both ends.

    Args:
        text (str): Raw text to clean.

    Returns:
        str: The cleaned text; may be the empty string.
    """
    # The dot after "www" is escaped on purpose: an unescaped "." matches any
    # character, which would delete ordinary tokens such as "wwwabc".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove special characters and emojis
    text = re.sub(r'[^a-zA-Z0-9\u4e00-\u9fff\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [3]:
def get_bert_embedding(text):
    """Return a mean-pooled BERT vector for ``text`` as a NumPy array.

    The text is tokenized with the module-level ``tokenizer`` (truncated to
    at most 128 tokens), passed through the module-level ``model`` with
    gradient tracking disabled, and the model's ``last_hidden_state`` is
    averaged over dimension 1 (presumably the token axis - confirm against
    the BertModel output documentation). Size-1 dimensions are squeezed out
    before the tensor is converted to NumPy.

    Args:
        text: Input accepted by ``tokenizer``; the caller in this file passes
            a single string.

    Returns:
        numpy.ndarray: The pooled hidden state.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: no autograd graph is needed for feature extraction.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

In [4]:
def pipeline(data):
    """Clean the ``text`` column of ``data`` and attach one embedding per row.

    Two columns are written onto the DataFrame that is passed in (the frame
    is modified in place and the same object is returned):

      * ``cleaned_text``   - ``clean_text`` applied to each ``text`` value.
      * ``bert_embedding`` - ``get_bert_embedding`` applied to each cleaned
        string.

    Args:
        data (pandas.DataFrame): Frame containing a ``text`` column.

    Returns:
        pandas.DataFrame: ``data`` with the two extra columns.

    Raises:
        KeyError: If ``data`` has no ``text`` column.
    """
    # Replace missing values with '' before the string cast: astype(str) on
    # its own turns NaN into the literal word "nan", which would then be
    # cleaned and embedded as if it were a real comment.
    data['cleaned_text'] = data['text'].fillna('').astype(str).apply(clean_text)

    # Embed every cleaned string (one model forward pass per row).
    data['bert_embedding'] = data['cleaned_text'].apply(get_bert_embedding)
    return data

In [1]:
# Run the cleaning + embedding pipeline over every CSV file in the input folder
from pathlib import Path

# Join the path components instead of writing 'facebook_data\new': inside a
# normal string literal "\n" is a newline character, so that path points at a
# folder that does not exist and the glob below silently matches nothing.
input_path = Path('facebook_data') / 'new'
output_path = Path('facebook_data') / 'label_comment'

# Make sure the destination folder exists before the first to_csv call.
output_path.mkdir(parents=True, exist_ok=True)

# Process the CSV files in a stable (sorted) order; other directory entries
# are skipped because pd.read_csv cannot parse them.
# NOTE(review): the bert_embedding column holds NumPy arrays, which to_csv
# writes as their text representation - confirm downstream code parses that.
for file_path in sorted(input_path.glob('*.csv')):
    print(f'Processing {file_path.name}')
    data = pd.read_csv(file_path)
    data = pipeline(data)
    # Save under the same file name in the output folder.
    data.to_csv(output_path / file_path.name, index=False)

In [7]:
# One-off run for a single export; the result is saved under a different file
# name than the input. Forward slashes are used as separators because they
# resolve on every OS, whereas backslash separators only work on Windows.
data = pd.read_csv("facebook_data/new/吳沛憶.csv")
data = pipeline(data)
data.to_csv('facebook_data/label_comment/吳沛憶留言.csv', index=False)