In [4]:
from nltk import sent_tokenize
import pandas as pd
import os
import re
import csv

In [5]:
# Input: raw, unlabeled news sentences (expects a 'sentence' column).
raw_path = "./../../data/news_unlabeled.csv"
# Output: the cleaned sentences written as a single CSV.
clean_path = "./../../data/clean/cleaned_news_unlabeled.csv"
# Output: directory that receives one CSV per labeller.
splitted_path = "./../../data/clean/splitted/"

# Read through raw_path so the input location is configured in exactly one place.
uncleaned_data = pd.read_csv(raw_path)

In [6]:
def preprocessing(uncleaned_data: pd.DataFrame) -> pd.Series:
    """
    Clean the raw news text held in the 'sentence' column.

    The following steps are applied, in order, to every row:
        1. Drop every character that is not a word character, whitespace,
           or one of the symbols + / . - :
        2. Remove the "kompas.com" news lead (case-insensitive), including a
           directly preceding "Jakarta". The comma in "Jakarta, kompas.com"
           is already gone after step 1.
        3. Remove URLs (starting with http:, https: or www.), together with a
           preceding "sumber" when one is present.
        4. Strip leading whitespace, hyphens and slashes.

    Step 4 runs last on purpose: removing a lead phrase or a URL at the very
    start of a row exposes new leading whitespace or separators, and these
    must be stripped as well.

    Args:
        uncleaned_data (pandas.DataFrame): Frame with the raw text in a
            'sentence' column. It is not modified.

    Returns:
        pandas.Series: The cleaned 'sentence' values, on the same index as
        the input.

    Raises:
        KeyError: If `uncleaned_data` has no 'sentence' column.
    """
    # Every .str.replace below returns a new Series, so the caller's frame is
    # never mutated and no defensive copy of the whole DataFrame is needed.
    sentences = uncleaned_data['sentence']

    # Keep only words, whitespace, and the symbols + / . - :
    sentences = sentences.str.replace(r'[^\w\s\+\/\.\-\:]', '', regex=True)
    # Remove the news lead phrase
    sentences = sentences.str.replace(r'(?i)\b(jakarta\s+kompas\.com|kompas\.com\s*-|kompas\.com\s+|\bkompas\.com\b)\b', '', regex=True)
    # Remove "sumber <url>" mentions and standalone URLs
    sentences = sentences.str.replace(r'\bsumber\s+(https?:[^\s]+|www\.[^\s]+)|https?:[^\s]+|www\.[^\s]+', '', regex=True)
    # Strip leading whitespace, slashes, or hyphens left over by the removals above
    sentences = sentences.str.replace(r'^[\s\-/]+', '', regex=True)

    return sentences

In [7]:
# Feed the raw frame through the cleaning function; the result is the cleaned 'sentence' Series.
data_cleaned = uncleaned_data.pipe(preprocessing)

In [117]:
# Treat missing values as empty strings, then keep only rows that still
# contain something other than whitespace.
data_cleaned = (
    data_cleaned
    .fillna('')
    .loc[lambda sentences: sentences.str.strip().ne('')]
)

In [118]:
# Make sure the destination folder exists before writing.
clean_dir = os.path.dirname(clean_path)
os.makedirs(clean_dir, exist_ok=True)

# Write the cleaned sentences without the index column.
data_cleaned.to_csv(path_or_buf=clean_path, index=False)
print(f"File berhasil dibersihkan dan disimpan di: {clean_path}")

File berhasil dibersihkan dan disimpan di: ./../../data/clean/cleaned_news_unlabeled.csv


## Split CSV for labelling

In [119]:
names = ['kenzie', 'naufal', 'hafiz', 'satrio']
# Derive the share size from the number of labellers rather than a hard-coded 4,
# so editing `names` keeps the split consistent.
split_size = len(data_cleaned) // len(names)

# The output directory is the same for every labeller: resolve and create it once.
split_dir = os.path.dirname(splitted_path)
os.makedirs(split_dir, exist_ok=True)

for i, name in enumerate(names):
    start_idx = i * split_size
    # The last labeller also receives the remainder rows left by the integer division.
    end_idx = len(data_cleaned) if i == len(names) - 1 else (i + 1) * split_size

    split_data = data_cleaned.iloc[start_idx:end_idx]
    split_file_path = os.path.join(split_dir, f"unlabeled_{name}.csv")
    split_data.to_csv(split_file_path, index=False)
    print(f"File untuk {name} berhasil disimpan di: {split_file_path}")

File untuk kenzie berhasil disimpan di: ./../../data/clean/splitted\unlabeled_kenzie.csv
File untuk naufal berhasil disimpan di: ./../../data/clean/splitted\unlabeled_naufal.csv
File untuk hafiz berhasil disimpan di: ./../../data/clean/splitted\unlabeled_hafiz.csv
File untuk satrio berhasil disimpan di: ./../../data/clean/splitted\unlabeled_satrio.csv
