In [140]:
import pandas as pd
import re
import os

In [147]:
# Locations of the raw input and of the cleaned / per-labeller outputs,
# all relative to the notebook's working directory.
raw_path = "./../../data/news_unlabeled.csv"
clean_path = "./../../data/clean/cleaned_news_unlabeled.txt"
splitted_path = "./../../data/clean/splitted/"

# Load the raw news through raw_path so the constant and the file actually read stay in sync.
uncleaned_news = pd.read_csv(raw_path)

In [156]:
def format_capital(text):
    """
    Convert an all-caps word at the start of a line into sentence case.

    Only runs of five or more ASCII capital letters that begin a line and are
    followed by a whitespace character are rewritten; shorter runs (typically
    abbreviations) are left untouched.

    Args : Text that you want to convert (str)

    Return : Text with the leading upper-case word in sentence case (str)
    """
    def _to_sentence_case(found):
        # str.capitalize() upper-cases the first letter and lower-cases the rest.
        return found.group(0).capitalize()

    # MULTILINE makes '^' match at the start of every line, not only the first one.
    leading_caps = re.compile(r"^[A-Z]{5,}\s", flags=re.MULTILINE)
    return leading_caps.sub(_to_sentence_case, text)


def split_sentences(text):
    """
    Split a record that contains more than one sentence.

    The text is cut at every run of whitespace that directly follows a
    full stop; the full stop stays with the sentence it ends.

    Args : Text that you want to split (str)

    Returns : The individual sentences (list of str)
    """
    # The look-behind keeps the '.' on the left piece; only the whitespace is consumed.
    sentence_boundary = re.compile(r'(?<=\.)\s+')
    return sentence_boundary.split(text)

def preprocessing(data):
    """
    Clean the raw news text in the 'sentence' column and emit one sentence per row.

    The steps are applied in this order:
        1. Replace the literal text "Dok." with "Dokumen".
        2. Replace the literal text "Plt." with "Plt".
        3. Remove the literal text "(year on year/yoy)".
        4. Replace any remaining "yoy" with "secara tahunan".
        5. Remove "kompas.com" source mentions (case-insensitive), optionally
           preceded by "jakarta".
        6. Remove a leading dateline: one or two words followed by a comma and
           a hyphen, or a single word followed by an en dash.
        7. Remove leading numbering such as "1. " and any leading dots.
        8. Strip leading whitespace, hyphens, slashes and en dashes.
        9. Replace the word "persen" with "%" when it follows a digit.
        10. Remove URLs (starting with "http:", "https:" or "www."), together
            with a preceding word "sumber" when present.
        11. Convert a leading all-caps word into sentence case (format_capital).
        12. Split each record into sentences (split_sentences) and place every
            sentence on its own row.

    Missing values in 'sentence' are passed through unchanged instead of
    raising, so they end up as NaN rows in the result.

    Args:
        data (pandas.DataFrame): A DataFrame containing raw text data in the
            'sentence' column. The input frame is not modified.

    Returns:
        data_cleaned (pandas.DataFrame): A copy of ``data`` with the cleaned
            text, one sentence per row (other columns are repeated for each
            sentence) and a fresh 0..n-1 index.
    """
    data_cleaned = data.copy()
    sentences = data_cleaned['sentence']

    # Fixed-text substitutions. regex=False is passed explicitly because the
    # default of Series.str.replace differs between pandas versions; read as a
    # regex, the '.' would match any character and the parentheses would become
    # a group. The order matters: "(year on year/yoy)" has to be removed before
    # the bare "yoy" is rewritten.
    literal_replacements = (
        ('Dok.', 'Dokumen'),
        ('Plt.', 'Plt'),
        ('(year on year/yoy)', ''),
        ('yoy', 'secara tahunan'),
    )
    for old_text, new_text in literal_replacements:
        sentences = sentences.str.replace(old_text, new_text, regex=False)

    # Remove news lead phrases that mention the source (optionally with the city)
    sentences = sentences.str.replace(r'(?i)\b(jakarta\s+kompas\.com|kompas\.com\s*-|kompas\.com\s+|\bkompas\.com\b)\b', '', regex=True)
    # Remove news lead phrases that contain only the city
    sentences = sentences.str.replace(r'^\w+\s\w+\,\s+\-|^\w+\,\s+\-|^[A-Za-z]+,?\s+–\s+', '', regex=True)
    # Remove numbering at the start of the record, then any dots left in front
    sentences = sentences.str.replace(r'^\d+\.\s*', '', regex=True)
    sentences = sentences.str.lstrip('.')
    # Remove unwanted leading characters like whitespace, slashes, or hyphens
    sentences = sentences.str.replace(r'^[\s\-/\–]+', '', regex=True)
    # Change the word "persen" into the symbol "%" (only directly after a digit)
    sentences = sentences.str.replace(r'(?<=\d)\s+(persen)', '%', regex=True)
    # Remove any sources or URLs following the word "sumber" or standalone URLs
    sentences = sentences.str.replace(r'\bsumber\s+(https?:[^\s]+|www\.[^\s]+)|https?:[^\s]+|www\.[^\s]+', '', regex=True)

    # The two helpers only accept str; anything else (e.g. NaN for a missing
    # sentence) is passed through untouched instead of raising TypeError.
    sentences = sentences.apply(
        lambda value: format_capital(value) if isinstance(value, str) else value
    )
    data_cleaned['sentence'] = sentences.apply(
        lambda value: split_sentences(value) if isinstance(value, str) else value
    )

    # explode() places every element of the per-record sentence list on its own row.
    data_cleaned = data_cleaned.explode('sentence').reset_index(drop=True)

    return data_cleaned

In [157]:
# Run the cleaning pipeline on the raw frame; the result holds one sentence per row.
cleaned_news = preprocessing(uncleaned_news)

In [158]:
# Spot-check one cleaned sentence by position.
# NOTE(review): position 1872 is hard-coded; .iloc raises IndexError if the frame has fewer rows.
cleaned_news['sentence'].iloc[1872]

'Pemerintah memperkirakan potensi perputaran uang dari transaksi judi daring (online) bisa mencapai Rp 700 triliun jika langkah intervensi tidak dilakukan.'

In [159]:
# Drop rows whose sentence is missing or contains only whitespace.
# The mask must filter the DataFrame rows: assigning a filtered Series back to
# the column re-aligns on the index and simply turns the blank rows into NaN
# without removing them.
has_text = cleaned_news['sentence'].fillna('').str.strip() != ''
cleaned_news = cleaned_news.loc[has_text].reset_index(drop=True)

In [160]:
# Make sure the output folder exists before writing into it.
clean_dir = os.path.dirname(clean_path)
os.makedirs(clean_dir, exist_ok=True)

# Persist the whole frame, with the row index written as the first column.
# NOTE(review): a later cell writes a sentence-only version to the same
# clean_path, so this file is overwritten when the cells run top to bottom.
cleaned_news.to_csv(clean_path, index=True)
print(f"File successfully cleaned and saved in: {clean_path}")

File successfully cleaned and saved in: ./../../data/clean/cleaned_news_unlabeled.txt


In [148]:
# Save only the 'sentence' column to the .txt file using to_csv (no index, no header row).
# NOTE(review): this writes to the same clean_path as the earlier save cell and replaces that file.
cleaned_news[['sentence']].to_csv(clean_path, index=False, header=False)

## Split CSV for labelling

In [57]:
names = ['kenzie', 'naufal', 'hafiz', 'satrio']
# Derive the chunk size from the number of labellers so the two cannot drift apart.
split_size = len(cleaned_news) // len(names)

# dirname() drops the trailing slash of splitted_path, giving the target directory.
# The directory is the same for every labeller, so it is created once up front.
split_dir = os.path.dirname(splitted_path)
os.makedirs(split_dir, exist_ok=True)

for i, name in enumerate(names):
    start_idx = i * split_size
    # The last labeller also receives the rows left over by the integer division.
    if i == len(names) - 1:
        end_idx = len(cleaned_news)
    else:
        end_idx = (i + 1) * split_size

    split_data = cleaned_news.iloc[start_idx:end_idx]
    split_file_path = os.path.join(split_dir, f"unlabeled_{name}.csv")
    split_data.to_csv(split_file_path, index=False)
    print(f"File for {name} successfully saved in: {split_file_path}")

File for kenzie successfully saved in: ./../../data/clean/splitted\unlabeled_kenzie.csv
File for naufal successfully saved in: ./../../data/clean/splitted\unlabeled_naufal.csv
File for hafiz successfully saved in: ./../../data/clean/splitted\unlabeled_hafiz.csv
File for satrio successfully saved in: ./../../data/clean/splitted\unlabeled_satrio.csv
