# LIBRARY

In [None]:
pip install pandas

In [None]:
pip install openpyxl

In [None]:
pip install openai

In [None]:
pip install python_dotenv

# LOAD DATA

In [1]:
import pandas as pd

In [11]:
# Folder that is scanned for the raw validation CSV files to combine.
folder_path = 'Data/Data Sentimen/Raw Data/Validation'

In [None]:
import os

def read_csv_files(folder_path):
    """Load every ``.csv`` file in a folder into one combined DataFrame.

    Files are read in sorted filename order so the row order of the result
    is reproducible; ``os.listdir`` returns entries in arbitrary order, which
    would otherwise make positional row references unstable between runs.
    Malformed lines are dropped by pandas (``on_bad_lines="skip"``).

    Args:
        folder_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A DataFrame with the rows of all CSV files stacked under a fresh
        0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.csv`` files.
    """
    csv_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.csv')
    )
    if not csv_names:
        # pd.concat would raise an opaque "No objects to concatenate";
        # keep the exception type but say what is actually wrong.
        raise ValueError(f"No .csv files found in {folder_path!r}")

    frames = [
        pd.read_csv(os.path.join(folder_path, name), on_bad_lines="skip")
        for name in csv_names
    ]
    return pd.concat(frames, ignore_index=True)

# Combine all CSV files found under folder_path and dump the result for a
# quick visual check.
sentimen = read_csv_files(folder_path=folder_path)
print(sentimen)

In [None]:
# `df` is a second name for the combined frame (same object, not a copy);
# show its last five rows.
df = sentimen
df.tail(5)

In [None]:
len(df)

# DATA PREPROCESSING

In [15]:
# Remove fully duplicated rows, then renumber the index 0..n-1.
# drop_duplicates() keeps the original index labels, so without the reset
# any later lookup by label (e.g. df['full_text'][2]) fails with KeyError
# whenever that particular row happened to be a duplicate.
df = df.drop_duplicates().reset_index(drop=True)

In [16]:
import re
# Tweet text cleaning helper
def clean_tweet_text(raw_text):
    """Remove non-ASCII characters, URLs, trailing commas and @mentions.

    Args:
        raw_text: The tweet text. ``None`` and NaN (which pandas uses for
            missing cells) are treated as empty text; any other non-string
            value is converted with ``str`` before cleaning.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Missing values would make re.sub raise TypeError; NaN is the only
    # float that is not equal to itself.
    if raw_text is None or (isinstance(raw_text, float) and raw_text != raw_text):
        return ''
    if not isinstance(raw_text, str):
        raw_text = str(raw_text)

    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', raw_text)       # non-ASCII runs (emoji etc.)
    cleaned_text = re.sub(r'https?:\/\/\S+', '', cleaned_text)  # http/https links
    cleaned_text = re.sub(r',+$', '', cleaned_text)             # commas at the very end
    cleaned_text = re.sub(r'@\w+', '', cleaned_text)            # @mentions
    return cleaned_text.strip()

# Run every tweet in the full_text column through the cleaner, in place.
df['full_text'] = df['full_text'].map(clean_tweet_text)

In [None]:
df['full_text']

In [None]:
# Spot-check the third and fourth cleaned tweets. Positional access (.iloc)
# is used so the check still works when the index has gaps, e.g. after
# duplicate rows were dropped; a plain [2] is a label lookup and can raise
# KeyError.
print(f"{df['full_text'].iloc[2]}\n{df['full_text'].iloc[3]}")

In [19]:
# Keep only the timestamp and the cleaned text, renamed to date / tweet,
# plus an empty `labels` column.
data_clean = pd.DataFrame(
    {
        'date': df['created_at'],
        'tweet': df['full_text'],
        'labels': None,
    }
)

In [None]:
data_clean.head()

In [None]:
data_clean.dtypes

In [22]:
# Second cleaning pass: the column this was copied from (df['full_text'])
# has already been run through clean_tweet_text once.
data_clean['tweet'] = data_clean['tweet'].map(clean_tweet_text)

In [None]:
data_clean.head()

In [24]:
# Save the cleaned frame as an Excel workbook, without the index column.
data_clean.to_excel(
    'Data/Data Sentimen/Clean Data/data_clean-validation.xlsx',
    index=False,
)

In [25]:
import pandas as pd
# Reload the cleaned workbook and discard the row with index label 3.
# NOTE(review): the reason for excluding this particular row is not recorded
# here; confirm the hard-coded label still points at the intended row.
df = pd.read_excel(
    'Data/Data Sentimen/Clean Data/data_clean-validation.xlsx'
).drop(index=3)

In [None]:
df.sample(5)

In [None]:
len(df)

## LABELING

In [None]:
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd

class AutoLabeling:
    """Assign a sentiment label to tweets through an Azure OpenAI chat deployment.

    Configuration is read from environment variables when the object is
    created: ``AZURE_OPENAI_API_KEY``, ``API_BASE`` (passed as the Azure
    endpoint) and ``DEPLOYMENT_NAME`` (passed as the model to call).
    """

    def __init__(self) -> None:
        """Create the Azure OpenAI client from environment variables."""
        self.client = AzureOpenAI(
            api_key=os.getenv("AZURE_OPENAI_API_KEY"),
            api_version="2024-02-01",
            azure_endpoint=os.getenv("API_BASE")
        )
        self.deployment_name = os.getenv("DEPLOYMENT_NAME")

    def request_generate(self, prompt):
        """Send chat messages to the deployment and return the reply text.

        Args:
            prompt: List of chat message dicts (``role`` / ``content``).

        Returns:
            The content of the first choice with ``' .'`` replaced by ``'.'``
            and surrounding whitespace stripped, or ``None`` when the request
            or the handling of its response raised an exception.
        """
        try:
            response = self.client.chat.completions.create(
                model=self.deployment_name,
                messages=prompt,
                temperature=0.4,
                max_tokens=2000
            )
            text = response.choices[0].message.content.replace(' .', '.').strip()
            return text
        except Exception as e:
            # Best effort: a single failed request is reported and signalled
            # with None so the caller can mark that item and carry on.
            print("An error occurred:", e)
            return None

    def process_tweets(self, dates, tweets):
        """Label every tweet and return the results as a DataFrame.

        Args:
            dates: Dates of the tweets (any input accepted as a DataFrame
                column, e.g. a list or a pandas Series).
            tweets: Tweet texts, aligned with ``dates``.

        Returns:
            A DataFrame with columns ``date``, ``tweet`` and ``labels``. Each
            label is the text returned by ``request_generate`` for that
            tweet, or ``"Error"`` when the request failed. The index of the
            inputs is preserved.
        """
        df = pd.DataFrame({'date': dates, 'tweet': tweets})

        # The instructions are the same for every tweet, so build them once.
        system_message = {
            "role": "system",
            "content": ("""Anda adalah seorang manusia biasa yang ahli dalam bahasa, tugas anda adalah memberikan label dari kalimat twitter dengan kategori 'positif' 'negatif' atau 'netral'
                pastikan hanya memberikan label dari kalimatnya saja, Selalu pastikan untuk memberi 1 label untuk setiap text yang saya inputkan.
                Gunakan huruf kapital hanya di awal nama label""")
        }

        labels = []
        for tweet in tqdm(df['tweet'], total=len(df), desc="Processing tweets"):
            user_message_content = f"Berikut adalah teks yang harus kamu labeli\nText: {tweet}\nLabel:"
            user_message = {'role': 'user', 'content': user_message_content}

            label = self.request_generate([system_message, user_message])
            labels.append("Error" if label is None else label)

        # Assign by position. Writing with df.at[i, ...] where i is the loop
        # counter treats i as an index *label*; when the incoming Series has
        # gaps in its index (rows dropped upstream) that shifts labels onto
        # the wrong tweets and appends phantom rows.
        df['labels'] = labels
        return df

# Pull settings from the .env file into the environment before the client
# is constructed, then label the whole frame.
load_dotenv()

auto_labeling = AutoLabeling()
processed_df = auto_labeling.process_tweets(
    dates=df['date'],
    tweets=df['tweet'],
)

In [None]:
# Continue on an independent copy of the labeled frame; preview five rows.
df = processed_df.copy(deep=True)
df.head(5)

In [None]:
len(df)

In [32]:
# Keep only rows whose label is exactly one of the three expected values;
# anything else (including "Error") is dropped.
allowed_labels = ['Positif', 'Negatif', 'Netral']

filtered_df = df.loc[df['labels'].isin(allowed_labels)]

In [None]:
filtered_df.nunique()

In [34]:
# Save the rows with a valid label as an Excel workbook, without the index.
filtered_df.to_excel(
    'Data/Data Sentimen/Labeled Data/labelled-validation.xlsx',
    index=False,
)