In [None]:
import pandas as pd
import re

# Function to safely read TSV files and extract words (assuming no header for these input files)
def load_sentiment_words(file_path):
    """Load the first column of a headerless TSV lexicon as a set of words.

    Only the first tab-separated field of each row is used; any further
    fields (for example a score) are ignored. Words are stripped of
    surrounding whitespace and lower-cased so that they can be compared
    directly against lower-cased, whitespace-split tokens.

    Args:
        file_path: Path to the TSV file to read.

    Returns:
        A set of lower-cased words. The set is empty when the file is
        missing or cannot be parsed; in that case a message is printed
        instead of raising, so the caller can continue with an empty lexicon.
    """
    # Common header labels that must not be mistaken for sentiment words.
    header_names = {"word", "term"}

    try:
        # pd.read_csv raises FileNotFoundError on its own, so no separate
        # existence probe is needed.
        df = pd.read_csv(
            file_path,
            sep='\t',
            header=None,
            # Read everything as text and disable the default NA markers:
            # otherwise real lexicon entries such as "null", "nan" or "na"
            # are silently converted to NaN and dropped.
            dtype=str,
            keep_default_na=False,
            on_bad_lines='skip',
            quotechar=None,
            quoting=3,
        )
    except FileNotFoundError:
        print(f"Peringatan: File tidak ditemukan: {file_path}. Daftar kata akan kosong.")
        return set()
    except Exception as e:
        # Deliberately best-effort: any read/parse failure yields an empty lexicon.
        print(f"Error membaca {file_path}: {e}")
        return set()

    # Normalise once, column-wise, instead of row by row.
    candidates = df[0].dropna().astype(str).str.strip().str.lower()
    # Skip empty first fields and header labels.
    return {word for word in candidates if word and word not in header_names}

# Function to tokenize text
def tokenize(text):
    """Turn arbitrary input into a list of lower-cased alphabetic tokens.

    The input is converted with ``str()`` and lower-cased, then cleaned by a
    fixed sequence of regex substitutions (URLs, @mentions, every character
    that is not an ASCII letter or whitespace, and whitespace runs). The
    result is split on whitespace and single-character tokens are discarded.
    """
    # (pattern, replacement, flags) applied strictly in this order.
    cleanup_steps = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # links
        (r'\@\w+', '', 0),                               # @mentions
        (r'[^a-zA-Z\s]', '', 0),                         # anything but letters/whitespace
        (r'\s+', ' ', 0),                                # collapse whitespace runs
    )

    normalized = str(text).lower()
    for pattern, replacement, flags in cleanup_steps:
        normalized = re.sub(pattern, replacement, normalized, flags=flags)

    # Tokens must be at least two characters long.
    return [piece for piece in normalized.strip().split() if len(piece) >= 2]


# Input and output locations.
path_pilgub_csv = "PilgubJatim1_no_hashtags.csv"
path_positive_tsv = "positive.tsv"
path_negative_tsv = "negative.tsv"
output_file_name = "netral_with_header.tsv"  # output TSV, written with a "word<TAB>weight" header

# Read the main CSV; on any failure report it and continue with an empty frame.
try:
    main_df = pd.read_csv(path_pilgub_csv)
except Exception as e:
    print(f"Error membaca {path_pilgub_csv}: {e}")
    main_df = pd.DataFrame()

# Sentiment lexicons used to decide which words are NOT neutral.
positive_words = load_sentiment_words(path_positive_tsv)
negative_words = load_sentiment_words(path_negative_tsv)

print(f"Jumlah kata positif dimuat: {len(positive_words)}")
print(f"Jumlah kata negatif dimuat: {len(negative_words)}")


# Collect the unique tokens of every non-missing text cell.
all_words_from_csv = set()
if 'full_text_no_hashtags' not in main_df.columns:
    print(f"Kolom 'full_text_no_hashtags' tidak ditemukan di {path_pilgub_csv}")
else:
    for text_content in main_df['full_text_no_hashtags']:
        if pd.isna(text_content):
            continue
        all_words_from_csv.update(tokenize(text_content))

print(f"Jumlah kata unik dari CSV (setelah tokenisasi): {len(all_words_from_csv)}")

# Neutral words are those found in neither lexicon; sorted for stable output.
neutral_words_list = sorted(all_words_from_csv - positive_words - negative_words)

# Write the header plus one "<word>\t0" line per neutral word, then show a preview.
try:
    with open(output_file_name, 'w', encoding='utf-8') as out_file:
        out_file.write("word\tweight\n")
        out_file.writelines(f"{neutral}\t0\n" for neutral in neutral_words_list)
    print(f"File '{output_file_name}' telah berhasil dibuat dengan header dan {len(neutral_words_list)} kata netral.")
    if neutral_words_list:
        print(f"\nContoh isi file '{output_file_name}':")
        print("word\tweight")  # header line of the preview
        for sample in neutral_words_list[:5]:
            print(f"{sample}\t0")
        if len(neutral_words_list) > 5:
            print("...")
    else:
        print("Tidak ada kata netral yang ditemukan berdasarkan kriteria.")

except Exception as e:
    print(f"Error menulis {output_file_name}: {e}")

Jumlah kata positif dimuat: 3608
Jumlah kata negatif dimuat: 6607
Jumlah kata unik dari CSV (setelah tokenisasi): 4852
File 'netral.tsv' telah berhasil dibuat ulang dengan 3751 kata netral.

Contoh isi file 'netral.tsv':
aang	0
abang	0
abdul	0
abdullah	0
abdurrahman	0
...
