# Sentiment Analysis

Melakukan analisis sentimen pada data menggunakan model VADER. Data yang digunakan adalah data berita yang sudah diterjemahkan dan disimpan dalam file `data/translate.txt`. Skor sentimen yang dihasilkan kemudian disimpan di `data/data_vader.json`.

In [6]:
import pandas as pd
import re
import unicodedata
import json


import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Define Helper

In [3]:
def print_random(df, category_sentiment, column):
    """Print one randomly sampled article that carries the given sentiment label.

    Args:
        df: DataFrame exposing ``category_sentiment``, ``link`` and ``content``
            columns, plus the column named by ``column``.
        category_sentiment: Label value used to filter the rows.
        column: Name of the column whose value is printed as the score.
    """
    has_label = df.category_sentiment == category_sentiment
    picked = df[has_label].sample(1)

    # Each field is read and printed in turn: score, link, then the content.
    score = picked[column].to_list()[0]
    print("Sentiment Score :", score)

    link = picked.link.to_list()[0]
    print("Link Article :", link)

    body = picked.content.to_list()[0]
    print(body)
    
def category_sentiment(compound, threshold_neg=-0.05, threshold_pos=0.05):
    """Map a compound sentiment score to a categorical label.

    Args:
        compound: Numeric compound score, or ``None``/NaN when no score exists.
        threshold_neg: Scores strictly below this value are "Negative".
        threshold_pos: Scores strictly above this value are "Positive".

    Returns:
        "Negative", "Positive" or "Netral"; ``None`` when the score is missing.
    """
    import math

    # pandas represents missing scores as NaN rather than None. NaN fails both
    # comparisons below, so without this check it would be labelled "Netral".
    if compound is None or math.isnan(compound):
        return None
    if compound < threshold_neg:
        return "Negative"
    if compound > threshold_pos:
        return "Positive"
    return "Netral"

def clean_text(text):
    """Normalize an article's text with an ordered series of substitutions.

    The substitutions run strictly in the order written; later rules rely on
    the output of earlier ones (for example, duplicate spaces introduced by
    the abbreviation rules are collapsed near the end).

    Args:
        text: Article text as a ``str``.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    text = unicodedata.normalize("NFKD", text)
    text = re.sub("\xad", "", text)  # drop soft hyphens

    # Remove the period from certain abbreviations
    text = re.sub(r" Tbk\.", " Tbk ", text)
    text = re.sub(r" Tbk", " Tbk ", text)
    text = re.sub(r" Rp\.", " ", text)
    text = re.sub(r" Rp", " ", text)
    text = re.sub(r" PT\.", " PT ", text)
    text = re.sub(r" Pt\.", " PT ", text)
    text = re.sub(r" dr\.", " ", text)
    text = re.sub(r" Dr\.", " ", text)
    text = re.sub(r" DR\.", " ", text)
    text = re.sub(r" N\.A\.", " ", text)
    text = re.sub(r" H\.M\.", " HM", text)
    text = re.sub(r" jl\. ", " jalan ", text)
    text = re.sub(r" Jl\. ", " jalan ", text)
    text = re.sub(r" Jln\. ", " jalan ", text)
    text = re.sub(r" jln\. ", " jalan ", text)

    # Boilerplate phrases
    text = re.sub(r"Berikut rincian kurs jual-beli.*$", "", text)
    text = re.sub(r"(Simak berita lainnya seputar topik.+)$", "", text)
    # Remove everything between "--" and "--" (greedy: first to last marker on a line)
    text = re.sub(r"--.*--", " ", text)

    # Other patterns
    text = re.sub(r"\. *[0-9]+\. ", ". ", text)  # list numbering such as "1. 2." preceded by a period
    text = re.sub(r": *[0-9]+\. ", ". ", text)  # list numbering such as "1. 2." preceded by ":"
    text = re.sub(r"(?<=\d)(?=[a-zA-Z])", " ", text)  # split a digit glued to a following letter
    text = re.sub(r"(?<=[a-zA-Z])(?=\d)", " ", text)  # split a letter glued to a following digit
    # Drop one- or two-letter tokens followed by a period (usually name initials)
    text = re.sub(r" ([a-zA-Z]{1,2})\. ", " ", text)

    # Replace URLs with a placeholder token
    text = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', "link-website", text)

    text = re.sub(r"\*", "", text)  # remove asterisks
    text = re.sub(r"(?<=\d)[,.](?=\d)", "", text)  # remove a period or comma between digits
    text = re.sub(r":", ": ", text)  # add a space after ":"
    text = re.sub(r"\.(?=\S)", ". ", text)  # ensure a space after a period
    text = re.sub(r"\?(?=\S)", "? ", text)  # ensure a space after a question mark
    text = re.sub(r"\)(?=\S)", ") ", text)  # ensure a space after a closing parenthesis
    text = re.sub(r"\((?=\S)", " (", text)  # ensure a space before an opening parenthesis

    text = re.sub(r"\. +\.", ".", text)  # remove spaces between two periods
    text = re.sub(r" +\? +", "? ", text)  # remove spaces before a question mark
    text = re.sub(r" +,", ", ", text)  # remove spaces before a comma
    text = re.sub(r" +\. ", ". ", text)  # remove spaces before a period
    text = re.sub(r"\.,", ", ", text)  # collapse ".," into a comma

    text = re.sub(r" {2,}", " ", text)  # collapse repeated spaces
    text = re.sub(r"\.{2,}", ".", text)  # collapse repeated periods
    text = re.sub(r"\?{2,}", ".", text)  # replace repeated question marks with a single period

    text = re.sub(r'\. *\([^)]*\)[\. ]', ". ", text)  # remove a parenthetical at the start of a sentence
    text = re.sub(r'\. *\([^)]*\)$', ".", text)  # remove a parenthetical at the end of the article

    # Earlier rules may have split "Bisnis.com" into "Bisnis. com"; fold every
    # variant into a single token.
    text = re.sub(r" Bisnis\.com", " bisniscom", text)
    text = re.sub(r" Bisnis\. com", " bisniscom", text)
    text = re.sub(r" bisnis\. com", " bisniscom", text)
    text = re.sub(r" bisnis\.com", " bisniscom", text)

    return text.strip()

## VADER

In [14]:
# Module-level VADER analyzer, created once and reused by veder_sentiment
# for every sentence it scores.
sid = SentimentIntensityAnalyzer()

def veder_sentiment(row, column):
    """Row callback for ``DataFrame.apply(..., axis=1)`` that scores text with VADER.

    The text in ``row[column]`` is split into sentences, each sentence is
    scored with the module-level ``sid`` analyzer, and the per-sentence scores
    are averaged.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding the text.
        column: Key of the text to analyze.

    Returns:
        The same ``row``. When the text is usable it gains ``mean_compound``,
        ``mean_pos``, ``mean_neg``, ``mean_neu`` and ``per_sentence``; when the
        text is missing (``None``/NaN), not a string, blank, or yields no
        sentences, the row is returned unchanged.
    """
    text = row[column]

    # Missing text may be None or float NaN (what a join leaves for unmatched
    # rows), so test for a real, non-blank string instead of `is not None`.
    if not isinstance(text, str) or not text.strip():
        return row

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        # Nothing to average; avoid dividing by zero below.
        return row

    per_sentence = []
    compound = neg = pos = neu = 0
    for i, sentence in enumerate(sentences, start=1):
        # Double quotes are stripped before scoring; the stored sentence keeps them.
        sentiment = sid.polarity_scores(sentence.replace('"', ""))
        compound += sentiment['compound']
        neg += sentiment['neg']
        pos += sentiment['pos']
        neu += sentiment['neu']
        per_sentence.append({
            "sentence_ke" : i,
            "sentence" : sentence,
            "sentiment" : sentiment
        })

    sentence_length = len(sentences)
    row['mean_compound'] = compound / sentence_length
    row['mean_pos'] = pos / sentence_length
    row['mean_neg'] = neg / sentence_length
    row['mean_neu'] = neu / sentence_length
    row['per_sentence'] = per_sentence

    return row

## Main Program

In [None]:
# Load the translated articles; the file is parsed as JSON.
with open("data/translate.txt", "r", encoding='utf-8') as f:
    data_translate = pd.DataFrame(json.loads(f.read()))

data = pd.read_json("data/table_articles.json")

# Attach the translation columns to the articles through the shared "link"
# key, then turn "link" back into a regular column.
translations_by_link = data_translate.set_index("link")
articles_by_link = data.set_index("link")
data = articles_by_link.join(translations_by_link).reset_index()

data['content_clean'] = data.content.apply(clean_text)

# Score each row's translated text and persist the result.
data_veder = data.apply(veder_sentiment, axis=1, args=('translate_text',))
data_veder.to_json('data/data_vader.json')

# Bare expression so the notebook cell displays the resulting frame.
data_veder