In [1]:
import pandas as pd 
import numpy as np
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
df = pd.read_csv('komentar_bersih_instagram.csv', usecols=['clean_comment']).astype('str')

In [3]:
df.head(10)

Unnamed: 0,clean_comment
0,aneh aneh aje
1,nambah beban rakyat aja dih
2,
3,banget bangettt bangettt jalan aja bayar skrg ...
4,ngga paham pikir jaman now ketindesss aje moga...
5,gagal perintah kota selesai ujung rakyat sengs...
6,bener din gitu bayar pajak teh taun naha kitu ...
7,bebas pajak
8,rakyat ud susah bikin susah allahhh
9,blok


In [4]:
# LEXICON
lexicon = pd.read_csv('lexicon.csv')
lexicon['weight'] = lexicon['sentiment'].map({'positive':1, 'negative':-1}) 
lexicon = dict(zip(lexicon['word'], lexicon['weight']))

In [5]:
print(lexicon)

{'absah': 1, 'absolut': 1, 'acuan': 1, 'afdol': 1, 'ahli': 1, 'akademisi': 1, 'akan bayar': 1, 'akbar': 1, 'akrab': 1, 'aktif': 1, 'aktualisasi': 1, 'akur': 1, 'akurat': 1, 'alami': 1, 'alamiah': 1, 'alhamdulillah': 1, 'alim': 1, 'alim ulama': 1, 'amal': 1, 'amal jariah': 1, 'aman': 1, 'aman-aman': 1, 'amboi': 1, 'ampuh': 1, 'anak emas': 1, 'andal': 1, 'andalan': 1, 'anggukan': 1, 'anggun': 1, 'animo': 1, 'anjuran': 1, 'anteng': 1, 'antusias': 1, 'antusiasme': 1, 'anugrah': 1, 'apresiasi': 1, 'arahan': 1, 'atensi': 1, 'ayo': 1, 'bagus': 1, 'bahagia': 1, 'bahagiakan': 1, 'bahagian': 1, 'bahu-membahu': 1, 'baik': 1, 'baik-baik': 1, 'bajik': 1, 'bakat': 1, 'bakti': 1, 'bala bantuan': 1, 'balas budi': 1, 'balas jasa': 1, 'bangga': 1, 'bangkit': 1, 'bantu': 1, 'bantuan': 1, 'banyak': 1, 'banyak-banyak': 1, 'beasiswa': 1, 'bebas banjir': 1, 'bebas hambatan': 1, 'bebas murni': 1, 'becus': 1, 'bekal': 1, 'beken': 1, 'bekerja keras': 1, 'bekerja sama': 1, 'belajar': 1, 'belas kasih': 1, 'benah'

In [6]:
# NEGATIVE WORDS
negative_words = list(open("negative.txt"))
negative_words = list([word.rstrip() for word in negative_words])

In [7]:
print(negative_words)

['abnormal', 'absurd', 'acak', 'acak-acakan', 'acuh', 'acuh tak acuh', 'adiktif', 'adil', 'agresi', 'agresif', 'agresor', 'aib', 'air terjun', 'akurat', 'alarm', 'alasan', 'alat permainan', 'alergi', 'alergik', 'amat ketakutan', 'amat panas', 'ambigu', 'ambivalen', 'ambivalensi', 'amoral', 'amoralitas', 'ampun', 'amuk', 'anak nakal', 'anak yatim', 'anarki', 'anarkis', 'anarkisme', 'ancaman', 'aneh', 'aneh lagi', 'anehnya', 'angkuh', 'angriness', 'anjing', 'anjlok', 'anomali', 'antagonis', 'antagonisme', 'antek', 'anti-', 'anti-Amerika', 'anti-Israel', 'anti-kita', 'anti-pendudukan', 'anti-proliferasi', 'anti-putih', 'anti-Semit', 'antipati', 'antisosial', 'antitesis', 'apak', 'apati', 'apatis', 'apek', 'apokaliptik', 'apologis', 'argumentatif', 'artinya jika', 'asam', 'asap', 'asem', 'asing', 'astaghfirullah', 'asusila', 'awan', 'awas', 'babi', 'badai', 'bahan tertawaan', 'bahaya', 'bajingan', 'baju kotor', 'balas dendam', 'bandel', 'bandot', 'bangkrut', 'bantingan', 'banyak sekali', '

In [8]:
comment_polarity = [] 
comment_weight = []
negasi = False

for sentence in df['clean_comment']: 
  sentence_score = 0 
  sentence_weight = "" 
  sentiment_count = 0 
  sentence = sentence.split()
  for word in sentence:
    try:
      score = lexicon[word]
      sentiment_count = sentiment_count + 1
    except:
      score = 99
    
    if(score == 99):
      if (word in negative_words): 
        negasi = True
        sentence_score = sentence_score - 1
        sentence_weight = sentence_weight + " - "+ str(1)
      else:
            sentence_score = sentence_score + 0 
            sentence_weight = sentence_weight + " + "+ str(0)
    else:
      if(negasi == True):
        sentence_score = sentence_score + (score * -1.0)
        sentence_weight = sentence_weight + " + ("+ str(score) + " * -1 "+") " 
        negasi = False
      else:
        sentence_score = sentence_score + score 
        sentence_weight = sentence_weight + " + "+ str(score)
        
  comment_weight.append(sentence_weight[1:] +" = " + str(sentence_score)) 
  if sentence_score > 0:
    comment_polarity.append('positive') 
  elif sentence_score < 0:
    comment_polarity.append('negative') 
  else:
    comment_polarity.append('neutral') 

results = pd.DataFrame({
    "comment" : df['clean_comment'], 
    "label" : comment_polarity, 
    "weight" : comment_weight
    })
results['label'].value_counts()
results[['comment', 'label']].to_csv('labeling-data-instagram.csv', encoding ='utf8', index = False)


In [9]:
results.head(20)

Unnamed: 0,comment,label,weight
0,aneh aneh aje,negative,- 1 - 1 + 0 = -2
1,nambah beban rakyat aja dih,negative,+ 0 - 1 + 0 + 0 + 0 = -1
2,,neutral,+ 0 = 0
3,banget bangettt bangettt jalan aja bayar skrg ...,negative,+ 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + ...
4,ngga paham pikir jaman now ketindesss aje moga...,negative,+ 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + (1 * -1 ) =...
5,gagal perintah kota selesai ujung rakyat sengs...,negative,- 1 + 0 + 0 + (1 * -1 ) + 0 + 0 - 1 + 0 + 0 +...
6,bener din gitu bayar pajak teh taun naha kitu ...,neutral,+ 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 + 0 = 0
7,bebas pajak,neutral,+ 0 + 0 = 0
8,rakyat ud susah bikin susah allahhh,negative,+ 0 + 0 - 1 + 0 - 1 + 0 = -2
9,blok,neutral,+ 0 = 0
