<a href="https://colab.research.google.com/github/fatuunreal/stki4/blob/main/stki4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install ekphrasis



In [6]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    #annotate={"hashtag", "allcaps", "elongated", "repeated",'emphasis', 'censored'},
    annotate={"hashtag","allcaps","elongated","repeated",'emphasis','censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


**Load Library**

In [7]:
import re
import string
import pandas as pd
from copy import deepcopy

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

**Load Datasets**

In [89]:
df = pd.read_csv('tweet-jokowi.csv')

In [90]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,Label
0,0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,positif
1,1,@jokowi harga2 pd naik gaji aparat negara gk n...,negatif
2,2,xx : kalian coba mengusir yang mau membersihka...,negatif
3,3,@jokowi haturnuhun bapak presiden @jokowi tela...,netral
4,4,@rifanrobani @catatan_ali7 @erickthohir @jokow...,netral


In [107]:
# Map the labels to numeric values
label_mapping = {'positif': 2, 'netral': 1, 'negatif': 0}
df['Label'] = df['Label'].map(label_mapping)

# Convert 'Label' column to float64
df['Label'] = df['Label'].astype('float64')

In [108]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1006 non-null   int64  
 1   text        1006 non-null   object 
 2   Label       0 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 23.7+ KB


# Preprosesing

### setting fungsi

In [14]:
def bersih_data(text):
    return " ".join(text_processor.pre_process_doc(text))

def non_ascii(text):
    return text.encode('ascii', 'replace').decode('ascii')

def remove_space_alzami(text):
    return " ".join(text.split())

def remove_emoji_alzami(text):
    return ' '.join(re.sub("([x#][A-Za-z0-9]+)"," ", text).split())

def remove_tab(text):
    return text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")

def remove_tab2(text):
    return re.sub('\s+',' ',text)

def remove_rt(text):
    return text.replace('RT'," ")

def remove_mention(text):
    return ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())

def remove_incomplete_url(text):
    return text.replace("http://", " ").replace("https://", " ")

def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

def change_stripe(text):
    return text.replace('-'," ")

def lower(text):
    return text.lower()

def remove_punctuation(text):
    remove = string.punctuation
    remove = remove.replace("_", "") # don't remove hyphens
    pattern = r"[{}]".format(remove) # create the pattern
    return re.sub(pattern, "", text)

## running preprosesing

In [16]:
final_string = []
for text in df['text'].values:
    EachReviewText = ""
    proc = bersih_data(text)
    proc = remove_rt(proc)
    proc = lower(proc)
    proc = change_stripe(proc)
    proc = remove_tab(proc)
    proc = remove_tab2(proc)
    proc = non_ascii(proc)
    proc = remove_incomplete_url(proc)
    proc = remove_single_char(proc)
    proc = remove_punctuation(proc)
    proc = remove_space_alzami(proc)
    EachReviewText = proc
    final_string.append(EachReviewText)

masukkan hasil preprocessing ke kolom step01

In [17]:
df["step01"] = final_string

In [19]:
df.head(14)

Unnamed: 0.1,Unnamed: 0,text,Label,step01
0,0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...
1,1,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...
2,2,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...
3,3,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...
4,4,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya
5,5,rt @_delightbee: @jokowi @pertamina @kemenbumn...,2,rt user user user user kapan lagi ya pak rakya...
6,6,@jokowi @pertamina @kemenbumn kapan lagi ya pa...,2,user user user kapan lagi ya pak rakyat mu rin...
7,7,presiden @jokowi memberikan keterangan pers sa...,2,presiden user memberikan keterangan pers saat ...
8,8,rt @jokowi: saat dunia mengalami kekurangan pa...,2,rt user saat dunia mengalami kekurangan pangan...
9,9,rt @jokowi: situasi dunia sedang terdisrupsi d...,2,rt user situasi dunia sedang terdisrupsi di bi...


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006 entries, 0 to 1005
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1006 non-null   int64 
 1   text        1006 non-null   object
 2   Label       1005 non-null   object
 3   step01      1006 non-null   object
dtypes: int64(1), object(3)
memory usage: 31.6+ KB


hapus record yang kosong

In [21]:
df_hapus = df[~df['step01'].str.contains(" ")]

In [22]:
df_new = df[~df.isin(df_hapus)].dropna()

In [23]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1002 entries, 0 to 1005
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1002 non-null   float64
 1   text        1002 non-null   object 
 2   Label       1002 non-null   object 
 3   step01      1002 non-null   object 
dtypes: float64(1), object(3)
memory usage: 39.1+ KB


## normalisasi kata slang

In [36]:
# token
import nltk

# Download the necessary NLTK data package
nltk.download('punkt_tab')  # Download the punkt_tab data

from nltk.tokenize import word_tokenize


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [37]:
def word_tokenize_wrapper(text):
  return word_tokenize(text)

In [39]:
df_new['tokens'] = df['step01'].apply(word_tokenize_wrapper)

In [40]:
df_new.head(10)

Unnamed: 0.1,Unnamed: 0,text,Label,step01,tokens
0,0.0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...,"[rt, user, user, user, user, negara, demokrasi..."
1,1.0,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...,"[user, harga2, pd, naik, gaji, aparat, negara,..."
2,2.0,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...,"[xx, kalian, coba, mengusir, yang, mau, member..."
3,3.0,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...,"[user, haturnuhun, bapak, presiden, user, tela..."
4,4.0,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya,"[user, user, user, user, bahkan, bisa, jadi, t..."
5,5.0,rt @_delightbee: @jokowi @pertamina @kemenbumn...,2,rt user user user user kapan lagi ya pak rakya...,"[rt, user, user, user, user, kapan, lagi, ya, ..."
6,6.0,@jokowi @pertamina @kemenbumn kapan lagi ya pa...,2,user user user kapan lagi ya pak rakyat mu rin...,"[user, user, user, kapan, lagi, ya, pak, rakya..."
7,7.0,presiden @jokowi memberikan keterangan pers sa...,2,presiden user memberikan keterangan pers saat ...,"[presiden, user, memberikan, keterangan, pers,..."
8,8.0,rt @jokowi: saat dunia mengalami kekurangan pa...,2,rt user saat dunia mengalami kekurangan pangan...,"[rt, user, saat, dunia, mengalami, kekurangan,..."
9,9.0,rt @jokowi: situasi dunia sedang terdisrupsi d...,2,rt user situasi dunia sedang terdisrupsi di bi...,"[rt, user, situasi, dunia, sedang, terdisrupsi..."


In [42]:
normalized_word = pd.read_excel("kamus perbaikan kata.xlsx")

normalized_word_dict = {}

for index, row in normalized_word.iterrows():
    if row[0] not in normalized_word_dict:
        normalized_word_dict[row[0]] = row[1]

def normalized_term(document):
    return [normalized_word_dict[term] if term in normalized_word_dict else term for term in document]

df_new['tokens_perbaikan'] = df_new['tokens'].apply(normalized_term)

  if row[0] not in normalized_word_dict:
  normalized_word_dict[row[0]] = row[1]


In [43]:
df_new.head()

Unnamed: 0.1,Unnamed: 0,text,Label,step01,tokens,tokens_perbaikan
0,0.0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...,"[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi..."
1,1.0,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...,"[user, harga2, pd, naik, gaji, aparat, negara,...","[user, harga2, pada, naik, gaji, aparat, negar..."
2,2.0,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...,"[xx, kalian, coba, mengusir, yang, mau, member...","[xx, kalian, coba, mengusir, yang, mau, member..."
3,3.0,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...,"[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, bapak, presiden, user, tela..."
4,4.0,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya,"[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, bahkan, bisa, jadi, t..."


In [44]:
df_new.iloc[0].tokens

['rt',
 'user',
 'user',
 'user',
 'user',
 'negara',
 'demokrasi',
 'itu',
 'hukumnya',
 'hrs',
 'tegas',
 'dan',
 'adil',
 'ibarat',
 'kereta',
 'api',
 'dan',
 'relnya',
 'kereta',
 'api']

In [45]:
df_new.iloc[0].tokens_perbaikan

['rt',
 'user',
 'user',
 'user',
 'user',
 'negara',
 'demokrasi',
 'itu',
 'hukumnya',
 'harus',
 'tegas',
 'dan',
 'adil',
 'ibarat',
 'kereta',
 'api',
 'dan',
 'relnya',
 'kereta',
 'api']

# Stopword - Menggunakan Modul NLTK

In [46]:
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [47]:
stopword = nltk.corpus.stopwords.words('indonesian')

def remove_stopwords(text):
  text = [word for word in text if word not in stopword]
  return text

df_new['Stop_removal'] = df_new['tokens_perbaikan'].apply(lambda x: remove_stopwords(x))
df_new

Unnamed: 0.1,Unnamed: 0,text,Label,step01,tokens,tokens_perbaikan,Stop_removal
0,0.0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...,"[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi..."
1,1.0,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...,"[user, harga2, pd, naik, gaji, aparat, negara,...","[user, harga2, pada, naik, gaji, aparat, negar...","[user, harga2, gaji, aparat, negara, naik2, en..."
2,2.0,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...,"[xx, kalian, coba, mengusir, yang, mau, member...","[xx, kalian, coba, mengusir, yang, mau, member...","[xx, coba, mengusir, membersihkan, air, planet..."
3,3.0,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...,"[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, presiden, user, berkenan, k..."
4,4.0,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya,"[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, titisannya]"
...,...,...,...,...,...,...,...
1001,1001.0,"Memang hebat orang pilihan pak @jokowi ini, lu...",2,memang hebat orang pilihan pak user ini lulusa...,"[memang, hebat, orang, pilihan, pak, user, ini...","[memang, hebat, orang, pilihan, pak, user, ini...","[hebat, orang, pilihan, user, lulusan, allcaps..."
1002,1002.0,Kunjungan Kehormatan Presiden @jokowi dan Ibu ...,2,kunjungan kehormatan presiden user dan ibu iri...,"[kunjungan, kehormatan, presiden, user, dan, i...","[kunjungan, kehormatan, presiden, user, dan, i...","[kunjungan, kehormatan, presiden, user, iriana..."
1003,1003.0,Berbagai upaya dilakukan Indonesia untuk melak...,2,berbagai upaya dilakukan indonesia untuk melak...,"[berbagai, upaya, dilakukan, indonesia, untuk,...","[berbagai, upaya, dilakukan, indonesia, untuk,...","[upaya, indonesia, evakuasi, mengamankan, allc..."
1004,1004.0,Sebagai tindak lanjut perintah Presiden @jokow...,2,sebagai tindak lanjut perintah presiden user s...,"[sebagai, tindak, lanjut, perintah, presiden, ...","[sebagai, tindak, lanjut, perintah, presiden, ...","[tindak, perintah, presiden, user, mencegah, t..."


In [48]:
df_new.iloc[0].tokens_perbaikan

['rt',
 'user',
 'user',
 'user',
 'user',
 'negara',
 'demokrasi',
 'itu',
 'hukumnya',
 'harus',
 'tegas',
 'dan',
 'adil',
 'ibarat',
 'kereta',
 'api',
 'dan',
 'relnya',
 'kereta',
 'api']

In [49]:
df_new.iloc[0].Stop_removal

['rt',
 'user',
 'user',
 'user',
 'user',
 'negara',
 'demokrasi',
 'hukumnya',
 'adil',
 'kereta',
 'api',
 'relnya',
 'kereta',
 'api']

In [50]:
i=0
final_string_tokens = []
for text in df_new['Stop_removal'].values:
    EachReviewText = ""
    EachReviewText = ' '.join(text)
    final_string_tokens.append(EachReviewText)

In [51]:
df_new["step02"] = final_string_tokens

In [52]:
df_new.head()

Unnamed: 0.1,Unnamed: 0,text,Label,step01,tokens,tokens_perbaikan,Stop_removal,step02
0,0.0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...,"[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi...",rt user user user user negara demokrasi hukumn...
1,1.0,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...,"[user, harga2, pd, naik, gaji, aparat, negara,...","[user, harga2, pada, naik, gaji, aparat, negar...","[user, harga2, gaji, aparat, negara, naik2, en...",user harga2 gaji aparat negara naik2 enak mah ...
2,2.0,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...,"[xx, kalian, coba, mengusir, yang, mau, member...","[xx, kalian, coba, mengusir, yang, mau, member...","[xx, coba, mengusir, membersihkan, air, planet...",xx coba mengusir membersihkan air planet mengg...
3,3.0,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...,"[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, presiden, user, berkenan, k...",user haturnuhun presiden user berkenan kunker ...
4,4.0,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya,"[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, titisannya]",user user user user titisannya


# Stemming - Menggunakan Sastrawi

In [53]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m143.4/209.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [54]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [55]:
final_string = []
s = ""
for sentence in df_new["step02"].values:
    filteredSentence = []
    EachReviewText = ""
    s = (stemmer.stem(sentence))
    filteredSentence.append(s)

    EachReviewText = ' '.join(filteredSentence)
    final_string.append(EachReviewText)

In [56]:
df_new["ProcessedText"] = final_string

In [57]:
df_new.head(10)

Unnamed: 0.1,Unnamed: 0,text,Label,step01,tokens,tokens_perbaikan,Stop_removal,step02,ProcessedText
0,0.0,rt @megatop99: @conannkri @ccicpolri @mohmahfu...,2,rt user user user user negara demokrasi itu hu...,"[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi...","[rt, user, user, user, user, negara, demokrasi...",rt user user user user negara demokrasi hukumn...,rt user user user user negara demokrasi hukum ...
1,1.0,@jokowi harga2 pd naik gaji aparat negara gk n...,0,user harga2 pd naik gaji aparat negara gk naik...,"[user, harga2, pd, naik, gaji, aparat, negara,...","[user, harga2, pada, naik, gaji, aparat, negar...","[user, harga2, gaji, aparat, negara, naik2, en...",user harga2 gaji aparat negara naik2 enak mah ...,user harga2 gaji aparat negara naik2 enak mah ...
2,2.0,xx : kalian coba mengusir yang mau membersihka...,0,xx kalian coba mengusir yang mau membersihkan ...,"[xx, kalian, coba, mengusir, yang, mau, member...","[xx, kalian, coba, mengusir, yang, mau, member...","[xx, coba, mengusir, membersihkan, air, planet...",xx coba mengusir membersihkan air planet mengg...,xx coba usir bersih air planet gagal tukar air...
3,3.0,@jokowi haturnuhun bapak presiden @jokowi tela...,1,user haturnuhun bapak presiden user telah berk...,"[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, bapak, presiden, user, tela...","[user, haturnuhun, presiden, user, berkenan, k...",user haturnuhun presiden user berkenan kunker ...,user haturnuhun presiden user kenan kunker kam...
4,4.0,@rifanrobani @catatan_ali7 @erickthohir @jokow...,1,user user user user bahkan bisa jadi titisannya,"[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, bahkan, bisa, jadi, t...","[user, user, user, user, titisannya]",user user user user titisannya,user user user user titis
5,5.0,rt @_delightbee: @jokowi @pertamina @kemenbumn...,2,rt user user user user kapan lagi ya pak rakya...,"[rt, user, user, user, user, kapan, lagi, ya, ...","[rt, user, user, user, user, kapan, lagi, iya,...","[rt, user, user, user, user, iya, rakyat, mu, ...",rt user user user user iya rakyat mu rindu,rt user user user user iya rakyat mu rindu
6,6.0,@jokowi @pertamina @kemenbumn kapan lagi ya pa...,2,user user user kapan lagi ya pak rakyat mu rin...,"[user, user, user, kapan, lagi, ya, pak, rakya...","[user, user, user, kapan, lagi, iya, pak, raky...","[user, user, user, iya, rakyat, mu, rindu, coh...",user user user iya rakyat mu rindu cohxss8jwrgr,user user user iya rakyat mu rindu cohxss8jwrgr
7,7.0,presiden @jokowi memberikan keterangan pers sa...,2,presiden user memberikan keterangan pers saat ...,"[presiden, user, memberikan, keterangan, pers,...","[presiden, user, memberikan, keterangan, pers,...","[presiden, user, keterangan, pers, kunjungan, ...",presiden user keterangan pers kunjungan kerja ...,presiden user terang pers kunjung kerja pasar ...
8,8.0,rt @jokowi: saat dunia mengalami kekurangan pa...,2,rt user saat dunia mengalami kekurangan pangan...,"[rt, user, saat, dunia, mengalami, kekurangan,...","[rt, user, saat, dunia, mengalami, kekurangan,...","[rt, user, dunia, mengalami, kekurangan, panga...",rt user dunia mengalami kekurangan pangan kond...,rt user dunia alami kurang pangan kondisi sedi...
9,9.0,rt @jokowi: situasi dunia sedang terdisrupsi d...,2,rt user situasi dunia sedang terdisrupsi di bi...,"[rt, user, situasi, dunia, sedang, terdisrupsi...","[rt, user, situasi, dunia, sedang, terdisrupsi...","[rt, user, situasi, dunia, terdisrupsi, bidang...",rt user situasi dunia terdisrupsi bidang panga...,rt user situasi dunia disrupsi bidang pangan e...


# simpan hasil preprocessing

In [59]:
df_new.to_csv('clean_dataset_part01.csv',sep=";")

# Penataan Data

## pembagian fitur dan label

In [60]:
import pandas as pd
import numpy as np

In [61]:
dataset = pd.read_csv('clean_dataset_part01.csv',sep=";")

In [62]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.1      1002 non-null   int64  
 1   Unnamed: 0        1002 non-null   float64
 2   text              1002 non-null   object 
 3   Label             1002 non-null   object 
 4   step01            1002 non-null   object 
 5   tokens            1002 non-null   object 
 6   tokens_perbaikan  1002 non-null   object 
 7   Stop_removal      1002 non-null   object 
 8   step02            1002 non-null   object 
 9   ProcessedText     1002 non-null   object 
dtypes: float64(1), int64(1), object(8)
memory usage: 78.4+ KB


In [63]:
dataset_feature = dataset['ProcessedText'].astype(str)

In [64]:
dataset_feature.head(10)

Unnamed: 0,ProcessedText
0,rt user user user user negara demokrasi hukum ...
1,user harga2 gaji aparat negara naik2 enak mah ...
2,xx coba usir bersih air planet gagal tukar air...
3,user haturnuhun presiden user kenan kunker kam...
4,user user user user titis
5,rt user user user user iya rakyat mu rindu
6,user user user iya rakyat mu rindu cohxss8jwrgr
7,presiden user terang pers kunjung kerja pasar ...
8,rt user dunia alami kurang pangan kondisi sedi...
9,rt user situasi dunia disrupsi bidang pangan e...


In [66]:
dataset_label = dataset['Label']

In [67]:
dataset_label.head(10)

Unnamed: 0,Label
0,2
1,0
2,0
3,1
4,1
5,2
6,2
7,2
8,2
9,2


In [76]:
# prompt: ubah dtype untuk Label dari objetct ke float64

dataset_label = dataset['Label'].astype('float64')
dataset_label.head(10)

ValueError: could not convert string to float: 'Negatif'

In [68]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [75]:
# Visualizing the target variable
plt.figure(figsize=(12,8))
sns.distplot(dataset_label, label=f'target, skew: {dataset_label.skew():.2f}')
plt.legend(loc='best')
plt.show()

TypeError: could not convert string to float: 'Negatif'

<Figure size 1200x800 with 0 Axes>