In [1]:
import pandas as pd
import os
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.tag import CRFTagger
from sklearn.feature_extraction.text import CountVectorizer
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the raw tweets (with their 'label' column) for each candidate from the project repository.
df_anis = pd.read_csv('https://raw.github.com/ghazafm/SocialMediaSentiment/main/preprocessing/Training/data/raw/anis.csv')
df_prabowo = pd.read_csv('https://raw.github.com/ghazafm/SocialMediaSentiment/main/preprocessing/Training/data/raw/prabowo.csv')
df_ganjar = pd.read_csv('https://raw.github.com/ghazafm/SocialMediaSentiment/main/preprocessing/Training/data/raw/ganjar.csv')

# Each CSV carries an 'Unnamed: 0' column (presumably a saved index); discard it everywhere.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame.drop(columns='Unnamed: 0', inplace=True)

In [3]:
# Remove exact duplicate rows, then rows containing any missing value, in each dataset.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame.drop_duplicates(inplace=True)
    frame.dropna(inplace=True)

In [4]:
def remove_str_index(word,token):
    """Collapse every run of identical consecutive characters in ``word`` to one character.

    ``token`` is accepted for call compatibility but is not used.
    Returns the collapsed string (e.g. ``'maaffff'`` becomes ``'maf'``).
    """
    collapsed = []
    previous = None
    for char in word:
        # Keep a character only when it differs from the one kept last.
        if char != previous:
            collapsed.append(char)
            previous = char
    return ''.join(collapsed)


def clear_double(data,token=False):
    """Collapse repeated letters in every whitespace-separated word of ``data``.

    Returns a list of cleaned words when ``token`` is true, otherwise the cleaned
    words joined by single spaces.
    """
    cleaned = [remove_str_index(piece,token) for piece in data.split()]
    return cleaned if token else ' '.join(cleaned)

In [5]:
# Store the repeated-letter-collapsed version of each raw tweet in 'no_double'.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame['no_double'] = frame['Tweet'].apply(clear_double)

In [6]:
# Colloquial Indonesian lexicon; its 'slang' and 'formal' columns are turned into a lookup dict next.
alay = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')

In [7]:
# Build a slang -> formal lookup dict from the lexicon.
# NOTE: this rebinds `alay` from a DataFrame to a dict, so this cell cannot be re-run on its
# own; re-run the cell that loads the lexicon first.
alay = dict(zip(alay['slang'], alay['formal']))

In [8]:
def cek_alay(word, alay):
    """Return the entry ``alay`` holds for ``word``, or ``word`` itself when there is none."""
    if word in alay:
        return alay[word]
    return word


def clear_alay(data):
    """Replace each whitespace-separated word of ``data`` using the global ``alay`` lookup.

    ``data`` is converted with ``str()`` first; words without an entry are kept as-is.
    Returns the resulting words joined by single spaces.
    """
    normalized = []
    for piece in str(data).split():
        normalized.append(cek_alay(piece, alay))
    return ' '.join(normalized)

In [9]:
# Store the slang-normalised text in 'no_alay', computed from the 'no_double' column.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame['no_alay'] = frame['no_double'].apply(clear_alay)

In [10]:
def tokenizer(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize`` and return them."""
    return word_tokenize(text)

In [11]:
# CRF-based POS tagger; the model is loaded from a relative path, i.e. the file is
# looked up in the current working directory.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

In [12]:
# Tokenise the slang-normalised text; each tokenize_* Series holds one token list per tweet.
tokenize_anis = df_anis['no_alay'].apply(tokenizer)
tokenize_prabowo = df_prabowo['no_alay'].apply(tokenizer)
tokenize_ganjar = df_ganjar['no_alay'].apply(tokenizer)

In [13]:
# Tag every tokenised tweet and keep the (word, tag) pairs in the 'pos' column.
for frame, sentences in (
    (df_anis, tokenize_anis),
    (df_prabowo, tokenize_prabowo),
    (df_ganjar, tokenize_ganjar),
):
    frame['pos'] = ct.tag_sents(sentences)

# From here on the tokenize_* names refer to the tagged tokens rather than plain tokens.
tokenize_anis = df_anis['pos']
tokenize_prabowo = df_prabowo['pos']
tokenize_ganjar = df_ganjar['pos']

In [14]:
# Inspect the frame after de-duplication, slang normalisation and POS tagging.
df_anis

Unnamed: 0,Tweet,label,no_double,no_alay,pos
0,info anies presiden,Positive,info anies presiden,info anies presiden,"[(info, NN), (anies, NN), (presiden, NN)]"
1,politisi partai gerindra sandiaga uno menjawab...,Positive,politisi partai gerindra sandiaga uno menjawab...,politisi partai gerindra sandiaga uno menjawab...,"[(politisi, NN), (partai, NN), (gerindra, NN),..."
2,lanjut pak anies kita kawal sampai jadi presiden,Positive,lanjut pak anies kita kawal sampai jadi presiden,lanjut pak anies kita kawal sampai jadi presiden,"[(lanjut, VB), (pak, NN), (anies, NN), (kita, ..."
3,semoga allah swt menyelamatkan bangsa dan nega...,Positive,semoga alah swt menyelamatkan bangsa dan negar...,semoga alah swt menyelamatkan bangsa dan negar...,"[(semoga, SC), (alah, VB), (swt, NN), (menyela..."
4,chotimah kasian ya pa anies makanya sudah teka...,Positive,chotimah kasian ya pa anies makanya sudah teka...,chotimah kasihan ya apa anies makanya sudah te...,"[(chotimah, NN), (kasihan, NN), (ya, NN), (apa..."
...,...,...,...,...,...
9995,tidak ada gejolak sara selama membangun pks pu...,Negative,tidak ada gejolak sara selama membangun pks pu...,tidak ada gejolak sara selama membangun pks pu...,"[(tidak, NEG), (ada, VB), (gejolak, NN), (sara..."
9996,ubedilah mahfud md otak di balik perppu ciptak...,Negative,ubedilah mahfud md otak di balik perpu ciptake...,ubedilah mahfud md otak di balik perpu ciptake...,"[(ubedilah, NN), (mahfud, FW), (md, FW), (otak..."
9997,my presiden mranies,Negative,my presiden mranies,my presiden mranies,"[(my, FW), (presiden, FW), (mranies, FW)]"
9998,pa anies presiden,Negative,pa anies presiden,apa anies presiden,"[(apa, WH), (anies, NN), (presiden, NN)]"


In [15]:
# Manually curated words that clean_manual() removes from every tagged tweet.
# NOTE(review): clear_double() collapses repeated letters earlier in the pipeline, so
# entries containing doubled letters (e.g. 'aa', 'aah') may never match unless the
# slang normalisation re-introduces them — confirm.
double_meaning = [
    'jadi', 'menjadi', 'bapak', 'kalau', 'rakyat', 'siapa',
    'apa', 'orang', 'bakal', 'sama', 'pasang', 'jelang', 'tahun', 'hari',
    'bersama', 'mau', 'tetap', 'buat', 'for', 'bukan', 'semua',
    'terus', 'si', 'inilah', 'kan', 'tak', 'banyak', 'meski', 'lebih', 'keputusan',
    'final', 'paling', 'hasil', 'umum', 'tepat', 'tersebut', 'total', 'klik', 'capres',
    'pilih', 'pemilihan', 'terpilih', 'survei', 'survey', 'pemilu', 'terkait', 'fahnoor',
    'nan', 'calon', 'pilpres', 'resmi', 'cocok', 'politik', 'ribuan', 'ratusan', 'nama','maju',
    'hut', 'dapat', 'semoga', 'beliau', 'besar', 'makin', 'layak', 'partai', 'mendukung', 'dukung',
    'dukungan', 'gubernur', 'masyarakat', 'warga','presiden','ri','inismyname','pilpres','nan','calon','indonesia','survei','survey','pemilu',
    'aa','aah','aak','aan'
]

# Names (people, parties, places, ...) that clean_manual() also removes.
# NOTE(review): the same doubled-letter caveat applies to 'allah', 'ppp' and 'aanies'.
name = [
    'inismyname', 'indonesia', 'rosiade', 'joko', 'jokowi', 'widodo', 'ridwan', 'kamil', 'rosiade',
    'thohir', 'mujani', 'erick', 'saiful', 'chotimah', 'ahy', 'bukan', 'aniesahy', 'ahmad',
    'pks', 'pdip', 'jawa', 'puan', 'maharani', 'pan', 'jateng', 'tengah', 'megawati', 'ppp',
    'rasyid', 'gerindra', 'nasdem', 'demokrat', 'pkb', 'allah', # sorry
    'anis', 'anies', 'baswedan', 'prabowo', 'subianto', 'ganjar', 'pranowo','fahnoor', 'amien','sandiaga',
    'chotimah', 'uno','aanies'
]

# Union of both lists as a set so each token needs one hash lookup instead of two
# linear list scans. It is built when this cell runs; re-run the cell after editing
# either list.
_REMOVED_WORDS = frozenset(double_meaning).union(name)


def clean_manual(data,token=False):
    """Drop every tagged token whose word appears in ``double_meaning`` or ``name``.

    Parameters
    ----------
    data : iterable of (word, tag) pairs
        Tagged tokens of one tweet; only index 0 (the word) is inspected.
    token : bool, default False
        When true, return the surviving pairs as a list. Otherwise return the
        surviving words joined by single spaces.

    Returns
    -------
    list or str
        ``[]`` / ``''`` respectively when nothing survives.
    """
    kept = [tup for tup in data if tup[0] not in _REMOVED_WORDS]
    if token:
        return kept
    return ' '.join(tup[0] for tup in kept)

In [16]:
# Text form of the manually filtered tweets goes into the 'clean_manual' column ...
for frame, tagged in (
    (df_anis, tokenize_anis),
    (df_prabowo, tokenize_prabowo),
    (df_ganjar, tokenize_ganjar),
):
    frame['clean_manual'] = tagged.apply(clean_manual)


# ... while the tagged-token Series are replaced by their filtered versions.
tokenize_anis = tokenize_anis.apply(clean_manual,token=True)
tokenize_prabowo = tokenize_prabowo.apply(clean_manual,token=True)
tokenize_ganjar = tokenize_ganjar.apply(clean_manual,token=True)

In [17]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def stemming(data,token=False):
    """Stem the word of every tagged token in ``data`` with the Sastrawi stemmer.

    Parameters
    ----------
    data : iterable of (word, tag) pairs
        Tagged tokens of one tweet.
    token : bool, default False
        When true, return a list of ``[stemmed_word, tag]`` lists. Otherwise
        return the stemmed words joined by single spaces.

    Returns
    -------
    list or str
        For empty input this is ``[]`` when ``token`` is true (previously an empty
        string was returned in that case) and ``''`` otherwise.
    """
    stemmed = [[stemmer.stem(tup[0]), tup[1]] for tup in data]
    if token:
        return stemmed
    # Join once at the end instead of rebuilding the string for every token.
    return ' '.join(pair[0] for pair in stemmed)

In [18]:
# The 'stemmed' column is selected later when the final column order is fixed, so this
# step has to run on a fresh kernel; with it commented out that selection raises KeyError.
# NOTE(review): stemming the full corpus appears to be slow (the saved run of this cell
# was interrupted) — consider caching the result if re-runs become painful.
df_anis['stemmed'] = tokenize_anis.apply(stemming)
df_prabowo['stemmed'] = tokenize_prabowo.apply(stemming)
df_ganjar['stemmed'] = tokenize_ganjar.apply(stemming)


# Carry the stemmed tokens forward, as the other pipeline stages do with their output.
tokenize_anis = tokenize_anis.apply(stemming,token=True)
tokenize_prabowo = tokenize_prabowo.apply(stemming,token=True)
tokenize_ganjar = tokenize_ganjar.apply(stemming,token=True)

KeyboardInterrupt: 

In [None]:
# Word list exposed by Sastrawi's stop-word remover.
stopword = StopWordRemoverFactory().create_stop_word_remover().dictionary.words
# Set copy for constant-time membership tests; built when this cell runs.
_STOPWORD_SET = frozenset(stopword)

def clear_stopwords(data,token=False):
    """Drop every tagged token whose word is in the ``stopword`` list.

    Parameters
    ----------
    data : iterable of (word, tag) pairs
        Tagged tokens of one tweet; only index 0 (the word) is inspected.
    token : bool, default False
        When true, return the surviving pairs as a list. Otherwise return the
        surviving words joined by single spaces.

    Returns
    -------
    list or str
        When nothing survives this is ``[]`` if ``token`` is true (previously an
        empty string was returned in that case) and ``''`` otherwise.
    """
    kept = [tup for tup in data if tup[0] not in _STOPWORD_SET]
    if token:
        return kept
    # Join once at the end instead of rebuilding the string for every token.
    return ' '.join(tup[0] for tup in kept)

In [None]:
# Text form without stop words goes into the 'no_stopwords' column ...
for frame, tagged in (
    (df_anis, tokenize_anis),
    (df_prabowo, tokenize_prabowo),
    (df_ganjar, tokenize_ganjar),
):
    frame['no_stopwords'] = tagged.apply(clear_stopwords)


# ... while the tagged-token Series are replaced by their filtered versions.
tokenize_anis = tokenize_anis.apply(clear_stopwords,token=True)
tokenize_prabowo = tokenize_prabowo.apply(clear_stopwords,token=True)
tokenize_ganjar = tokenize_ganjar.apply(clear_stopwords,token=True)

In [None]:
def join(data):
    """Return the second element (the tag) of every pair in ``data``, joined by spaces."""
    return ' '.join([pair[1] for pair in data])

In [None]:
# Space-separated POS tags of the remaining tokens go into the 'tag' column.
for frame, tagged in (
    (df_anis, tokenize_anis),
    (df_prabowo, tokenize_prabowo),
    (df_ganjar, tokenize_ganjar),
):
    frame['tag'] = tagged.apply(join)

In [None]:
def removena(data):
    """Return ``None`` for an empty ``data`` (length 0); otherwise return ``data`` unchanged."""
    return data if len(data) > 0 else None

In [None]:
# Turn empty 'no_stopwords' values into missing values so dropna() can remove those rows.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame['no_stopwords'] = frame['no_stopwords'].apply(removena)

In [None]:
# Count missing values per column before dropping rows.
df_anis.isna().sum()

Tweet            0
label            0
no_double        0
no_alay          0
pos              0
clean_manual     0
stemmed          0
no_stopwords    90
tag              0
dtype: int64

In [None]:
# Row/column count before dropping rows with missing values.
df_anis.shape

(8913, 9)

In [None]:
# Drop rows that ended up with missing values, then renumber rows from 0 so later
# positional/column-wise concatenation lines up.
for frame in (df_anis, df_prabowo, df_ganjar):
    frame.dropna(inplace=True)
    frame.reset_index(drop=True, inplace=True)

In [None]:
# Row/column count after dropping rows with missing values.
df_anis.shape

(8823, 9)

In [None]:
# Confirm no missing values remain.
df_anis.isna().sum()

Tweet           0
label           0
no_double       0
no_alay         0
pos             0
clean_manual    0
stemmed         0
no_stopwords    0
tag             0
dtype: int64

In [None]:
# Fix the column order (pipeline stages first, 'label' last) for all three datasets.
column_order = ['Tweet','no_double','no_alay','pos','clean_manual','stemmed','no_stopwords','tag','label']
df_anis = df_anis[column_order]
df_prabowo = df_prabowo[column_order]
df_ganjar = df_ganjar[column_order]

In [None]:
# Stack the three datasets row-wise; used to fit the tag vectorizer on all of them.
concat = pd.concat([df_anis,df_prabowo,df_ganjar])

In [20]:
# Learn the POS-tag vocabulary from the combined data so all datasets share one feature space.
# NOTE(review): CountVectorizer's defaults lowercase the text and keep only tokens of two or
# more word characters, so single-character tags would be dropped — confirm this is intended.
vectorizer = CountVectorizer(dtype=int)
# fit() returns the fitted estimator, so pos_vec refers to the same object as vectorizer.
pos_vec = vectorizer.fit(concat['tag'])

In [21]:
# Persist the fitted vectorizer. The directory is created if missing and the file is
# opened in a context manager so the handle is closed (and flushed) deterministically.
os.makedirs("pickle", exist_ok=True)
with open("pickle/countVectorizer_tag.pickle", "wb") as pickle_file:
    pickle.dump(vectorizer, pickle_file)

In [None]:
# Count each POS tag per tweet and append the counts as extra columns.
# The column-wise concat aligns on the index, so it relies on df_anis having the
# 0..n-1 index produced by the earlier reset_index(drop=True).
# NOTE: re-running this cell appends the tag columns again.
temp = vectorizer.transform(df_anis['tag'])
pos_tag = pd.DataFrame(temp.toarray(), columns=vectorizer.get_feature_names_out())
df_anis = pd.concat([df_anis, pos_tag], axis=1)

In [None]:
# Count each POS tag per tweet and append the counts as extra columns.
# The column-wise concat aligns on the index, so it relies on df_prabowo having the
# 0..n-1 index produced by the earlier reset_index(drop=True).
# NOTE: re-running this cell appends the tag columns again.
temp = vectorizer.transform(df_prabowo['tag'])
pos_tag = pd.DataFrame(temp.toarray(), columns=vectorizer.get_feature_names_out())
df_prabowo = pd.concat([df_prabowo, pos_tag], axis=1)

In [None]:
# Count each POS tag per tweet and append the counts as extra columns.
# The column-wise concat aligns on the index, so it relies on df_ganjar having the
# 0..n-1 index produced by the earlier reset_index(drop=True).
# NOTE: re-running this cell appends the tag columns again.
temp = vectorizer.transform(df_ganjar['tag'])
pos_tag = pd.DataFrame(temp.toarray(), columns=vectorizer.get_feature_names_out())
df_ganjar = pd.concat([df_ganjar, pos_tag], axis=1)

In [None]:
# Rebuild the combined frame now that each dataset carries its tag-count columns.
concat = pd.concat([df_anis,df_prabowo,df_ganjar])

In [None]:
# Write the per-candidate and combined frames to ../data/clean/pos_tagging.
# The variable is named output_dir (not `dir`) so the builtin dir() is not shadowed,
# and the directory is created up front so to_csv() does not fail on a fresh checkout.
output_dir = os.path.abspath(os.path.join('..', 'data/clean/pos_tagging'))
os.makedirs(output_dir, exist_ok=True)
df_anis.to_csv(f'{output_dir}/anis.csv',index=False)
df_prabowo.to_csv(f'{output_dir}/prabowo.csv',index=False)
df_ganjar.to_csv(f'{output_dir}/ganjar.csv',index=False)
concat.to_csv(f'{output_dir}/gabungan.csv',index=False)