In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Indonesian stop words from NLTK, held in a set so that the
# `word not in stp` membership test in remove_stopwords is a constant-time
# hash lookup instead of a linear scan of a list for every token.
stp = set(stopwords.words('indonesian'))

# Sastrawi stemmer instance, created once and shared by stem_text.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Preprocessing: case folding
def lower(text):
    """Return ``text`` with every character converted to lowercase."""
    folded = text.lower()
    return folded

In [None]:
# Tweet cleaning: emoticons, mentions, URLs, retweet marker, `$` tokens,
# punctuation, digits and underscores.
# The emoticon tables are module-level constants so they are built once
# instead of on every call.
_EMOTICONS_HAPPY = frozenset([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', ':d', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
])

_EMOTICONS_SAD = frozenset([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
])

# All emoticons (happy + sad)
_EMOTICONS = _EMOTICONS_HAPPY | _EMOTICONS_SAD


def remove_punctuation(text):
    """Clean a raw tweet down to lowercase word characters and whitespace.

    Steps, in order:
      1. drop whitespace-delimited tokens that exactly match a known emoticon
         (case-sensitive, done before lowercasing),
      2. blank out @mentions, URLs and a leading ``RT`` retweet marker,
      3. blank out ``$``-prefixed tokens such as stock tickers,
      4. lowercase, then blank out punctuation, digits and underscores.

    Every removed span is replaced by a single space, so the result may
    contain runs of spaces and leading/trailing whitespace.

    Args:
        text: the raw tweet; must be a ``str``.

    Returns:
        The cleaned string.
    """
    # Splitting on whitespace also collapses newlines/tabs into single spaces.
    text = ' '.join(word for word in text.split() if word not in _EMOTICONS)

    # @mentions
    text = re.sub(r'@[\w]*', ' ', text)

    # scheme://host[/path...] style URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', text)

    # Text that still begins with http:// or https:// after the step above
    # (e.g. a malformed URL) is removed through the end of the line.
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)

    # Leading retweet marker
    text = re.sub(r'^RT[\s]+', ' ', text)

    # `$`-prefixed tokens. This has to run BEFORE punctuation stripping:
    # once `[^\w\s]+` has deleted the `$`, this pattern can never match and
    # the ticker word would be left in the text.
    text = re.sub(r'\$\w*', ' ', text)

    text = text.lower()

    # Punctuation / symbols (anything that is neither a word char nor space)
    text = re.sub(r'[^\w\s]+', ' ', text)

    # Digits
    text = re.sub(r'[0-9]+', ' ', text)

    # Underscores count as word characters, so they need their own pass.
    text = re.sub(r'_', ' ', text)

    return text

In [None]:
# Stopword Removal
def remove_stopwords(text):
    """Drop every whitespace-separated token of ``text`` that is in ``stp``.

    The surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stp:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Stemming
def stem_text(text):
    """Stem each whitespace-separated token of ``text`` with ``stemmer``.

    The stemmed tokens are re-joined with single spaces.
    """
    stemmed_words = map(stemmer.stem, text.split())
    return ' '.join(stemmed_words)

In [None]:
# For Testing
def preprocess_data(text):
    """Run the full cleaning pipeline on a single text.

    Applies remove_punctuation, remove_stopwords and stem_text, in that
    order, and returns the final string.
    """
    for step in (remove_punctuation, remove_stopwords, stem_text):
        text = step(text)
    return text

In [None]:
# Load the raw data CSV (decoded as latin-1)
tweets_df = pd.read_csv(
    '../Dataset/Data_mentah.csv',
    encoding='latin-1',
)

In [None]:
# Discard the columns that are not used by the text-cleaning steps
tweets_df.drop(columns=['Date', 'Username', 'Location'], inplace=True)

In [None]:
# Optional: store the output of remove_punctuation in its own column for inspection
tweets_df['remove_punctuation'] = tweets_df['Text'].map(remove_punctuation)
tweets_df

In [None]:
# Optional: store the stopword-removal output in its own column for inspection.
# The input column already holds the output of remove_punctuation, so that
# step is not applied a second time: a second pass is not a no-op (e.g. a
# token lowercased to "xp" by the first pass would then be deleted as an
# emoticon) and would disagree with the single-pass `Text_bersih` pipeline.
tweets_df['remove_stopwords'] = tweets_df['remove_punctuation'].map(remove_stopwords)
tweets_df

In [None]:
# Full cleaning pipeline on the raw text:
# remove_punctuation -> remove_stopwords -> stem_text (via preprocess_data)
tweets_df['Text_bersih'] = tweets_df['Text'].map(preprocess_data)
tweets_df

In [None]:
# Drop the raw and intermediate text columns by name, keeping `Text_bersih`.
# The previous positional drop (columns 0, 1, 2) only hit the intended
# columns when both "Optional" preview cells had been run; with them skipped
# it either raised or removed `Text_bersih`. errors='ignore' makes this cell
# work whether or not the optional columns exist.
# NOTE(review): assumes `Text` is the only column left from the CSV after the
# Date/Username/Location drop -- confirm against the dataset.
tweets_df.drop(
    columns=['Text', 'remove_punctuation', 'remove_stopwords'],
    errors='ignore',
    inplace=True,
)

In [None]:
# Keep only the first row for each distinct cleaned text
tweets_df.drop_duplicates(
    subset="Text_bersih",
    keep='first',
    inplace=True,
)
tweets_df

In [None]:
# Write the cleaned data to CSV (header row included, index omitted)
tweets_df.to_csv(
    '../Dataset/Bersih.csv',
    header=True,
    index=False,
)