In [27]:
import os
import re
import csv
import nltk
import enchant
import preprocessor
from collections import defaultdict
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [28]:
# Spell-checking resources shared by the preprocessing helpers in later cells.
en_dict = enchant.Dict("en_US")  # English (US) dictionary from pyenchant

# Indonesian word list, read one entry per line. Each line is stripped so that
# membership tests compare bare words instead of "word\n" (which never matches
# a whitespace-split token); blank lines are skipped.
id_words = []
with open('dictionary/wordlist-id.txt', 'r') as file:
    for line in file:
        entry = line.strip()
        if entry:
            id_words.append(entry)

In [29]:
def casefolding(review):
    """Return ``review`` converted to lowercase."""
    return review.lower()
  
def filtering(review):
    """Remove tweet noise from ``review`` and return the cleaned text.

    The substitutions run in this order:

    1. ``@mentions`` and ``#hashtags`` are deleted.
    2. ``http:``/``https:`` links are deleted (any letter case).
    3. Listed punctuation characters and digits are replaced by a space.
    4. Non-ASCII characters are deleted.
    5. Runs of whitespace are collapsed to a single space.

    The result is finally passed through ``preprocessor.clean``.

    Args:
        review: Raw tweet text.

    Returns:
        Whatever ``preprocessor.clean`` returns for the filtered text.
    """
    review = re.sub(r'@[^\s]+', '', review)  # @username
    review = re.sub(r'#([^\s]+)', '', review)  # hashtag
    # Links must go before the symbol pass below, otherwise ":" "/" "." are
    # turned into spaces and the URL is left behind as word fragments.
    # Match plain http as well as https, in any case: this step sees the text
    # before it has been lowercased.
    review = re.sub(r'https?:[^\s]+', '', review, flags=re.IGNORECASE)  # URL links
    review = re.sub(r"[.,:;+!\-_<^/=?\"'\(\)\d\*]", " ", review)  # symbol, char
    review = re.sub(r'[^\x00-\x7f]+', '', review)  # non ASCII chars
    review = re.sub(r'\s+', ' ', review)  # duplicate whitespace
    return preprocessor.clean(review)
  
def tokenizing(review):
    """Return the result of ``nltk.word_tokenize`` applied to ``review``."""
    return nltk.word_tokenize(review)
  
# Stemmer instance shared by every call to stemming(); created on first use.
_STEMMER = None


def stemming(review):
    """Stem ``review`` with the Sastrawi stemmer and return the result.

    The stemmer is built through ``StemmerFactory`` the first time this
    function is called and reused afterwards, so the factory work is done
    once rather than once per tweet.

    Args:
        review: Text to stem.

    Returns:
        The value returned by the stemmer's ``stem`` method.
    """
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = StemmerFactory().create_stemmer()
    return _STEMMER.stem(review)
  
# Stop-word remover shared by every call to stop_word_removing(); created on first use.
_STOP_WORD_REMOVER = None


def stop_word_removing(review):
    """Remove stop words from ``review`` using the Sastrawi stop-word remover.

    The remover is built through ``StopWordRemoverFactory`` the first time
    this function is called and reused afterwards, so the factory work is
    done once rather than once per tweet.

    Args:
        review: Text to filter.

    Returns:
        The value returned by the remover's ``remove`` method.
    """
    global _STOP_WORD_REMOVER
    if _STOP_WORD_REMOVER is None:
        _STOP_WORD_REMOVER = StopWordRemoverFactory().create_stop_word_remover()
    return _STOP_WORD_REMOVER.remove(review)
  
# Parsed slang dictionary, loaded from disk on first use and then reused.
_SLANGWORDS = None


def _load_slangwords(path):
    """Parse a ``slang=replacement`` file into a dict, skipping malformed lines."""
    slangwords = {}
    with open(path) as file:
        for line in file:
            parts = line.split('=')
            if len(parts) < 2:
                # Blank line or line without "=": nothing to map.
                continue
            slang = parts[0].strip()
            if slang:
                slangwords[slang] = parts[1].strip()
    return slangwords


def slang_word_converting(review):
    """Replace slang words in ``review`` with their dictionary replacement.

    The mapping is read from ``dictionary/slangword-id.txt`` (one
    ``slang=replacement`` pair per line) on the first call and kept in memory
    for later calls. Keys and values are stripped of surrounding whitespace;
    lines without ``=`` are ignored instead of raising ``IndexError``.

    Args:
        review: Text whose whitespace-separated words are looked up.

    Returns:
        The words of ``review`` joined by single spaces, with every word that
        appears as a key in the slang dictionary replaced by its value.
    """
    global _SLANGWORDS
    if _SLANGWORDS is None:
        _SLANGWORDS = _load_slangwords('dictionary/slangword-id.txt')

    return ' '.join(_SLANGWORDS.get(word, word) for word in review.split())
  
def character_repeating(review):
    """Collapse runs of a repeated character in words that are not dictionary words.

    Each whitespace-separated word of ``review`` is kept unchanged when
    ``en_dict.check`` accepts it or when it is found in ``id_words``.
    Otherwise every run of two or more identical characters in the word is
    reduced to a single character.

    Returns:
        The resulting words, each followed by a single space (so a non-empty
        result ends with a trailing space); ``''`` when ``review`` has no words.
    """
    repeated_run = re.compile(r"(.)\1{1,}", re.DOTALL)

    def normalise(word):
        # The English check runs first; the word list is consulted only if it fails.
        if en_dict.check(word) or word in id_words:
            return word
        return repeated_run.sub(r"\1", word)

    # str.split() never yields empty strings, so every token can be checked directly.
    return ''.join(normalise(word) + ' ' for word in review.split())

In [30]:
def preprocess_tweet(filename):
    """Run the preprocessing pipeline over one crawled tweet CSV file.

    The ``text`` column of ``filename`` is read, each tweet is passed through
    ``filtering`` -> ``casefolding`` -> ``character_repeating`` ->
    ``stemming`` -> ``stop_word_removing`` -> ``slang_word_converting``, and
    every non-empty result is written to
    ``dataset/processed/processed-tweets-@<username>.csv`` with the header
    ``username,tweet``.

    Args:
        filename: Path to the input CSV. The username is the part of the file
            name after the last ``@``, without the ``.csv`` extension.

    Returns:
        None. A completion message is printed.
    """
    # Only the "text" column is used, so the other columns are not buffered.
    # Rows that have no value for it (short rows, or no such column) yield
    # None and are skipped rather than being processed as the string "None".
    with open(filename, 'r', encoding='utf-8', newline='') as file:
        tweets = [row.get('text') for row in csv.DictReader(file)]

    # Derive the username from the file name only, so an "@" in a directory
    # name cannot leak path components into the output file name.
    username = os.path.basename(filename).split("@")[-1]
    if username.endswith(".csv"):
        username = username[:-len(".csv")]

    pipeline = (
        filtering,
        casefolding,
        character_repeating,
        stemming,
        stop_word_removing,
        slang_word_converting,
    )

    output_file = "dataset/processed/processed-tweets-@" + username + ".csv"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as processed_file:
        # csv.writer quotes fields containing delimiters or quotes; "\n" keeps
        # the same line ending the hand-written rows used.
        writer = csv.writer(processed_file, lineterminator='\n')
        writer.writerow(['username', 'tweet'])
        for tweet in tweets:
            if tweet is None:
                continue
            for step in pipeline:
                tweet = step(str(tweet))
            if tweet != '':
                writer.writerow([username, tweet])

    print(f"Preprocessing @{username} tweets completed.")

In [31]:
dataset_path = "dataset/crawled"
MAX_FILES = 5  # how many crawled CSV files to preprocess in this run

# os.listdir() returns entries in arbitrary order, so sort (case-insensitively)
# to make the selection reproducible, and drop non-CSV entries *before*
# applying the limit so they do not use up any of the MAX_FILES slots.
dataset_files = sorted(os.listdir(dataset_path), key=str.lower)
csv_files = [name for name in dataset_files if name.endswith(".csv")]
for name in csv_files[:MAX_FILES]:
    preprocess_tweet(os.path.join(dataset_path, name))

Preprocessing @9ita7unn tweets completed.
Preprocessing @a2lir tweets completed.
Preprocessing @abcdenjiii tweets completed.
Preprocessing @achadianrani tweets completed.
Preprocessing @Adamumemo tweets completed.
