In [None]:
# Import Library

import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import warnings
from tqdm.auto import tqdm
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load Dataset
# Read the prepared tweets produced by the data-preparation step.

path = '../dataset/data_modelling/data_preparation.csv'
dataset = pd.read_csv(path)  # default index_col=None: a fresh RangeIndex is used
dataset.head()

# Casefolding

In [None]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_unnecessary_char(text):
    """Remove newlines, URLs and the ``rt`` / ``user`` tokens from ``text``.

    ``rt`` and ``user`` are removed only when they appear as standalone
    tokens. Matching them as bare substrings also damages ordinary words
    (``party`` -> ``pa y``, ``superuser`` -> ``super ``).

    Parameters
    ----------
    text : str
        Text to clean; the token patterns are lower case, so the input is
        expected to be lower-cased already.

    Returns
    -------
    str
        Cleaned text with runs of spaces collapsed to one space. Leading and
        trailing spaces are not trimmed.
    """
    text = re.sub(r'\n', ' ', text)
    # URLs are removed first so that the token patterns below cannot cut a
    # URL apart before it is matched as a whole.
    text = re.sub(r'(www\.\S+)|(https?://\S+)', ' ', text)
    text = re.sub(r'\brt\b', ' ', text)
    text = re.sub(r'\buser\b', ' ', text)
    text = re.sub(r'  +', ' ', text)
    return text
    
def remove_nonaplhanumeric(text):
    """Replace every run of characters outside 0-9, a-z, A-Z with one space."""
    return re.sub('[^0-9a-zA-Z]+', ' ', text)

def casefold(text):
    """Lower-case ``text`` and strip URLs, placeholder tokens and punctuation.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = lowercase(text)
    # URL cleanup has to run while the punctuation is still present: once
    # '://' and '.' have been replaced by spaces, the URL pattern in
    # remove_unnecessary_char can no longer match anything.
    text = remove_unnecessary_char(text)
    text = remove_nonaplhanumeric(text)
    return text

# tqdm.pandas() registers `progress_apply`, so the bar advances while the
# tweets are actually being processed. Wrapping the finished result in
# tqdm(...) only iterated over values that were already computed.
tqdm.pandas(desc='Casefolding')
dataset['Casefolding'] = dataset['Tweet'].progress_apply(casefold)
dataset.head()

In [None]:
# Persist the casefolded data (row index is not written).
dataset.to_csv(path_or_buf="../dataset/data_modelling/data_casefolding.csv", index=False)

In [None]:
# Print ten casefolded tweets (slice 100:110), numbered from 1.
for review_no, text in enumerate(dataset['Casefolding'][100:110], start=1):
    print('Review %d:\n' % review_no, text)

# Stopwords

In [None]:
# Slang ("alay") dictionary: column 0 is the slang form, column 1 its replacement.
alay_dict = (
    pd.read_csv('../dataset/stopwords/kamusalay.csv', encoding='latin-1', header=None)
    .rename(columns={0: 'original', 1: 'replacement'})
)
# Stopword list: a header-less file whose first column holds the words.
id_stopword = (
    pd.read_csv('../dataset/stopwords/idstopwords.csv', header=None)
    .rename(columns={0: 'stopword'})
)

In [None]:
# Build the slang -> replacement lookup once. It was previously rebuilt from
# the DataFrame on every call, i.e. once per tweet.
ALAY_MAP = dict(zip(alay_dict['original'], alay_dict['replacement']))

def normalize_alay(text):
    """Replace each space-separated word found in the slang dictionary.

    Parameters
    ----------
    text : str
        Text whose words are separated by single spaces.

    Returns
    -------
    str
        ``text`` with every word that is a key of ``ALAY_MAP`` swapped for its
        replacement; all other words are kept unchanged.
    """
    return ' '.join(ALAY_MAP.get(word, word) for word in text.split(' '))

# Build the stopword lookup once as a set. The previous `word in <ndarray>`
# test scanned the whole stopword array for every single token.
ID_STOPWORD_SET = set(id_stopword['stopword'])

def remove_stopword(text):
    """Drop every space-separated word that is in the stopword list.

    Parameters
    ----------
    text : str
        Text whose words are separated by spaces.

    Returns
    -------
    str
        The remaining words joined by single spaces, with leading and
        trailing whitespace stripped.
    """
    kept = [word for word in text.split(' ') if word not in ID_STOPWORD_SET]
    # Empty tokens from repeated spaces survive the filter; collapse the
    # resulting space runs exactly as before.
    text = re.sub('  +', ' ', ' '.join(kept))
    return text.strip()

def stopword(text):
    """Normalize slang words in ``text``, then remove stopwords from the result."""
    return remove_stopword(normalize_alay(text))

# tqdm.pandas() registers `progress_apply`, so the bar advances while the
# rows are actually being processed. Wrapping the finished result in
# tqdm(...) only iterated over values that were already computed.
tqdm.pandas(desc='Stopwords')
dataset['Stopwords'] = dataset['Casefolding'].progress_apply(stopword)
dataset.head()

In [None]:
# Persist the data after stopword removal (row index is not written).
dataset.to_csv(path_or_buf="../dataset/data_modelling/data_stopwords.csv", index=False)

In [None]:
# Print ten stopword-filtered tweets (slice 100:110), numbered from 1.
for review_no, text in enumerate(dataset['Stopwords'][100:110], start=1):
    print('Review %d:\n' % review_no, text)

In [None]:
# Character Length
# Length of the string form of each preprocessed tweet.

dataset['Char_Length_Prep'] = dataset['Stopwords'].map(lambda cleaned: len(str(cleaned)))
dataset.head()

In [None]:
# Summary statistics of the dataset after Char_Length_Prep was added.
dataset.describe()

In [None]:
# min char length prep
# Rows whose preprocessed text has length 0.

data_min_char_length_prep = dataset.loc[dataset['Char_Length_Prep'] == 0]
data_min_char_length_prep

In [None]:
# Remove the rows selected in data_min_char_length_prep (preprocessed length 0).
# errors='ignore' keeps this cell re-runnable: on a second run the labels are
# already gone and a plain drop would raise KeyError.
index = data_min_char_length_prep.index
dataset = dataset.drop(labels=index, errors='ignore')

In [None]:
# Summary statistics again, to compare against the earlier describe() output.
dataset.describe()

In [None]:
# Inspect rows whose preprocessed text is exactly two characters long.
dataset.loc[dataset['Char_Length_Prep'] == 2]

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
# Display the DataFrame in its current state.
dataset

In [None]:
# save dataset
# Write the preprocessed data to CSV (row index is not written).

dataset.to_csv(path_or_buf="../dataset/data_modelling/data_preprocessing.csv", index=False)