# All the basic preprocessing in one place

#### Let's apply all the preprocessing methods we have discussed so far on our Zomato dataset and see how everything works together

Mahesa Yuztar (220535601516/TI-B 2022)@author: Aman Kedia

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [6]:
with open('testing_nlp.txt', 'r') as file:
    lines = file.readlines()

corpus = pd.DataFrame(lines, columns=['Text'])
print(corpus)

                                                  Text
0                   Karanganipun: Suwara (Sandinama)\n
1                                                   \n
2    Kaêcap ing pangêcapanipun Tuwan H.A. Benyamin,...
3                                                   \n
4                                     ing taun 1913.\n
..                                                 ...
305  [... NAWI] dados biyung sagêd anuntun anakipun...
306                                                 \n
307                Ing Batawi, kaping: 27 Juli 1912.\n
308                                                 \n
309                                            Suwara.

[310 rows x 1 columns]


In [7]:
corpus

Unnamed: 0,Text
0,Karanganipun: Suwara (Sandinama)\n
1,\n
2,"Kaêcap ing pangêcapanipun Tuwan H.A. Benyamin,..."
3,\n
4,ing taun 1913.\n
...,...
305,[... NAWI] dados biyung sagêd anuntun anakipun...
306,\n
307,"Ing Batawi, kaping: 27 Juli 1912.\n"
308,\n


### Text Cleaning (Removal of special characters/punctuations & case folding)

In [13]:
def text_clean(corpus, keep_list):
    '''
    Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)

    Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained
            even after the cleaning process

    Output : Returns the cleaned text corpus
    '''
    cleaned_list = []  # List untuk menyimpan hasil sementara

    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                # Menghapus semua karakter selain alfabet dan angka
                p1 = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word)
                p1 = p1.lower()
                qs.append(p1)
            else:
                qs.append(word)
        cleaned_list.append(' '.join(qs))  # Menyimpan hasil sebagai string gabungan

    cleaned_corpus = pd.Series(cleaned_list)  # Konversi daftar ke Series pada akhirnya
    return cleaned_corpus

In [14]:
text_clean(corpus['Text'], [])

Unnamed: 0,0
0,karanganipun suwara sandinama
1,
2,ka cap ing pang capanipun tuwan h a benyamin ...
3,
4,ing taun 1913
...,...
305,nawi dados biyung sag d anuntun anakipun...
306,
307,ing batawi kaping 27 juli 1912
308,


### Stopwords Removal

In [9]:
def stopwords_removal(corpus):
    wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
    stop = set(stopwords.words('english'))
    for word in wh_words:
        stop.remove(word)
    corpus = [[x for x in x.split() if x not in stop] for x in corpus]
    return corpus

### Lemmatization

In [10]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [15]:
lemmatize(corpus['Text'])

[['K',
  'a',
  'r',
  'a',
  'n',
  'g',
  'a',
  'n',
  'i',
  'p',
  'u',
  'n',
  ':',
  ' ',
  'S',
  'u',
  'w',
  'a',
  'r',
  'a',
  ' ',
  '(',
  'S',
  'a',
  'n',
  'd',
  'i',
  'n',
  'a',
  'm',
  'a',
  ')',
  '\n'],
 ['\n'],
 ['K',
  'a',
  'ê',
  'c',
  'a',
  'p',
  ' ',
  'i',
  'n',
  'g',
  ' ',
  'p',
  'a',
  'n',
  'g',
  'ê',
  'c',
  'a',
  'p',
  'a',
  'n',
  'i',
  'p',
  'u',
  'n',
  ' ',
  'T',
  'u',
  'w',
  'a',
  'n',
  ' ',
  'H',
  '.',
  'A',
  '.',
  ' ',
  'B',
  'e',
  'n',
  'y',
  'a',
  'm',
  'i',
  'n',
  ',',
  ' ',
  'i',
  'n',
  'g',
  ' ',
  'n',
  'a',
  'g',
  'a',
  'r',
  'i',
  ' ',
  'S',
  'ê',
  'm',
  'a',
  'r',
  'a',
  'n',
  'g',
  ',',
  '\n'],
 ['\n'],
 ['i', 'n', 'g', ' ', 't', 'a', 'u', 'n', ' ', '1', '9', '1', '3', '.', '\n'],
 ['\n'],
 ['-', '-', '-', ' ', '[', '0', ']', ' ', '-', '-', '-', '\n'],
 ['\n'],
 ['S',
  'e',
  'r',
  'i',
  'e',
  ' ',
  'u',
  'i',
  't',
  'g',
  'a',
  'v',
  'e',
  'n',
  ' ',
  'd'

### Stemming

In [18]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [19]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Text corpus on which pre-processing tasks will be performed
    'keep_list' - List of words to be retained during cleaning process
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should
                                                                  be performed or not
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed text corpus

    '''

    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    if remove_stopwords == True:
        corpus = stopwords_removal(corpus)
    else :
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        corpus = lemmatize(corpus)


    if stemming == True:
        corpus = stem(corpus, stem_type)

    corpus = [' '.join(x) for x in corpus]

    return corpus

In [20]:
common_dot_words = []

In [26]:
# Preprocessing with Lemmatization here
corpus_with_lemmatization = preprocess(corpus['Text'], keep_list = common_dot_words, stemming = False, stem_type = None, lemmatization = True, remove_stopwords = True)

In [27]:
# Preprocessing with Stemming here here
corpus_with_stemming = preprocess(corpus['Text'], keep_list = common_dot_words, stemming = True, stem_type = "snowball", lemmatization = False, remove_stopwords = True)

# Mencoba preprocessing

In [28]:
corpus_with_lemmatization

['karanganipun suwara sandinama',
 '',
 'ka cap ing pang capanipun tuwan h benyamin ing nagari marang',
 '',
 'ing taun 1913',
 '',
 '0',
 '',
 'serie uitgaven door bemiddeling der commissie voor de volkslectuur 127',
 '',
 'bab alakirabi wayuh kalihan bot n',
 'karanganipun suwara sandiasma',
 '',
 'ka cap ing pang capanipun tuwan h benyamin ing nagari marang',
 '',
 'ing taun 1913',
 '',
 '0',
 '',
 'monogamie en polygamie',
 '',
 'door',
 '',
 'e r',
 '',
 'semarang drukkerij en boekhandel',
 '',
 'h benjamins semarang',
 '',
 '1913',
 '',
 '5',
 '',
 'bab alakirabi wayuh kalihan bot n',
 '',
 'adat tatacara ingkang kawastanan prayogi saking pamanggih kula ingkang tumusipun adam l sakeca tuwin mulya dhat ng ingkang gadhah adat lan tatacara wau manut ingkang katingal ing kathah b ngsa kulit p thak adat tuwin caranipun sae pathokanipun angug mi dhawuhing allah kados ta manusa k dah n dha kring tipun piyambak sint n ingkang bot n purun kumlawe inggih badhe kaluw n sanak sadh r k bot n 

In [29]:
corpus_with_stemming

['karanganipun suwara sandinama',
 '',
 'ka cap ing pang capanipun tuwan h benyamin ing nagari marang',
 '',
 'ing taun 1913',
 '',
 '0',
 '',
 'seri uitgaven door bemiddel der commissi voor de volkslectuur 127',
 '',
 'bab alakirabi wayuh kalihan bot n',
 'karanganipun suwara sandiasma',
 '',
 'ka cap ing pang capanipun tuwan h benyamin ing nagari marang',
 '',
 'ing taun 1913',
 '',
 '0',
 '',
 'monogami en polygami',
 '',
 'door',
 '',
 'e r',
 '',
 'semarang drukkerij en boekhandel',
 '',
 'h benjamin semarang',
 '',
 '1913',
 '',
 '5',
 '',
 'bab alakirabi wayuh kalihan bot n',
 '',
 'adat tatacara ingkang kawastanan prayogi sake pamanggih kula ingkang tumusipun adam l sakeca tuwin mulya dhat ng ingkang gadhah adat lan tatacara wau manut ingkang kating ing kathah b ngsa kulit p thak adat tuwin caranipun sae pathokanipun angug mi dhawuh allah kado ta manusa k dah n dha kring tipun piyambak sint n ingkang bot n purun kumlaw inggih badh kaluw n sanak sadh r k bot n badh purun nguluri