# Preprocessing Text PTA

## Instalasi

In [None]:
# Install the third-party packages imported below via IPython shell escapes.
# The command output is assigned to `_` (presumably to keep the cell output quiet).
_= !pip install nltk
_= !pip install indoNLP

In [None]:
import pandas as pd #pandas
import numpy as np #numpy
import re #regex
import string #string population
from nltk.tokenize import word_tokenize #tokenize
from nltk.corpus import stopwords #stopword
from indoNLP.preprocessing import replace_slang #slank word
from nltk.stem.porter import PorterStemmer #stemming

## Read Dataset

In [None]:
# Load the labelled PTA dataset and lower-case the two column names
# ("Abstrak", "Label") that the rest of the notebook refers to.
df = pd.read_csv(
    '/content/drive/Othercomputers/My Laptop/Materi Kuliah/Semester 6/PSD/Crawling Scraping/ptaLabel.csv'
).rename(columns={"Abstrak": "abstrak", "Label": "label"})

In [None]:
# Display the loaded DataFrame for a quick visual check.
df

Unnamed: 0,NPM,Judul,abstrak,Prodi,label
0,40411100468,PERANCANGAN DAN IMPLEMENTASI SISTEM DATABASE T...,Sistem informasi akademik (SIAKAD) merupaka...,Teknik Informatika,rpl
1,40411100476,APLIKASI KONTROL DAN MONITORING JARINGAN KOMPU...,Berjalannya koneksi jaringan komputer dengan l...,Teknik Informatika,rpl
2,40411100480,RANCANG BANGUN APLIKASI PROXY SERVER UNTUKENKR...,Web server adalah sebuah perangkat lunak serve...,Teknik Informatika,rpl
3,70411100070,SISTEM PENDUKUNG KEPUTUSAN OPTIMASI PENJADWALA...,Penjadwalan kuliah di Perguruan Tinggi me...,Teknik Informatika,
4,80411100115,SISTEM AUGMENTED REALITY ANIMASI BENDA BERGERA...,Seiring perkembangan teknologi yang ada diduni...,Teknik Informatika,komputasi
...,...,...,...,...,...
850,160411100032,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,Investasi saham selama ini memiliki resiko ker...,Teknik Informatika,komputasi
851,160411100182,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,Information Retrieval (IR) merupakan pengambil...,Teknik Informatika,pba
852,160411100077,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,Klasifikasi citra merupakan proses pengelompok...,Teknik Informatika,komputasi
853,160411100084,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,Identifikasi atribut pejalan kaki merupakan sa...,Teknik Informatika,komputasi


In [None]:
# Count rows per raw label value; this exposes inconsistent casing and misspellings.
df['label'].value_counts()

komputasi      344
RPL            168
Komputasi       52
PBA             26
KOMPUTASI        2
KOmputasi        1
komputasi        1
komputai         1
klasifikasi      1
Name: label, dtype: int64

## Change Label

In [None]:
def change_class(before, after):
  """Rewrite one label value in the module-level ``df`` in place.

  Every row whose 'label' column equals ``before`` gets ``after`` instead.
  Nothing is returned; rows with other labels are left untouched.
  """
  matches = df['label'].eq(before)
  df.loc[matches, 'label'] = after

In [None]:
# Map every inconsistent spelling seen in the label counts onto its canonical,
# lower-case class name. Insertion order is kept, so the calls run in the
# same sequence as listing them one by one.
label_fixes = {
  'RPL': 'rpl',
  'Komputasi': 'komputasi',
  'PBA': 'pba',
  'KOMPUTASI': 'komputasi',
  'KOmputasi': 'komputasi',
  'komputasi ': 'komputasi',
  'komputai': 'komputasi',
  'klasifikasi': 'komputasi',
}
for raw_label, canonical_label in label_fixes.items():
  change_class(raw_label, canonical_label)

In [None]:
# Re-count the labels to confirm only the canonical class names remain.
df['label'].value_counts()

komputasi    402
rpl          168
pba           26
Name: label, dtype: int64

## Remove Rows with NaN Values

In [None]:
# Count missing values per column before dropping incomplete rows.
df.isna().sum()

NPM          0
Judul        6
abstrak     26
Prodi        0
label      259
dtype: int64

In [None]:
# Keep only the text and its class, discard rows missing either one,
# and renumber the index from zero.
df = (
    df.loc[:, ['abstrak', 'label']]
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [None]:
# Verify that no missing values are left in the two remaining columns.
df.isna().sum()

abstrak    0
label      0
dtype: int64

In [None]:
import nltk
# Resources needed later by word_tokenize and stopwords.words.
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load its Punkt data from 'punkt_tab'
# rather than the pickled 'punkt' package; fetch both so tokenizing works
# whichever NLTK version pip installed.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Pre-Processing

In [None]:
class Prepocessing:
  """Text-preprocessing steps used on the abstracts.

  Each method is one stage. The notebook applies them in the order
  remove_unwanted -> tokenize -> stopWord -> slank_word -> stemming.
  """

  # Patterns and the translation table are built once when the class is
  # defined instead of on every call.
  _EMOJI_RE = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)
  _MENTION_RE = re.compile(r"@[A-Za-z0-9_]+")
  _URL_RE = re.compile(r"http\S+")
  _HASHTAG_RE = re.compile(r"#[A-Za-z0-9_]+")
  # Anything that is neither an ASCII letter/digit nor whitespace.
  _NON_ALNUM_RE = re.compile(r"[^0-9A-Za-z\s]")
  _WHITESPACE_RE = re.compile(r"\s+")
  _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

  def __init__(self):
    # Indonesian stopword list from NLTK, stored as a set for fast membership tests.
    self.listStopword =  set(stopwords.words('indonesian'))
    # NOTE(review): PorterStemmer is an English suffix-stripping stemmer, while
    # the stopword list above is Indonesian. An Indonesian stemmer is probably
    # what is wanted here - confirm before relying on the 'stem' output.
    self.stemmer = PorterStemmer()

  def remove_emoji(self, string): #remove emoji
    """Return `string` with each run of emoji/pictograph characters replaced by one space."""
    return self._EMOJI_RE.sub(r' ', string)

  def remove_unwanted(self, document): #clean text
    """Return `document` reduced to ASCII letters, digits and single spaces.

    Mentions, URLs, hashtags, emoji and punctuation are removed. Any run of
    whitespace (spaces, tabs, newlines) becomes exactly one space, and the
    result has no leading or trailing whitespace.
    """
    # remove user mentions
    document = self._MENTION_RE.sub(" ", document)
    # remove URLs
    document = self._URL_RE.sub(" ", document)
    # remove hashtags
    document = self._HASHTAG_RE.sub("", document)
    # remove emoji
    document = self.remove_emoji(document)
    # remove punctuation but keep whitespace, so words separated only by a
    # newline or tab are not glued together
    document = self._NON_ALNUM_RE.sub("", document)
    # collapse every whitespace run; a single replace('  ', ' ') pass would
    # leave double spaces behind for runs of three or more
    document = self._WHITESPACE_RE.sub(" ", document)
    return document.strip()

  def tokenize(self, text): #tokenize -> split the text into word tokens
    """Strip ASCII punctuation, lower-case `text`, and return NLTK word tokens."""
    return word_tokenize(text.translate(self._PUNCT_TABLE).lower())

  def stopWord(self, text): #stopword -> drop stopwords
    """Return the tokens of `text` that are not in the stopword set."""
    return [kata for kata in text if kata not in self.listStopword]

  def slank_word(self, text): #slang word -> replace non-standard words
    """Return the tokens of `text`, each passed through indoNLP's `replace_slang`."""
    return [replace_slang(kata) for kata in text]

  def stemming(self, text): #stemming -> reduce words to their stem
    """Stem every token of `text` and return them joined by single spaces."""
    return " ".join([self.stemmer.stem(kata) for kata in text])

In [None]:
# One shared instance; constructing it loads the stopword list and creates the stemmer.
preprocessing = Prepocessing()

## Clean Text

In [None]:
# Clean every abstract (mentions, URLs, hashtags, emoji, punctuation removed).
df['clean'] = df['abstrak'].apply(preprocessing.remove_unwanted)

## Tokenize

In [None]:
# Turn each cleaned abstract into a list of lower-case word tokens.
df['tokenize'] = df['clean'].apply(preprocessing.tokenize)

## Stopword

In [None]:
# Drop the tokens that appear in the stopword list.
df['stopword'] = df['tokenize'].apply(preprocessing.stopWord)

## Slang Word

In [None]:
# Pass every remaining token through the slang replacer.
df['slankword'] = df['stopword'].apply(preprocessing.slank_word)

## Stemming

In [None]:
# Stem each token and join the result back into one space-separated string.
df['stem'] = df['slankword'].apply(preprocessing.stemming)

In [None]:
# Show the original text next to the output of every preprocessing stage.
df[['abstrak', 'clean', 'tokenize', 'stopword', 'slankword', 'stem', 'label']]

In [None]:
# Keep only the fully processed text and its class for export.
df_final = df.loc[:, ['stem', 'label']]

In [None]:
# Write the final dataset to the working directory, without the index column.
df_final.to_csv('pta_final.csv', index=False)