In [1]:
import re

# How to split a string separated by a regex?

In [2]:
# Sample course table: number, code and name separated by runs of spaces,
# one course per line.
text = """101 COM   Computers
205 MAT   Mathematics
189 ENG   English"""

# Raw string so that \s reaches the regex engine untouched; in a regular
# string literal "\s" is an invalid escape sequence (SyntaxWarning on
# Python 3.12+). \s+ matches any run of whitespace, newlines included, so
# the whole table is flattened into one list of fields.
# Note: `text` is rebound from a str to a list here.
text = re.split(r'\s+', text)
print(text)

['101', 'COM', 'Computers', '205', 'MAT', 'Mathematics', '189', 'ENG', 'English']


In [3]:
# Bare last expression: the notebook displays the repr of `text`.
text

['101',
 'COM',
 'Computers',
 '205',
 'MAT',
 'Mathematics',
 '189',
 'ENG',
 'English']

In [4]:
# print() writes the plain str() form of `text` to stdout instead of using the cell's display output.
print(text)

['101', 'COM', 'Computers', '205', 'MAT', 'Mathematics', '189', 'ENG', 'English']


# Finding pattern matches using findall, search and match

**re.findall()**

In [5]:
# Course table used for the matching examples.
text = """101 COM    Computers
205 MAT   Mathematics
189 ENG   English"""

# Raw string: "\d" in a regular literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). Compiling once yields a reusable
# pattern object.
regex_num = re.compile(r'\d+')
# findall returns every non-overlapping run of digits as a list of strings.
regex_num.findall(text)

['101', '205', '189']

In [6]:
# Same extraction without an explicit compile step: the module-level
# re.findall takes the pattern and the subject string directly.
regex_num1 = re.findall(pattern='[0-9]+', string=text)
regex_num1

['101', '205', '189']

**re.search()**

In [7]:
# Subject string whose first run of digits sits at the very start.
text2 = """205 COM    Computers MAT   Mathematics 189"""

# Raw string keeps "\d" from being read as an (invalid) string escape.
regex_num = re.compile(r'\d+')
# search() returns a match object for the first match only (None when
# nothing matches).
s = regex_num.search(text2)

# start()/end() delimit the half-open slice text2[start:end] that matched.
print('Starting Position: ', s.start())
print('Ending Position: ', s.end())

Starting Position:  0
Ending Position:  3


# How to substitute one text with another using regex?

In [8]:
# Course table with a tab character between the code and the name.
text = """101   COM \t  Computers
205   MAT \t  Mathematics
189   ENG  \t  English"""
# Show the input, then a separator line.
print(text, "=========================", sep="\n")
# Swap every tab for an underscore and show the result.
text = re.compile('\t').sub('_', text)
print(text)

101   COM 	  Computers
205   MAT 	  Mathematics
189   ENG  	  English
101   COM _  Computers
205   MAT _  Mathematics
189   ENG  _  English


In [9]:
text = "moch ari n"
print(text)
print('=========================')

# Raw string so "\s" is handed to the regex engine as-is; in a regular
# literal it is an invalid escape sequence (SyntaxWarning on Python 3.12+).
# Every run of whitespace is replaced by a double underscore.
print(re.sub(r'\s+', '__', text))

moch ari n
moch__ari__n


# Regex groups

In [10]:
# Course table for the extraction examples: each line holds a number,
# a three-letter code and a name, separated by runs of spaces.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

**Extract all course numbers**

In [11]:
# Course table: number, code, name.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# Every run of ASCII digits is a course number.
print(re.compile('[0-9]+').findall(text))

['101', '205', '189']


**Extract all course codes**

In [12]:
# Course table: number, code, name.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# A course code is exactly three consecutive upper-case ASCII letters.
print(re.findall(pattern='[A-Z]{3}', string=text))

['COM', 'MAT', 'ENG']


**Extract all course names**

In [13]:
# Course table: number, code, name.
text = """101   COM   Computers
205   MAT   Mathematics
189   ENG    English"""

# Runs of four or more letters skip the three-letter codes and leave
# only the course names.
print([match.group() for match in re.finditer('[A-Za-z]{4,}', text)])

['Computers', 'Mathematics', 'English']


# Regex for text cleansing

In [14]:
# Multi-paragraph sample review used for the cleansing examples; it contains
# punctuation, double quotes, blank lines and a trailing URL.
text = """
To be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""

# Bare last expression: shows the repr, so newlines appear as \n.
text

'\nTo be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.\n\nYou see, problems like this can easily be solved by proper tech. That is NOT the problem.\n\nThe problem is everything but tech?.\n\nhttps://lalalaa.com\n'

**Remove URL**

In [15]:
# Sample review ending in a URL.
text = """
To be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""
# Raw string: "\S" in a regular literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The pattern matches "http" plus every
# following non-whitespace character, i.e. the URL up to the next
# space or newline, and deletes it.
text = re.sub(r'http\S+', '', text)
text

'\nTo be fair, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.\n\nYou see, problems like this can easily be solved by proper tech. That is NOT the problem.\n\nThe problem is everything but tech?.\n\n\n'

**Menghapus emoticon dan tanda baca**

In [16]:
# Sample review containing digits, punctuation, quotes, newlines and a URL.
text = """
To be fair 1000, the app works to an extent. I can register and get my payment code etc, works "well" until the status takes days to update.

You see, problems like this can easily be solved by proper tech. That is NOT the problem.

The problem is everything but tech?.

https://lalalaa.com
"""
# Every character that is not an ASCII letter or digit (punctuation,
# newlines, ...) becomes a single space; runs of spaces are not collapsed.
text = re.compile(r'[^a-zA-Z0-9]').sub(' ', text)
text

' To be fair 1000  the app works to an extent  I can register and get my payment code etc  works  well  until the status takes days to update   You see  problems like this can easily be solved by proper tech  That is NOT the problem   The problem is everything but tech    https   lalalaa com '

In [17]:
# Drop leading and trailing whitespace; whitespace between words is untouched.
text = text.strip()
text

'To be fair 1000  the app works to an extent  I can register and get my payment code etc  works  well  until the status takes days to update   You see  problems like this can easily be solved by proper tech  That is NOT the problem   The problem is everything but tech    https   lalalaa com'

In [18]:
import pandas as pd

In [19]:
# Load the practice dataset from a path relative to the current working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so
# 'Latihan Cleansing.csv' must be present next to the notebook (or the path
# adjusted) before this and the following cells can run.
data = pd.read_csv('Latihan Cleansing.csv')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Latihan Cleansing.csv'

In [None]:
# Selanjutnya kita akan memulai cleansing (pembersihan) pada kolom "text" dari DataFrame.
# Untuk melakukan cleansing, terlebih dahulu kita perlu mendefinisikan sebuah fungsi.
# Fungsi ini diberi nama "cleansing"

# First, import Python's built-in regular-expression module, `re` (often referred to as "RegEx").
# `re` is used to search and manipulate text based on patterns in the text.
# Import it as follows:
import re

# Selanjutnya berbagai jenis cleansing yang digunakan sebagai berikut
def cleansing(sent):
    """Normalize a value into lower-case text made of ASCII letters, digits and spaces.

    The input is coerced with ``str()`` and lower-cased, then every character
    that is not an ASCII letter or digit (punctuation, emoji, newlines, ...)
    is replaced by a single space. Runs of spaces are not collapsed and the
    result is not stripped.

    Args:
        sent: The value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned string.
    """
    lowered = str(sent).lower()
    # One space per removed character, so the output length equals the input length.
    return re.sub(r'[^a-zA-Z0-9]', ' ', lowered)

In [None]:
# List the column labels of the loaded DataFrame.
# NOTE(review): the saved output already includes 'text_clean', which is only
# created by a later cell — the output appears to come from an out-of-order
# run; confirm with Restart & Run All.
data.columns

Index(['Id', 'Sentiment', 'Acara TV', 'Jumlah Retweet', 'Text Tweet',
       'text_clean'],
      dtype='object')

In [None]:
# With the "cleansing" function defined, apply it to the text column of the DataFrame.
# Each value of 'Text Tweet' is passed through cleansing() and the results are
# stored in a new 'text_clean' column (this modifies `data` in place).
data['text_clean'] = data['Text Tweet'].apply(cleansing)

In [None]:
# Preview the first rows to compare 'Text Tweet' with the cleaned 'text_clean' column.
data.head()

Unnamed: 0,Id,Sentiment,Acara TV,Jumlah Retweet,Text Tweet,text_clean
0,1,positive,HitamPutihTransTV,12,"Undang @N_ShaniJKT48 ke hitamputih, pemenang S...",undang n shanijkt48 ke hitamputih pemenang s...
1,2,positive,HitamPutihTransTV,6,Selamat berbuka puasa Semoga amal ibadah hari ...,selamat berbuka puasa semoga amal ibadah hari ...
2,3,positive,HitamPutihTransTV,9,"Ada nih di trans7 hitam putih, dia dpt penghar...",ada nih di trans7 hitam putih dia dpt penghar...
3,4,positive,HitamPutihTransTV,2,selamat ya mas @adietaufan masuk hitamputih,selamat ya mas adietaufan masuk hitamputih
4,5,positive,HitamPutihTransTV,1,Asiknya nonton Hitam Putih Trans7,asiknya nonton hitam putih trans7
