# File based

## 1. Import Library

In [1]:
import pandas as pd
import re

## 2. Load Data

### 2.1 File Based Data

In [9]:
filename = input("Filename: ") # Example: data.csv
data = pd.read_csv(filename, encoding='latin-1')
data.head(50)

Filename: data.csv


Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0
5,USER Ya bani taplak dkk \xf0\x9f\x98\x84\xf0\x...,1,1,0,1,0,0,0,0,1,0,1,0
6,deklarasi pilkada 2018 aman dan anti hoax warg...,0,0,0,0,0,0,0,0,0,0,0,0
7,Gue baru aja kelar re-watch Aldnoah Zero!!! pa...,0,1,0,0,0,0,0,0,0,0,0,0
8,Nah admin belanja satu lagi port terbaik nak m...,0,0,0,0,0,0,0,0,0,0,0,0
9,USER Enak lg klo smbil ngewe',0,1,0,0,0,0,0,0,0,0,0,0


### 2.2 Kamus Alay

In [3]:
new_kamusalay = pd.read_csv('new_kamusalay.csv', encoding='latin-1', header=None)
new_kamusalay = new_kamusalay.rename(columns={0: 'Original', 
                                      1: 'Replacement'})
new_kamusalay.head(50)

Unnamed: 0,Original,Replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali
5,aamiin,amin
6,aamiinn,amin
7,aamin,amin
8,aammiin,amin
9,abis,habis


## 3. Data Cleaning

### 3.1 Function for text preprocessing

In [4]:
def lowercase(text):
    return text.lower() # Change text to lowercase

def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text) # Remove word other than a-z and 0-9
    return text

def remove_unnecessary_char(text):
    text = re.sub('\n',' ',text) # Remove every '\n'
    text = re.sub('rt',' ',text) # Remove every retweet symbol
    text = re.sub('user',' ',text) # Remove every username
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
    text = re.sub('  +', ' ', text) # Remove extra spaces
    return text

new_kamusalay_map = dict(zip(new_kamusalay['Original'], new_kamusalay['Replacement'])) # Create dictionary based on original(Key):Replacement(Value)
def normalize_alay(text):
    return ' '.join([new_kamusalay_map[word] if word in new_kamusalay_map else word for word in text.split(' ')]) # If word is new_kamusalay_map use the replacement else don't change it 

print("remove_nonaplhanumeric: ", remove_nonaplhanumeric("Hello,,,,, duniaa!!"))
print("lowercase: ", lowercase("Halooo, duniaa!"))
print("remove_unnecessary_char: ", remove_unnecessary_char("Hehe\n\n RT USER USER apa kabs www.google.com\n  hehe"))
print("normalize_alay: ", normalize_alay("aamiin adek abis"))

remove_nonaplhanumeric:  Hello duniaa 
lowercase:  halooo, duniaa!
remove_unnecessary_char:  Hehe RT USER USER apa kabs hehe
normalize_alay:  amin adik habis


In [5]:
def preprocess(text):
    text = lowercase(text) # Change all letters into lowercase for easier process
    text = remove_nonaplhanumeric(text) # Remove non alphabet and numeric
    text = remove_unnecessary_char(text) # remove retweet symbol, username, URL, and extra spaces
    text = normalize_alay(text) # 3 Normalize alay word
    return text

In [6]:
data['Tweet'] = data['Tweet'].apply(preprocess) # Apply the function to the tweet
data.head(50)

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,di saat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,siapa yang telat memberi tau kamu edan sarap ...,0,1,0,0,0,0,0,0,0,0,0,0
2,41 kadang aku berpikir kenapa aku tetap percay...,0,0,0,0,0,0,0,0,0,0,0,0
3,aku itu aku dan ku tau matamu sipit tapi dili...,0,0,0,0,0,0,0,0,0,0,0,0
4,kaum cebong kafir sudah kelihatan dongoknya d...,1,1,0,1,1,0,0,0,0,0,1,0
5,ya bani taplak dan kawan kawan xf0 x9f x98 x8...,1,1,0,1,0,0,0,0,1,0,1,0
6,deklarasi pilihan kepala daerah 2018 aman dan ...,0,0,0,0,0,0,0,0,0,0,0,0
7,gue baru saja selesai re watch aldnoah zero pa...,0,1,0,0,0,0,0,0,0,0,0,0
8,nah admin belanja satu lagi po terbaik nak mak...,0,0,0,0,0,0,0,0,0,0,0,0
9,enak lagi kalau sambil ngewe,0,1,0,0,0,0,0,0,0,0,0,0


## 4. Save Data

In [7]:
data.to_csv('cleaned_data.csv', index=False) # Save as new csv