<a href="https://colab.research.google.com/github/devinbook/Tagalog-profanity-detection/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import re

In [4]:
df = pd.read_csv('/content/tagalog_profanity_data.csv')

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
df.head(10)

Unnamed: 0,Text,Label
0,"sa poor social skills ko, di ako mahilig lumab...",0
1,gago mas nakakagana sana magtrabaho kung malam...,1
2,Kupal Tricycle Drivers With the recent issues ...,1
3,narating sa buhay? Wala namang trabaho. Pabiga...,1
4,sarili mong mga anak. hays nakakapagod maging ...,1
5,na siya sa lugar na 'yon. Sana po mapost ito. ...,0
6,"sa sama ng loob. kasama pa non, feeling ko ang...",1
7,MAGHIWALAY NA TAYONG LAHAT PUCHA!! Magbreak na...,1
8,KABOBOHAN INUTIL TANGA hehe sorry sa mura pero...,1
9,gago tawang tawa ko sino may video ng speech n...,1


#Data Cleaning

In [7]:
# Update the data cleaning function
def data_cleaning(df):
    # Remove duplicates
    df = df.drop_duplicates()

    # Handle missing values
    df = df.dropna()  # Modify based on your requirements

    # Normalize text: convert to lowercase
    df['Text'] = df['Text'].str.lower()

    # Remove special characters and punctuation
    df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

    # Remove stopwords
    tagalog_stopwords = [
        "ang", "mga", "si", "kay", "sa", "ni", "ng", "at", "pero", "dahil",
        "kung", "kapag", "ay", "na", "po", "kasi", "naman", "nga", "eh",
        "dapat", "ito", "iyan", "iyon", "kami", "tayo", "sila", "ako",
        "ikaw", "kayo", "kanila", "amin", "atin", "natin", "niya",
        "nila", "mo", "ko", "pa", "ba", "rin", "din", "nang", "lamang",
        "lang", "kahit", "saan", "bakit", "ganito", "ganyan", "ganoon",
        "paano", "lahat", "halos", "may", "wala", "hindi", "oo", "huwag",
        "ayaw", "kaya", "mula", "sa", "ngunit", "o", "daw", "raw",
        "pati", "saka", "isang", "bawat", "madalas", "samantalang",
        "basta", "parang", "labis", "sobrang"
    ]
    df['Text'] = df['Text'].apply(lambda x: ' '.join(word for word in x.split() if word not in tagalog_stopwords))

    return df

df = data_cleaning(df)

In [8]:
df.shape

(20051, 2)

In [9]:
df.head()

Unnamed: 0,Text,Label
0,poor social skills di mahilig lumabas bahay pa...,0
1,gago mas nakakagana sana magtrabaho malamig la...,1
2,kupal tricycle drivers with the recent issues ...,1
3,narating buhay namang trabaho pabigat bahay ya...,1
4,sarili mong anak hays nakakapagod maging worki...,1


#Data Augmentation

In [10]:
!pip install googletrans==3.1.0a0
import random
from googletrans import Translator

# Translator for back translation
translator = Translator()



In [11]:
# Define profanity words and variations for augmentation
profanity_words = {
    "gago": ["g*go", "g@g0", "g4go"],
    "tanga": ["t*nga", "t@nga", "t4nga"],
    "bobo": ["b*bo", "b0bo", "b@bo"],
    "puta": ["p*ta", "p@ta", "put@", "p*t@", "put4"],
    "ulol": ["ul*l", "u1ol", "ul0l"],
    "bwisit": ["bwis*t", "bwis@t", "bw1sit"],
    "tangina": ["t@ngina", "t4ngina", "t*ngina"],
    "pakyu": ["p@kyu", "p4kyu", "p*kyu"],
    "hayop": ["hay0p", "h@yop", "h4yop"],
    "tarantado": ["t@rantado", "t4rantado", "tarant@do"],
    "putangina": ["p*tangina", "put@ngina", "put@ng1n@", "p*t4ngina"],
    "yawa": ["y@wa", "y4wa", "y*w@"]
}

# Function to replace profanity words with variations
def augment_with_profanity_variation(text):
    for word, variations in profanity_words.items():
        if word in text:
            variation = random.choice(variations)
            text = text.replace(word, variation)
    return text

# Apply augmentation only to profane sentences in the dataset
df['Augmented_Text'] = df.apply(lambda x: augment_with_profanity_variation(x['Text']) if x['Label'] == 1 else x['Text'], axis=1)

# Display sample augmented text
df[['Text', 'Augmented_Text', 'Label']].head(10)

Unnamed: 0,Text,Augmented_Text,Label
0,poor social skills di mahilig lumabas bahay pa...,poor social skills di mahilig lumabas bahay pa...,0
1,gago mas nakakagana sana magtrabaho malamig la...,g*go mas nakakagana sana magtrabaho malamig la...,1
2,kupal tricycle drivers with the recent issues ...,kupal tricycle drivers with the recent issues ...,1
3,narating buhay namang trabaho pabigat bahay ya...,narating buhay namang trabaho pabigat bahay y@...,1
4,sarili mong anak hays nakakapagod maging worki...,sarili mong anak hays nakakapagod maging worki...,1
5,siya lugar yon sana mapost salamat,siya lugar yon sana mapost salamat,0
6,sama loob kasama non feeling tanga tanga pano ...,sama loob kasama non feeling t4nga t4nga pano ...,1
7,maghiwalay tayong pucha magbreak dun yun papunta,maghiwalay tayong pucha magbreak dun yun papunta,1
8,kabobohan inutil tanga hehe sorry mura tangina...,kab*bohan inutil t@nga hehe sorry mura t*ngina...,1
9,gago tawang tawa sino video speech inday,g@g0 tawang tawa sino video speech inday,1


In [12]:
def synonym_replacement(text):
    # Dictionary of synonyms for each profanity word
    synonyms = {
        "gago": ["baliw", "sira", "loko", "tanga"],
        "tanga": ["engot", "bobo", "ulol", "hangal"],
        "bobo": ["tanga", "walang alam", "ulol", "engot"],
        "puta": ["malaswa", "kalapating mababa ang lipad", "masama"],
        "ulol": ["sira ulo", "loko", "tanga"],
        "bwisit": ["inip", "naiinis", "inutil"],
        "tangina": ["peste", "demonyo"],
        "pakyu": ["malas", "nagmumura", "wala kang kwenta"],
        "hayop": ["mababang uri", "masama", "malupit"],
        "tarantado": ["walang modo", "bastos", "pasaway"],
        "putangina": ["napakasama", "demonyo ka", "ang sama mo"],
        "yawa": ["demonyo", "malas", "masamang espiritu"]
    }

    # Replace words with randomly chosen synonyms
    for word, synonym_list in synonyms.items():
        if word in text:
            text = text.replace(word, random.choice(synonym_list))
    return text

In [13]:
import random
def random_insertion(text):
    words = text.split()
    num_insertions = random.randint(1, 2)
    for _ in range(num_insertions):
        insert_word = random.choice(contextual_words)
        insert_pos = random.randint(0, len(words) - 1)
        words.insert(insert_pos, insert_word)
    return " ".join(words)


In [14]:
# Function to simulate typos by introducing random character substitutions
def typo_simulation(text):
    text = list(text)
    num_typos = random.randint(1, 3)
    for _ in range(num_typos):
        pos = random.randint(0, len(text) - 1)
        text[pos] = random.choice("abcdefghijklmnopqrstuvwxyz*")
    return "".join(text)

In [15]:
def back_translation(text, src_lang="tl", tgt_lang="en"):
    # Translate to target language (English) and back to source language (Tagalog)
    translated = translator.translate(text, src=tgt_lang, dest=src_lang).text
    return translated

In [16]:
def augment_text(text):
    # Randomly apply different augmentations to create diverse samples
    if random.random() < 0.3:
        text = synonym_replacement(text)
    if random.random() < 0.5:
        text = typo_simulation(text)
    if random.random() < 0.3:
        text = random_insertion(text)
    if random.random() < 0.4:
        text = augment_with_profanity_variation(text)
    if random.random() < 0.2:
        text = back_translation(text)
    return text

contextual_words = ["po", "ba", "kasi", "naman", "eh", "nga"]

# Apply augmentation only to profane sentences in the dataset
df['Augmented_Text'] = df.apply(lambda x: augment_text(x['Text']) if x['Label'] == 1 else x['Text'], axis=1)

In [17]:
df.head(10)

Unnamed: 0,Text,Label,Augmented_Text
0,poor social skills di mahilig lumabas bahay pa...,0,poor social skills di mahilig lumabas bahay pa...
1,gago mas nakakagana sana magtrabaho malamig la...,1,baliw mas nakakpgana sana magtrabaho malamig l...
2,kupal tricycle drivers with the recent issues ...,1,kupal eh tricycle drivers with the recent issu...
3,narating buhay namang trabaho pabigat bahay ya...,1,narating buhay namang trabaho pabifat bahay y4...
4,sarili mong anak hays nakakapagod maging worki...,1,savili mong nga anak hays nakakaqagod maging w...
5,siya lugar yon sana mapost salamat,0,siya lugar yon sana mapost salamat
6,sama loob kasama non feeling tanga tanga pano ...,1,sama loob kasama non feeling tanga tanga pano ...
7,maghiwalay tayong pucha magbreak dun yun papunta,1,maghiwalay tayong pucha magbreet dun yun papunta
8,kabobohan inutil tanga hehe sorry mura tangina...,1,kab@bohan inutil t@nga hehe sorry mura t4ngina...
9,gago tawang tawa sino video speech inday,1,g@g0 tawang taws sino video speelh inday


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20051 entries, 0 to 20050
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Text            20051 non-null  object
 1   Label           20051 non-null  int64 
 2   Augmented_Text  20051 non-null  object
dtypes: int64(1), object(2)
memory usage: 470.1+ KB


In [21]:
df_expanded = pd.concat([df[['Text', 'Label']], df[['Augmented_Text', 'Label']].rename(columns={'Augmented_Text': 'Text'})], ignore_index=True)

In [24]:
df_expanded.shape

(28106, 2)

In [23]:
df_expanded = df_expanded.drop_duplicates(subset='Text').reset_index(drop=True)

In [25]:
contextual_words = ["po", "ba", "kasi", "naman", "nga", "eh", "sana"]

# Augmentation function for non-profane sentences
def augment_non_profane_text(text):
    # Randomly apply different augmentations to create variety
    if random.random() < 0.4:
        # Insert a random contextual word
        words = text.split()
        insert_word = random.choice(contextual_words)
        insert_position = random.randint(0, len(words))
        words.insert(insert_position, insert_word)
        text = " ".join(words)
    if random.random() < 0.3:
        # Simulate a typo (e.g., "kamusta" to "kumusta")
        text = text.replace("s", "z", 1) if "s" in text else text
    if random.random() < 0.3:
        # Replace a word with a synonym or similar word
        synonyms = {
    "kamusta": ["kumusta", "kamustahan", "ano na", "how are you"],
    "salamat": ["thanks", "maraming salamat", "salamat po", "thank you very much"],
    "oo": ["opo", "sige", "oo nga", "yes", "ay oo"],
    "hindi": ["di", "hindi nga", "no", "wala"],
    "paalam": ["goodbye", "saan ka pupunta", "take care"],
    "mabuti": ["okay", "maayos", "fine", "well"],
    "bati": ["greet", "salubong", "acknowledge"],
    "pakiusap": ["please", "mangyaring", "kindly", "request"],
    "gusto": ["want", "nais", "like", "desire"],
    "tulong": ["help", "assist", "support", "aid"],
    "mabilis": ["fast", "quick", "swifter", "rapido"],
    "maligayang": ["happy", "joyful", "merry", "cheerful"],
}
        for word, synonym_list in synonyms.items():
            if word in text:
                text = text.replace(word, random.choice(synonym_list))
    return text

In [26]:
non_profane_data = df_expanded[df_expanded['Label'] == 0]
augmented_non_profane = non_profane_data.copy()

In [27]:
while len(augmented_non_profane) < len(df_expanded[df_expanded['Label'] == 1]):
    new_augmented = non_profane_data['Text'].apply(augment_non_profane_text)
    augmented_non_profane = pd.concat([augmented_non_profane, new_augmented.to_frame(name='Text')])

# Add the labels back and limit to match the exact count of profane samples
augmented_non_profane = augmented_non_profane[:len(df_expanded[df_expanded['Label'] == 1])]
augmented_non_profane['Label'] = 0


In [28]:
df_balanced = pd.concat([df_expanded[df_expanded['Label'] == 1], augmented_non_profane], ignore_index=True)

# Shuffle the dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
print("Balanced dataset:\n", df_balanced['Label'].value_counts())

Balanced dataset:
 Label
1    18392
0    18392
Name: count, dtype: int64


In [29]:
df_balanced.head(10)

Unnamed: 0,Text,Label
0,ulol hago tanginamo true,1
1,simulang bagong pilipinas naway tulungan diyos...,0
2,overthinking and it is super draining lifes so...,0
3,son give them bread yung pinagbigyan is tinapo...,0
4,kasi malakidg putangina surusuportapumapanig s...,1
5,pwede mag relax bahayyyyy bahay yun ano gusto ...,0
6,nag private yung course gastos putangina archi...,1
7,and di sya replyan after a day puno chats nya ...,0
8,siya nag iisa alam laruin laro pagkakaiba man ...,0
9,naging okay yung daloy relationship namin sumu...,0


#Data Feature Extraction

In [31]:
# Text length
df_balanced['text_length'] = df_balanced['Text'].apply(len)

# Word count
df_balanced['word_count'] = df_balanced['Text'].apply(lambda x: len(x.split()))

# Average word length
# Check if the word count is greater than 0 to avoid ZeroDivisionError
df_balanced['avg_word_length'] = df_balanced['Text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if len(x.split()) > 0 else 0)

In [32]:
profanity_words = [
    "gago", "tanga", "bobo", "puta", "ulol", "bwisit",
    "tangina", "pakyu", "hayop", "tarantado", "putangina", "yawa"
]

# Count of profanity words
df_balanced['profanity_word_count'] = df_balanced['Text'].apply(
    lambda x: sum(word in x.lower() for word in profanity_words)
)

# Presence of profanity (binary feature)
df_balanced['contains_profanity'] = df_balanced['profanity_word_count'].apply(lambda x: 1 if x > 0 else 0)

In [33]:
from textblob import TextBlob

# Calculate sentiment polarity
df_balanced['sentiment_polarity'] = df_balanced['Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [34]:
df_balanced['special_char_count'] = df_balanced['Text'].apply(lambda x: sum(not c.isalnum() for c in x))

# Non-alphanumeric ratio
df_balanced['non_alphanumeric_ratio'] = df_balanced['special_char_count'] / df_balanced['text_length']

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=500)  # Limit features if dataset is large
tfidf_features = tfidf_vectorizer.fit_transform(df_balanced['Text']).toarray()

# Append TF-IDF features to the DataFrame
tfidf_df = pd.DataFrame(tfidf_features, columns=tfidf_vectorizer.get_feature_names_out())
df_balanced = pd.concat([df_balanced, tfidf_df], axis=1)

In [37]:
df_balanced.head(10)

Unnamed: 0,Text,Label,text_length,word_count,avg_word_length,profanity_word_count,contains_profanity,sentiment_polarity,special_char_count,non_alphanumeric_ratio,...,yan,yang,yawa,year,years,yon,you,your,yun,yung
0,ulol hago tanginamo true,1,24,4,5.25,2,1,0.35,3,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,simulang bagong pilipinas naway tulungan diyos...,0,76,8,8.625,0,0,0.0,7,0.092105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,overthinking and it is super draining lifes so...,0,119,20,5.0,0,0,0.333333,19,0.159664,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,son give them bread yung pinagbigyan is tinapo...,0,107,17,5.352941,0,0,-0.8,16,0.149533,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.425674
4,kasi malakidg putangina surusuportapumapanig s...,1,52,5,9.6,3,1,0.0,4,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,pwede mag relax bahayyyyy bahay yun ano gusto ...,0,67,13,4.230769,0,0,0.0,12,0.179104,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374035,0.0
6,nag private yung course gastos putangina archi...,1,53,7,6.714286,3,1,0.0,6,0.113208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.413526
7,and di sya replyan after a day puno chats nya ...,0,76,16,3.8125,0,0,0.0,15,0.197368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,siya nag iisa alam laruin laro pagkakaiba man ...,0,76,14,4.5,0,0,0.0,13,0.171053,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,naging okay yung daloy relationship namin sumu...,0,101,14,6.285714,0,0,0.5,13,0.128713,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187663




In [41]:
df_balanced.shape

(36784, 510)

In [44]:
df_balanced.to_csv('tagalog_profanity_feature_extracted.csv', index=False)
from google.colab import files
files.download('tagalog_profanity_feature_extracted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>