In [40]:
import pandas as pd
import re

df_sampled = pd.read_csv("cleaned_taglish_tweets_for_annotation.csv")

tagalog_discourse_markers = [
    "at", "kung", "hanggang", "hangga’t", "bagama’t", "nang", "o", "kaya", "pero",
    "dahil sa", "dahilan sa", "gawa ng", "sapagka’t", "upang", "sakali", "noon",
    "sa sandali", "magbuhat", "magmula", "bagaman", "maliban", "bukod", "dangan",
    "dahil", "yayamang", "kapag", "pagka", "tuwing", "matapos", "pagkatapos",
    "porke", "maski", "imbis", "sa lugar", "sa halip", "miyentras", "para",
    "saka", "haba", "samantala", "bago", "kundi"
]

english_discourse_markers = [
    "and", "but", "or", "so", "because", "although", "however", "nevertheless",
    "nonetheless", "yet", "still", "despite that", "in spite of that",
    "even so", "on the contrary", "on the other hand", "otherwise",
    "instead", "alternatively", "in contrast", "as a result", "therefore",
    "thus", "consequently", "hence", "so that", "in order that", "with the result that",
    "because of this", "due to this", "then", "next", "after that", "afterwards",
    "since then", "eventually", "finally", "in the end", "at first", "in the beginning",
    "to begin with", "first of all", "for one thing", "for another thing",
    "secondly", "thirdly", "to start with", "in conclusion", "to conclude",
    "to sum up", "in short", "in brief", "overall", "on the whole", "all in all",
    "to summarize", "in a nutshell", "moreover", "furthermore", "what is more",
    "in addition", "besides", "also", "too", "as well", "in the same way",
    "similarly", "likewise", "in other words", "that is to say", "this means that",
    "for example", "for instance", "such as", "namely", "in particular", "especially",
    "more precisely", "to illustrate", "as a matter of fact", "actually", "in fact",
    "indeed", "clearly", "surely", "certainly", "obviously", "of course", "naturally",
    "apparently", "evidently", "no doubt", "undoubtedly", "presumably",
    "frankly", "honestly", "to be honest", "luckily", "fortunately", "unfortunately",
    "hopefully", "interestingly", "surprisingly", "ironically"
]

all_markers = sorted(tagalog_discourse_markers + english_discourse_markers, key=lambda x: -len(x))
escaped_markers = [re.escape(marker) for marker in all_markers]
pattern = r'\b(?:' + '|'.join(escaped_markers) + r')\b'

print(df_sampled.columns)

Index(['Tweet Number', 'Tweet', 'Sentiment'], dtype='object')


In [41]:
def split_into_clauses_clean(text):
    text = str(text).strip()

    if not text:
        return []
    tokens = re.split(pattern, text, flags=re.IGNORECASE)
    markers = re.findall(pattern, text, flags=re.IGNORECASE)

    clauses = []

    if markers and len(tokens) > 1:
        for i in range(len(tokens)):
            segment = tokens[i].strip()
            if not segment:
                continue

            if i == 0:
                clauses.append(segment)
            elif i - 1 < len(markers):
                marker = markers[i - 1].strip()
                combined = marker + " " + segment

                if len(segment.split()) <= 2:
                    clauses[-1] += " " + combined
                else:
                    clauses.append(combined)
            else:
                clauses.append(segment)
    else:
        fallback_tokens = re.split(r"[.!?;:]", text)
        clauses = [seg.strip() for seg in fallback_tokens if seg.strip()]

    return clauses


In [46]:
rows = []

for i, row in df_sampled.iterrows():
    tweet_id = row["Tweet Number"]
    text = row["Tweet"]
    try:
        clauses = [c.strip() for c in split_into_clauses_clean(text) if c.strip()]
    except Exception as e:
        print(f"Error at tweet {tweet_id}: {e}")
        continue

    for j, clause in enumerate(clauses, start=1):
        rows.append({
            "Tweet Number": tweet_id,
            "Clause Number": j,
            "Clause": clause,
            "Sentiment": "" 
        })

clause_df = pd.DataFrame(rows, columns=["Tweet Number", "Clause Number", "Clause", "Sentiment"])
clause_df.to_csv("clauses_taglish.csv", index=False)
clause_df.head()


Error at tweet 63: list index out of range
Error at tweet 110: list index out of range
Error at tweet 1137: list index out of range
Error at tweet 1233: list index out of range
Error at tweet 1398: list index out of range
Error at tweet 2890: list index out of range
Error at tweet 2898: list index out of range
Error at tweet 3142: list index out of range
Error at tweet 3361: list index out of range
Error at tweet 3389: list index out of range
Error at tweet 4780: list index out of range
Error at tweet 6455: list index out of range
Error at tweet 6600: list index out of range
Error at tweet 6864: list index out of range
Error at tweet 7156: list index out of range
Error at tweet 7587: list index out of range
Error at tweet 7833: list index out of range
Error at tweet 8453: list index out of range
Error at tweet 8560: list index out of range
Error at tweet 8616: list index out of range
Error at tweet 9736: list index out of range
Error at tweet 11212: list index out of range
Error at twe

Unnamed: 0,Tweet Number,Clause Number,Clause,Sentiment
0,1,1,wag naman sana,
1,1,2,at maapektuhan kaming tuwid bumoto,
2,2,1,kung sino man yang 12k na bumoto kay quiboloy ...,
3,3,1,na dinaya ang botohan kaloka,
4,4,1,so far bam kiko luke amp heidi frontrunner sa ...,


In [47]:
df_sampled[df_sampled["Tweet Number"] == 19766]

Unnamed: 0,Tweet Number,Tweet,Sentiment
19765,19766,so ilang botante kaya ang bobotohin si pacman ...,
