In [17]:
import pandas as pd
import re
import os
import ast

# === CONFIG ===
# Base name of the working directory; used below as the prefix of the
# classified-tweets file name and the log file name.
current_dir      = os.path.basename(os.getcwd())
TWEETS_FILE      = os.path.join(os.pardir, os.pardir, 'data', 'tweets1523.csv')
CLASSIFIED_FILE  = current_dir + "_classified_tweets.csv"
LOG_FILE         = current_dir + "_op.out"

# Columns of the classified-tweets file
TRUE_COL, PRED_COL       = 'true_label', 'classification'
CONF_COL, REASON_COL     = 'confidence', 'reasoning'
HIST_COL, ORIG_TWEET_COL = 'conversation', 'original_tweet'

# Columns of the tweets1523 file
TWEET_TEXT, TRUE_TWEET_LABEL = 'text', 'target'

# === LOAD DATA ===
# Read the source tweets first, then the classification results.
tweets_df, classified_df = (
    pd.read_csv(csv_path) for csv_path in (TWEETS_FILE, CLASSIFIED_FILE)
)

# === 1) Misclassified Tweets ===
# Keep only the rows whose predicted label differs from the true label.
label_mismatch = classified_df[TRUE_COL] != classified_df[PRED_COL]
mis = classified_df[label_mismatch].copy()

# Source column -> report heading; insertion order is the output column order.
report_headings = {
    ORIG_TWEET_COL: 'Original full Tweet',
    TRUE_COL      : 'True label',
    PRED_COL      : 'Predicted label',
    CONF_COL      : 'Confidence level',
    REASON_COL    : 'Reasoning',
    HIST_COL      : 'Conversation history',
}
mis_renamed = mis.rename(columns=report_headings)
mis_output = mis_renamed[list(report_headings.values())]

misclassified_path = f"{current_dir}_misclassified_tweets.csv"
mis_output.to_csv(misclassified_path, index=False)
print(f"→ misclassified_tweets.csv ({len(mis_output)} rows)")

# === 2) Policy-Violation Errors ===
# Scan the run log for "Error processing tweet:" entries. For each entry,
# record the matching full tweet and its true label from tweets_df, plus the
# content-filter categories flagged as filtered and their severities.


def _find_detail_line(log_lines, start):
    """Return the first line at index >= start that begins with "Error".

    A tweet containing line breaks spills over several log lines, so the
    error-detail line is not necessarily the one right after the header.
    Returns None when no such line exists (e.g. a truncated log).
    """
    for k in range(start, len(log_lines)):
        if log_lines[k].startswith("Error"):
            return log_lines[k]
    return None


def _parse_filter_result(err_line):
    """Extract the filtered categories and their severities from a log line.

    Looks for the dict literal that follows 'content_filter_result': and keeps
    every category whose entry has a truthy 'filtered' value.

    Returns:
        (categories, severities): two parallel lists of strings. Both are
        empty when the key is absent or the literal cannot be parsed.
    """
    key_pos = err_line.find("'content_filter_result':")
    if key_pos == -1:
        return [], []
    brace_start = err_line.find('{', key_pos)
    if brace_start == -1:
        return [], []

    # Walk forward to the brace that closes the dict opened at brace_start.
    # If the braces never balance, the slice runs to the end of the line and
    # the parse below fails, which is reported as "nothing found".
    depth = 0
    end = len(err_line)
    for pos in range(brace_start, len(err_line)):
        ch = err_line[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                end = pos + 1
                break

    try:
        content = ast.literal_eval(err_line[brace_start:end])
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best effort: an unparsable detail line yields no reasons.
        return [], []
    if not isinstance(content, dict):
        return [], []

    categories, severities = [], []
    for category, info in content.items():
        if isinstance(info, dict) and info.get('filtered'):
            categories.append(str(category))
            severity = info.get('severity', '')
            # Coerce to str so the later ', '.join() cannot fail.
            severities.append('' if severity is None else str(severity))
    return categories, severities


with open(LOG_FILE, 'r', encoding='utf-8') as f:
    lines = f.readlines()

error_records = []
for i, line in enumerate(lines):
    if not line.startswith("Error processing tweet:"):
        continue

    # Extract up to 200 chars of the snippet
    m = re.match(r'Error processing tweet:\s*"?(?P<snippet>.+?)"?\s*$', line)
    if not m:
        continue
    raw_snippet = m.group('snippet').strip()
    snippet = raw_snippet[:200].rstrip(' .')  # trim trailing dots/spaces

    # The error detail follows the header, possibly after tweet continuation
    # lines; skip the entry when the log ends before any detail line.
    err_line = _find_detail_line(lines, i + 1)
    if err_line is None:
        continue
    reasons, severities = _parse_filter_result(err_line)

    # Find the full original tweet and true label by prefix match. An empty
    # snippet would prefix-match every tweet, so treat it as "no match".
    if snippet:
        matches = tweets_df.loc[
            tweets_df[TWEET_TEXT].str.startswith(snippet, na=False)
        ]
    else:
        matches = tweets_df.iloc[0:0]

    if len(matches):
        full_text  = matches[TWEET_TEXT].iloc[0]
        true_class = matches[TRUE_TWEET_LABEL].iloc[0]
    else:
        full_text  = raw_snippet
        true_class = None

    error_records.append({
        'Original tweet'    : full_text,
        'True class'        : true_class,
        'Violation reasons' : ', '.join(reasons),
        'Severity label'    : ', '.join(severities)
    })

error_df = pd.DataFrame(error_records, columns=[
    'Original tweet',
    'True class',
    'Violation reasons',
    'Severity label'
])

error_df.to_csv(f"{current_dir}_error_tweets.csv", index=False)
print(f"→ error_tweets.csv ({len(error_df)} rows)")


→ misclassified_tweets.csv (300 rows)
→ error_tweets.csv (111 rows)


In [1]:
!cwd

/bin/bash: line 1: cwd: command not found


In [2]:
# Print the current working directory via the shell.
!pwd

/home/jovyan/Desktop/Capstone/TweetClassification/Final_Azure_Experiments/AzureAI_4o-mini
