In [None]:
import pandas as pd
import requests
import json
import time
from tqdm import tqdm
from sklearn.metrics import f1_score

# === API Setup ===
# The key is appended to the Perspective "comments:analyze" endpoint as a
# query parameter; replace the placeholder before running.
API_KEY = 'REPLACE_WITH_KEY'
PERSPECTIVE_URL = (
    'https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze'
    f'?key={API_KEY}'
)

import pandas as pd

# Load full dataset
df_full = pd.read_csv('cyberbullying.csv')

# Build a balanced evaluation set: 250 rows labelled 'not_cyberbullying'
# and 250 rows carrying any other label, each drawn with a fixed seed so
# the sample is reproducible.
df_not_cyber = df_full[df_full['cyberbullying_type'].eq('not_cyberbullying')].sample(
    n=250, random_state=42
)
df_cyber = df_full[df_full['cyberbullying_type'].ne('not_cyberbullying')].sample(
    n=250, random_state=42
)

# Stack the two samples and renumber the rows 0..499.
df = pd.concat([df_not_cyber, df_cyber]).reset_index(drop=True)

# === Function to get toxicity score with rate limiting ===
def get_toxicity_score(text, *, timeout=30, max_retries=5):
    """Return the Perspective API TOXICITY summary score for *text*.

    The comment is POSTed to ``PERSPECTIVE_URL`` requesting only the
    ``TOXICITY`` attribute, and the ``summaryScore.value`` field of the
    response is returned.

    Args:
        text: The comment to analyse. Anything that is not a non-blank
            string (``None``, NaN, ``""``) is rejected locally without
            calling the API.
        timeout: Seconds to wait for the HTTP request before giving up.
        max_retries: How many times to retry after an HTTP 429
            (rate limit) response before giving up.

    Returns:
        The toxicity score from the response, or ``None`` when the input
        is unusable, the request fails, the response is malformed, or the
        rate limit persists past ``max_retries`` retries.
    """
    # A blank or non-string comment cannot be scored; skip the round trip
    # (and the pacing sleep) entirely.
    if not isinstance(text, str) or not text.strip():
        return None

    payload = {
        'comment': {'text': text},
        'languages': ['en'],
        'requestedAttributes': {'TOXICITY': {}}
    }

    for attempt in range(max_retries + 1):
        try:
            # json= serialises the payload and sets the JSON content type;
            # timeout= keeps a stalled connection from hanging the run.
            response = requests.post(PERSPECTIVE_URL, json=payload, timeout=timeout)
            response.raise_for_status()
            result = response.json()
            return result['attributeScores']['TOXICITY']['summaryScore']['value']
        except requests.exceptions.HTTPError as e:
            if response.status_code == 429:
                if attempt == max_retries:
                    print(f"Rate limit still hit after {max_retries} retries. Giving up.")
                    return None
                print("Rate limit hit. Sleeping for 10 seconds...")
                time.sleep(10)
            else:
                print(f"API Error: {e}")
                return None
        except requests.exceptions.RequestException as e:
            # Connection errors, timeouts, undecodable bodies: report and
            # move on so one bad request does not abort the whole batch.
            print(f"Request failed: {e}")
            return None
        except (KeyError, TypeError, ValueError) as e:
            # The response was valid HTTP but lacked the expected fields.
            print(f"Unexpected API response: {e}")
            return None
        finally:
            time.sleep(1)  # delay to prevent hitting the rate limit

    return None

# === Add toxicity score with tqdm progress bar ===
# Scores are (re)computed for every row when the column is absent or has
# any missing value; otherwise the existing column is reused.
if 'toxicity' not in df.columns or df['toxicity'].isnull().any():
    tqdm.pandas(desc="Analyzing toxicity")
    df['toxicity'] = df['tweet_text'].progress_apply(get_toxicity_score)
else:
    print("Using existing toxicity scores from file.")

# === Label binary ground truth ===
# 0 for 'not_cyberbullying', 1 for every other label.
df['is_cyberbullying'] = (df['cyberbullying_type'] != 'not_cyberbullying').astype(int)

# === Find best threshold ===
# Failed API calls come back as None/NaN; coerce to float so that
# `>= t` is False for them and they are predicted as 0.
toxicity = pd.to_numeric(df['toxicity'], errors='coerce')

best_threshold = 0
best_f1 = 0
thresholds = [i / 100 for i in range(10, 91)]

for t in thresholds:
    candidate = (toxicity >= t).astype(int)
    score = f1_score(df['is_cyberbullying'], candidate)
    if score > best_f1:
        best_f1 = score
        best_threshold = t

# Store the predictions for the reported best threshold. Previously the
# column was overwritten on every iteration and ended up holding the
# predictions for the last threshold tried (0.90).
df['predicted'] = (toxicity >= best_threshold).astype(int)

print(f"Best threshold: {best_threshold} with F1 Score: {best_f1:.4f}")
print(df[['tweet_text', 'toxicity', 'is_cyberbullying', 'predicted']])

# === Save final output ===
# A single path variable keeps the message in sync with the file written.
output_path = 'cyberbullying_scored.csv'
df.to_csv(output_path, index=False)
print(f"Results saved to '{output_path}'")


Analyzing toxicity: 100%|██████████| 500/500 [11:08<00:00,  1.34s/it]

Best threshold: 0.18 with F1 Score: 0.7617
                                            tweet_text  toxicity  \
0    RT @goodfoodAU: A restaurant AND a cookbook fo...  0.010179   
1    @Qoloob4 @Vandaliser @sajid_fairooz @IsraeliRe...  0.761980   
2    @YouSoFancy oh MAN!! Not sure it gets better t...  0.047358   
3    The Bully at Work: What You Can Do to Stop the...  0.146031   
4       @harmlesstree2 Here 11 https://t.co/xWJzpSodGj  0.009739   
..                                                 ...       ...   
495  fuck these bitch ass niggers how yall doing I'...  0.974994   
496  @ChrisWarcraft [ i am doing this to make the @...  0.047358   
497  Yes, I hated that school. A lot of bullies btw...  0.360951   
498  Salty af too! Lol @JayJTheKidd: Damn you mad b...  0.924899   
499  #Taliban terrorists killing innocent muslims a...  0.471738   

     is_cyberbullying  predicted  
0                   0          0  
1                   0          0  
2                   0          0  



