In [1]:
## Imports
#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import json

#%%
# Load the data
ORIGINAL_DATA_DIR = 'original_data'
CLEANED_DATA_DIR = 'data'

data = {}
banned_users = []
for subfolder in os.listdir(ORIGINAL_DATA_DIR):
    folder_name = subfolder.split("_")[-1]
    banned_users.append(folder_name)
    data[folder_name] = {}

    labeled_json_data = "0_" + subfolder + ".json"
    file_path = os.path.join(CLEANED_DATA_DIR, labeled_json_data)
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        raw_data = json.load(f)

    # Access the nested labels
    labels = raw_data['data']['labels']
    data[folder_name]["labeled_json_data"] = pd.DataFrame(labels)
    for file in os.listdir(os.path.join(ORIGINAL_DATA_DIR, subfolder)):
        data[folder_name][file] = pd.read_json(os.path.join(ORIGINAL_DATA_DIR, subfolder, file), lines=True)


In [2]:
from collections import Counter
number_of_data = [len(value.keys()) for key, value in data.items()]
counter = Counter(number_of_data)
counter

Counter({5: 4178})

In [3]:
USER = "ideon1"

In [4]:
data[USER]["bad_utterance.jsonl"]

Unnamed: 0,timestamp,user,display_name,message,is_emotes_only,emotes,message_id
0,2022-08-31 10:32:08,ideon1,ideon1,the CYANDYMYAN IS COMING FOR U,False,,c8ba4869-c7f6-48d6-a024-a9af835b275e


In [5]:
data[USER]["channel_setting.jsonl"]

Unnamed: 0,started_at,timezone,title,game_name,language,tag_names,streamer,display_name
0,2022-08-31 09:50:29,pst,TRUMP GETTING OWNED BY THE DOJ ----UKRAINE NUC...,Just Chatting,en,[English],hasanabi,HasanAbi


In [6]:
data[USER]["single_user_context.jsonl"] ## We can get the ban data from here

Unnamed: 0,timestamp,user,display_name,message,is_emotes_only,emotes,message_id,action
0,2022-08-31 10:32:08,ideon1,ideon1,the CYANDYMYAN IS COMING FOR U,0.0,,c8ba4869-c7f6-48d6-a024-a9af835b275e,message
1,2022-08-31 10:32:13,ideon1,,,,,,ban
2,2022-08-31 10:32:13,ideon1,,,,,,ban


In [7]:
## select rows where action is ban
#data[USER]["multi_user_context.jsonl"].query("action == 'ban'")
data[USER]["multi_user_context.jsonl"]

Unnamed: 0,timestamp,user,display_name,message,is_emotes_only,emotes,message_id,action
0,2022-08-31 10:30:13,blueseven01,Blueseven01,widepeepoSad,0.0,,ddf3f089-457d-4708-bdb5-627602ab4d91,message
1,2022-08-31 10:30:13,hollywoodconner,HollywoodConner,i love you but this looks like actual boomer p...,0.0,,c197a846-3d27-4aa9-a552-161f957a7239,message
2,2022-08-31 10:30:13,mathew1025,mathew1025,Would have done better…,0.0,,21311aed-0227-480a-8750-814b992b3952,message
3,2022-08-31 10:30:13,mrtripp,mrtripp,Maybe this is related to the trump leaks? *Put...,0.0,,5c53a716-78cb-4e90-a5e0-29da462cb32f,message
4,2022-08-31 10:30:13,foxystreamthis,foxystreamthis,get that froste contract,0.0,,c3511486-3db4-4c43-abff-5cbfe77d5494,message
...,...,...,...,...,...,...,...,...
593,2022-08-31 10:32:13,pastadactyl,pastadactyl,Is that austin flying,0.0,,75090f5e-c2ab-493d-ad6c-0f36018c897e,message
594,2022-08-31 10:32:13,hasanabi,HasanAbi,https://twitter.com/hasanthehun/status/1565043...,0.0,,8c171262-33a2-417e-91e2-74e5795a5af2,message
595,2022-08-31 10:32:13,fossabot,Fossabot,peepoHas Thank you Trismatics for the 5 giftie...,0.0,,9acd1153-5854-4c31-b1b6-32ae2673b14d,message
596,2022-08-31 10:32:13,hasanabi,HasanAbi,https://twitter.com/hasanthehun/status/1565043...,0.0,,bd9560e0-5613-4888-98e3-78ad0ad07f32,message


In [8]:
data[USER]["labeled_json_data"]

Unnamed: 0,anotation_id,step_1,target_1,step_2,target_2,step_3,target_3,comment,streamer_knowledge,twitch_knowledge
0,0,[판단불가],,[Mentioning other broadcasters],,[Mentioning other broadcasters],,,,
1,1,[Incivility],,[Mentioning other broadcasters],,[Mentioning other broadcasters],,[Candy man represents Sam Hyde],,
2,2,"[HIB, Mentioning other broadcasters]",[Broadcaster],"[HIB, Mentioning other broadcasters]",[Broadcaster],"[HIB, Mentioning other broadcasters]",[Broadcaster],,,


In [9]:
data["theefinessekidd"].keys()

dict_keys(['labeled_json_data', 'bad_utterance.jsonl', 'multi_user_context.jsonl', 'channel_setting.jsonl', 'single_user_context.jsonl'])

In [10]:
USER = "agase1"

In [11]:
data[USER]["bad_utterance.jsonl"]["message"].item()

'MOD THAT BANNED ME IS GOING TO DIE OF CANCER BEFORE 2023. I KNOW U HERE NOOB'

In [12]:
data[USER]["labeled_json_data"]

Unnamed: 0,anotation_id,step_1,target_1,step_2,target_2,step_3,target_3,comment,streamer_knowledge,twitch_knowledge
0,0,[HIB],[Others in broadcast],[HIB],[Others in broadcast],[HIB],[Others in broadcast],,,
1,1,[HIB],[Others in broadcast],[HIB],[Others in broadcast],[HIB],[Others in broadcast],,,
2,2,[HIB],[Others in broadcast],[HIB],[Others in broadcast],[HIB],[Others in broadcast],[to moderators],,


In [13]:
import pandas as pd

results = []

for user in data:
    user_data = data[user]

    # Get message and labels
    message = user_data["bad_utterance.jsonl"]["message"].item()
    df = pd.DataFrame(user_data["labeled_json_data"])

    for _, row in df.iterrows():
        steps = [row['step_1'], row['step_2'], row['step_3']]

        # Skip if any step contains "Spesific Language Only"
        if any('Specific Language Only' in label for step in steps for label in step):
            continue

        # Count how many steps contain "HIB"
        hib_count = sum('HIB' in step for step in steps)
        is_hib = 1 if hib_count >= 2 else 0

        results.append({
            'user': user,
            'message': message,
            'is_HIB': is_hib
        })

# Create final DataFrame
final_df = pd.DataFrame(results)

# Group by user and message, then majority vote on is_HIB
final_df = (
    final_df
    .groupby(['user', 'message'], as_index=False)
    .agg(lambda x: 1 if sum(x) >= len(x) / 2 else 0)  # majority voting
)


# Show or save it
print(final_df)

                           user  \
0                                 
1                             0   
2                            00   
3                    000zayn000   
4                           007   
...                         ...   
3928             zvezdaplatinum   
3929                   zwezwe18   
3930                     zync94   
3931                       zyro   
3932  zzzzzzzzzzzzzzzzzzzzzzzyx   

                                                message  is_HIB  
0                               get on some rumbleverse       0  
1              can you perma ban me plzzzz hasan dadddy       1  
2                people still play cs? eww go back 2015       1  
3                     Candy man caled you out @HasanAbi       0  
4                           today is matchhhhhhhh??????       0  
...                                                 ...     ...  
3928                             mental health OMEGALUL       0  
3929                                  <message dele

In [14]:
final_df.iloc[200:250]

Unnamed: 0,user,message,is_HIB
200,aglamaanne03,imGlitch Attention! Golden imGlitch Attention!...,0
201,agramcitizen,@Coldblood06 u stupid or wath,1
202,ahteckkkkk,EDG = EAT DOG GAMING,1
203,aic3e,WHATS HER @ YO,0
204,aidentantannn,bcneck,0
205,aidsanchex,SO BAD,1
206,aikofnbr,is ente 74 cheating btw? im seeing so many rom...,0
207,aim,"save the earth, kill YS?",1
208,aimassist,Emu taking back shots,0
209,ainul05,BOOBA,0


In [15]:
final_df.iloc[248]["message"]

'NO STELLARB PepeHands NO STELLARB PepeHands NO STELLARB PepeHands NO STELLARB PepeHands NO STELLARB PepeHands NO STELLARB PepeHands NO STELLARB PepeHands'

In [16]:
hib_counts = final_df['is_HIB'].value_counts()
print(hib_counts)

is_HIB
0    2581
1    1352
Name: count, dtype: int64


In [40]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TextClassificationPipeline
import torch

# Load HateBERT model and tokenizer
model_name = "/home/efe/Desktop/Multires_v2/Deep_Learning/HateBERT_hateval"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create classification pipeline
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, truncation=True,device = "cuda")
label_map = {
    "LABEL_0": "non_hate",
    "LABEL_1": "hate"
}
def classify_toxicity(text):
    result = pipeline(text)[0]
    return {label_map[entry['label']]: entry['score'] for entry in result}

# Drop or filter out non-strings
final_df_clean = final_df[final_df['message'].apply(lambda x: isinstance(x, str))].copy()

# Drop duplicates
final_df_unique = final_df_clean.drop_duplicates(subset='message').copy()

# Apply only to unique messages
final_df_unique['toxicity_scores'] = final_df_unique['message'].apply(classify_toxicity)



In [48]:
sample = "Damn nigger."

scores = classify_toxicity(sample)
print(scores)

{'non_hate': 0.8109367489814758, 'hate': 0.18906326591968536}


In [42]:
print(model.config.id2label)

{0: 'LABEL_0', 1: 'LABEL_1'}


In [54]:

final_df_unique['toxicity_scores'][40]

{'non_hate': 0.7272173166275024, 'hate': 0.27278268337249756}

In [None]:
def predict_hate(score_dict, threshold=0.2):
    print(score.get('hate,0'))
    return int(score_dict.get('hate', 0) > threshold)

final_df_unique['predicted_is_hate'] = final_df_unique['toxicity_scores'].apply(predict_hate)

In [58]:
from sklearn.metrics import accuracy_score, classification_report

# Just to be safe, drop missing predictions
eval_df = final_df_unique.dropna(subset=['predicted_is_HIB'])

# Calculate accuracy
accuracy = accuracy_score(eval_df['is_HIB'], eval_df['predicted_is_HIB'])
print(f"Accuracy: {accuracy:.4f}")

# Optional: get precision, recall, F1
print(classification_report(eval_df['is_HIB'], eval_df['predicted_is_HIB']))

Accuracy: 0.6418
              precision    recall  f1-score   support

           0       0.64      1.00      0.78      2394
           1       0.00      0.00      0.00      1336

    accuracy                           0.64      3730
   macro avg       0.32      0.50      0.39      3730
weighted avg       0.41      0.64      0.50      3730



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
final_df_unique.iloc[100:120]