In [None]:
import json
import pandas as pd
from detoxify import Detoxify
import matplotlib.pyplot as plt
import torch
from transformers import pipeline
import ast
import numpy as np
import re
from sentence_transformers import SentenceTransformer
import torch
from tqdm import trange

In [None]:
# Load the test split; each key/value pair of the JSON object becomes one (text, label) row.
with open('test.json', mode='r', encoding='utf-8') as file:
    data = json.load(file)

test_pairs = list(data.items())
df_test = pd.DataFrame(test_pairs, columns=['text', 'label'])
df_test

In [None]:
# Load the train split; each key/value pair of the JSON object becomes one (text, label) row.
with open('train.json', mode='r', encoding='utf-8') as file:
    data = json.load(file)

train_pairs = list(data.items())
df_train = pd.DataFrame(train_pairs, columns=['text', 'label'])
df_train

In [None]:
# Merge both splits into one table. ignore_index=True gives every row a unique
# 0..n-1 label; without it the test rows would repeat the train rows' labels.
df = pd.concat([df_train, df_test], ignore_index=True)
df.to_csv('all_russian.csv', index=False)
df

In [None]:
# Keep only the texts whose label is 1 (the jokes) and report how many there are.
is_joke = df['label'] == 1
funny_texts = df.loc[is_joke, 'text'].tolist()
len(funny_texts)

# Detoxify multilingual (XLM RoBERTa)

In [None]:
# Run Detoxify on the GPU when torch can see one, otherwise on the CPU.
use_gpu = torch.cuda.is_available()
device = 'cuda' if use_gpu else 'cpu'
model = Detoxify('multilingual', device=device)

def chunker(seq, size):
    """Return a generator over consecutive slices of ``seq`` of at most ``size`` items.

    The last slice is shorter when ``len(seq)`` is not a multiple of ``size``;
    an empty ``seq`` produces no slices.
    """
    starts = range(0, len(seq), size)
    return (seq[start:start + size] for start in starts)

# Score the jokes chunk by chunk and assemble the table once at the end:
# concatenating onto a growing DataFrame inside the loop re-copies every
# previously collected row on each iteration.
chunk_frames = []
for chunk in chunker(funny_texts, 10000):
    results = model.predict(chunk)
    chunk_df = pd.DataFrame(results)
    chunk_frames.append(chunk_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# there was nothing to score.
if chunk_frames:
    toxicity_scores_df = pd.concat(chunk_frames, ignore_index=True)
else:
    toxicity_scores_df = pd.DataFrame()

In [None]:
# Display the Detoxify score table built in the previous cell.
toxicity_scores_df

In [None]:
toxicity_scores_df.to_csv('toxicity_scores_detoxify.csv', index=False)      #file containing the Detoxify predictions for each toxicity label

# ruBERT toxic classifier

In [None]:
# Toxicity classifier checkpoint loaded from the Hugging Face hub.
model_name = "IlyaGusev/rubertconv_toxic_clf"
# Use the first CUDA device when available (-1 selects the CPU), matching the
# device selection used for the other models in this notebook.
pipe = pipeline(
    "text-classification",
    model=model_name,
    tokenizer=model_name,
    framework="pt",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
funny_texts = df[df['label'] == 1]['text'].tolist()
# truncation=True is forwarded to the tokenizer so that a joke longer than the
# model's maximum sequence length is cut instead of aborting the whole run.
result = pipe(funny_texts, truncation=True)

In [None]:
# Number of predictions returned by the classifier pipeline.
len(result)

In [None]:
# Persist the raw pipeline output as its Python literal text; later cells
# parse it back with ast.literal_eval.
serialized_result = str(result)
with open('tox_results.txt', 'w') as f:          #file containing ruBERT predictions
    f.write(serialized_result)

In [None]:
# Read the saved predictions back and parse the Python literal they were stored as.
with open('tox_results.txt') as file:
    data_string = file.read()
scores_cls = ast.literal_eval(data_string)

In [None]:
# Peek at the first ten parsed predictions.
scores_cls[:10]

## Toxicity analysis

In [None]:
# Parse the saved classifier predictions and export them as a CSV table.
with open('tox_results.txt') as file:
    data_string = file.read()
scores_cls = ast.literal_eval(data_string)

df = pd.DataFrame(scores_cls)
df.to_csv('tox_results.csv', index=False)
df

In [None]:
# 'all_russian.csv' holds every train+test row, while both toxicity files were
# produced from the jokes only (label == 1). Keep the same subset, in the same
# order, and renumber the rows so that the prediction columns attached in the
# following cells line up with the texts one-to-one.
df = pd.read_csv('all_russian.csv')
df = df[df['label'] == 1].reset_index(drop=True)
df

In [None]:
# Start the comparison table with the text column only.
dataset = pd.DataFrame({'text': df['text']})
dataset

In [None]:
# ruBERTConv Toxicity Classifier predictions, parsed from the saved literal.
with open('tox_results.txt') as file:
    data_string = file.read()
scores_cls = ast.literal_eval(data_string)

df = pd.DataFrame(scores_cls)
df

In [None]:
# Encode the classifier verdict as 1 for 'toxic' and 0 for any other label.
labels = [1 if verdict == 'toxic' else 0 for verdict in df['label']]
dataset['rubert'] = labels
dataset

In [None]:
# Rebinds `df` to the Multilingual Detoxify (XLM RoBERTa) score table saved earlier.
df = pd.read_csv('toxicity_scores_detoxify.csv')           #Multilingual Detoxify (XLM RoBERTa) toxicity predictions
df

In [None]:
# Attach the Detoxify 'toxicity' score as the `xlm` column, row by row in file order.
xlm = df['toxicity'].tolist()
dataset['xlm'] = xlm
dataset

In [None]:
# index=False: the row number is not data; writing it would come back as an
# extra 'Unnamed: 0' column when the file is read again.
dataset.to_csv('compare_tox.csv', index=False)           #comparable table

In [None]:
# Rebinds `df` to the comparison table (text plus both toxicity columns) saved above.
df = pd.read_csv('compare_tox.csv')
df

In [None]:
# A joke is treated as toxic when ruBERT flags it or the Detoxify score reaches 0.1;
# every such row is removed.
is_toxic = (df['rubert'] == 1) | (df['xlm'] >= 0.1)
drop_i = df.index[is_toxic].tolist()
filt_df = df.drop(drop_i)
filt_df

In [None]:
# The index is written too (no index=False), so the saved file carries the
# row labels that survived the toxicity filter as an unnamed first column.
filt_df.to_csv('rus_filtered.csv')            #detoxified Russian jokes

# Deduplication process

In [None]:
# Rebinds `dataset` to the toxicity-filtered jokes; its rows get a fresh 0..n-1 index on read.
dataset = pd.read_csv("rus_filtered.csv")        #read results from the previous step
dataset

In [None]:
BATCH_SIZE = 256


def product(x, y=None, batch_size=None):
    """Return the matrix of dot products between the rows of ``x`` and the rows of ``y``.

    Args:
        x: matrix exposing ``.dot``, ``.shape`` and row slicing
            (assumed to be a 2-D NumPy array).
        y: second matrix; ``x`` itself is used when omitted.
        batch_size: when given, the rows of ``x`` are multiplied in slices of
            this many rows (with a progress bar) and stored in a preallocated
            float16 array. When omitted, the product is computed in a single
            ``dot`` call and keeps whatever dtype that call produces.

    Returns:
        Array of shape ``(x.shape[0], y.shape[0])``.
    """
    other = x if y is None else y
    if batch_size is None:
        return x.dot(other.T)

    n_batches = (len(x) + batch_size - 1) // batch_size
    out = np.zeros((x.shape[0], other.shape[0]), dtype='float16')
    for batch_no in trange(n_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        out[start:stop] = x[start:stop].dot(other.T)
    return out

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch


def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighting each token by its mask value.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: tensor with one dimension fewer than the embeddings;
            it is expanded over the last dimension before weighting, so
            positions whose mask is 0 do not contribute to the average.

    Returns:
        The masked sum of the embeddings divided by the masked token count.
        The count is clamped to at least 1e-9 so an all-zero mask does not
        divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


In [None]:
from transformers import AutoTokenizer, AutoModel

model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
SIM_THRESHOLD = 0.923          # pairs with cosine similarity >= this value are reported as duplicates


if __name__ == '__main__':
    # The checkpoint name is SentenceTransformer's first positional parameter
    # (model_name_or_path); the constructor has no `model` keyword.
    model = SentenceTransformer(model, device=device)
    jokes = list(dataset['text'])

    with torch.no_grad():
        x = []
        for i in trange((len(jokes) + BATCH_SIZE - 1) // BATCH_SIZE):
            batch = jokes[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
            x.append(model.encode(batch))
    # Drop the references to the encoder and the text copies before the n x n
    # similarity matrix is built. `dataset` is kept on purpose: a later cell
    # removes the duplicate rows from it.
    model, jokes, batch = None, None, None

    x = np.concatenate(x)
    x /= np.linalg.norm(x, axis=1, keepdims=True)       # unit-length rows, so the dot product is the cosine similarity
    x = product(x, batch_size=BATCH_SIZE)               # pairwise cosine similarity matrix
    # ~np.tri(...) is True strictly above the diagonal, so each pair is reported
    # once (indexes_1 < indexes_2) and self-pairs are skipped.
    indexes_1, indexes_2 = np.where((x >= SIM_THRESHOLD) & (~np.tri(len(x), dtype=bool)))
    x = x[indexes_1, indexes_2]
    data = pd.DataFrame({'indexes_1': indexes_1, 'indexes_2': indexes_2, 'cos': x})
    data.to_csv('sbert_duplicates_ru.csv', index=False)  # row-position pairs of near-duplicate jokes with their similarity
data

In [None]:
ind_drop = []
ind_keep = []
kept_lookup = set()      # mirrors ind_keep; gives O(1) membership tests instead of scanning the list

# Iterate the NumPy columns directly so the values keep their NumPy scalar
# types, exactly as element-wise access on the DataFrame columns would return.
pairs = zip(data['indexes_1'].to_numpy(), data['indexes_2'].to_numpy(), data['cos'].to_numpy())
for first, second, cos in pairs:
    if first in kept_lookup:
        if cos >= 0.923:                       #from a set of duplicating jokes with cosine similarity of at least 0.923,
            ind_drop.append(second)            #only the first joke is left
            ind_keep.append(second)
            kept_lookup.add(second)
    else:
        ind_keep.append(first)
        kept_lookup.add(first)
        ind_drop.append(second)
# NOTE(review): both branches add the second index of the pair to ind_drop, so
# a joke is dropped whenever it appears as `indexes_2` of any listed pair —
# confirm this is the intended "keep the first joke" rule.

In [None]:
len(set(ind_drop))                          #number of distinct rows marked for removal as duplicates

In [None]:
# Re-read the stage input so the drop always starts from the complete filtered
# table, whatever earlier cells did to the in-memory `dataset`; this also makes
# the cell safe to run more than once. The indexes in ind_drop are row
# positions, which match the 0..n-1 labels assigned on read.
dataset = pd.read_csv("rus_filtered.csv").drop(ind_drop)
dataset.to_csv('dataset_without_dups_ru.csv', index=False)                #dataset without duplicates

# Sentiment analysis #

In [None]:
BATCH_SIZE = 2048
model = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"           #zero-shot classifier that tags each joke with one content label
# NOTE(review): this checkpoint appears to be trained on English NLI data — confirm it is suitable for Russian text.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if __name__ == '__main__':
    # Read the file written by the deduplication stage so the pipeline chains
    # end to end. The previous input, 'rus_filtered_without_dup_test.tsv', is
    # not produced by any cell of this notebook.
    data = pd.read_csv('dataset_without_dups_ru.csv')

    labels = []
    classifier = pipeline("zero-shot-classification", model=model, device=device)
    candidate_labels = ["politics", "neutral", "offending", "alcohol", "racist", "drugs"]
    print('model is ready')
    for i in trange((len(data['text']) + BATCH_SIZE - 1) // BATCH_SIZE):
        inputs = list(data.iloc[i*BATCH_SIZE:(i+1)*BATCH_SIZE]['text'])
        outputs = classifier(inputs, candidate_labels, multi_label=False)
        if isinstance(outputs, dict):
            # Guard for a bare dict returned for a one-element batch
            # (presumably only some transformers versions do this).
            outputs = [outputs]
        # Keep the highest-scoring candidate label for each joke. A flat list
        # also works when `data` is empty, where np.concatenate would raise.
        labels.extend(output['labels'][np.argmax(output['scores'])] for output in outputs)

    data['label'] = labels
    data.to_csv('labeled_dataset_without_dups_ru.csv', index=False)        #file containing deduplicated jokes with labels

In [None]:
data = pd.read_csv('labeled_dataset_without_dups_ru.csv')          #read previous stage result or comment this line

# Jokes carrying any of these labels are removed as inappropriate.
# NOTE(review): 'racist' is among the candidate labels assigned by the
# zero-shot cell but is not removed here — confirm this is intended.
unwanted_labels = ['politics', 'offending', 'drugs', 'alcohol']
drop_i = data.index[data['label'].isin(unwanted_labels)].tolist()
data = data.drop(drop_i)
data.to_csv('filtered_ru.csv', index=False)                        #filtered jokes
data

# Topic modeling #

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Topic representation and clustering components handed to BERTopic.
# NOTE(review): KMeans is created without random_state, so topic numbering can
# differ between runs; the hard-coded topic ids dropped at the end of the
# notebook only match the run they were read from.
representation_model = KeyBERTInspired()
cluster_model = KMeans(n_clusters=100)             #choosing modeling algorithm
topic_model = BERTopic(
    language="russian",
    verbose=True,
    representation_model=representation_model,
    hdbscan_model=cluster_model,
)

In [None]:
# Rebinds `df` to the label-filtered jokes that the topic model is fitted on.
df = pd.read_csv('filtered_ru.csv')                 #reading previous stage results
df

In [None]:
# Fit on the full text column; `topics` and `probs` are the two values returned by fit_transform.
topics, probs = topic_model.fit_transform(df['text'].to_list())        #fitting BERTopic model

In [None]:
# Topic overview table from the fitted model; the index is written to the CSV as well.
freq = topic_model.get_topic_info()
freq.to_csv('BERTopik_ru_kmeans_100_.csv')         #file containing 100 topics with examples and key words

In [None]:
# First ten rows of the topic overview table.
freq.head(10)

In [None]:
import plotly.io as pio
# Switch plotly's default renderer to 'iframe' before showing the figure.
pio.renderers.default='iframe'
topic_model.visualize_topics().show()                    #the bubble clusters representation

In [None]:
from sentence_transformers import SentenceTransformer

# Encode every joke with a sentence-transformers model; the embeddings feed the UMAP plot below.
# NOTE(review): these embeddings are computed separately from the topic model — confirm both use the same encoder.
docs = df['text'].to_list()
sentence_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [None]:
import matplotlib.pyplot as plt
import umap
import numpy as np
from adjustText import adjust_text
import spacy
from deep_translator import GoogleTranslator

# Helpers for the alternative cluster plot: a spaCy pipeline to lemmatize the
# topic keywords and a translator that renders them in English.
nlp = spacy.load("ru_core_news_sm")
translator = GoogleTranslator(source='auto', target='en')

# Project the sentence embeddings to two dimensions for plotting.
umap_model = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.1, metric='cosine')
reduced_embeddings = umap_model.fit_transform(embeddings)

document_info = topic_model.get_document_info(docs)
topic_sizes = topic_model.get_topic_freq()

# Centroid of each topic = mean 2-D position of the documents assigned to it.
centroids = {}
for topic in topic_sizes.Topic:
    member_rows = document_info.index[document_info.Topic == topic]
    centroids[topic] = reduced_embeddings[member_rows].mean(axis=0)

topic_labels = topic_model.get_topic_info()

# One integer per topic, numbered in order of first appearance, used as the
# colour value of each document in the scatter plot.
unique_topics = document_info.Topic.unique()
topic_to_color = dict(zip(unique_topics, range(len(unique_topics))))
colors = [topic_to_color[doc_topic] for doc_topic in document_info.Topic]

def lemmatize_and_translate(label):
    """Return ``"<lemma> (<translation>)"`` for the first ``_``-separated part of ``label``.

    The part is run through the module-level ``nlp`` pipeline, the lemma of
    its first token is taken and passed to the module-level ``translator``.
    This is best-effort: if any step raises, ``label`` is returned unchanged.
    """
    try:
        head = label.split('_')[0]
        lemma = nlp(head)[0].lemma_
        english = translator.translate(lemma)
        return f"{lemma} ({english})"
    except Exception:
        # Deliberate fallback: show the raw label rather than fail the plot.
        return label

fig, ax = plt.subplots(figsize=(14, 10))

# One dot per document, coloured by its topic.
scatter = ax.scatter(
    reduced_embeddings[:, 0], reduced_embeddings[:, 1],
    c=colors, cmap='Spectral', s=2, alpha=0.5,
)

label_box = dict(facecolor='white', alpha=0.6, edgecolor='black', boxstyle='round,pad=0.5')
texts = []
for topic, size in topic_sizes.itertuples(index=False):
    if size <= 300:  # show annotations for large topics only
        continue
    cluster_center = centroids[topic]
    topic_name = topic_labels.loc[topic_labels.Topic == topic, 'Name'].values[0]
    # Take the second '_'-separated part of the name
    # (presumably the first keyword after the topic id — verify the Name format).
    topic_label = topic_name.split('_')[1]
    annotated_label = lemmatize_and_translate(topic_label)
    annotation = ax.text(cluster_center[0], cluster_center[1], annotated_label,
                         fontsize=10, ha='center', va='center', bbox=label_box)
    texts.append(annotation)

# Nudge overlapping labels apart, linking each to its anchor with a thin line.
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

ax.set_title("UMAP projection of BERTopic clusters")
plt.show()


In [None]:
# Pickle serialization: only load the resulting file from a trusted source,
# since unpickling can execute arbitrary code.
topic_model.save("topic_model", serialization="pickle")      #saving our topic model

In [None]:
# Rebinds `df` to the per-document table returned by the topic model (the filter below uses its 'Topic' column).
df = topic_model.get_document_info(df['text'].to_list())
df

In [None]:
# Topic ids judged inappropriate after reading the cluster names; every joke
# assigned to one of them is removed.
# NOTE(review): these ids refer to one specific clustering run — re-check them
# whenever the topic model is refitted.
drop_topics = [2, 4, 6, 10, 12, 13, 17, 28, 29, 38, 51, 58, 59, 76, 78, 87, 90, 94, ]
drop_index = df.index[df['Topic'].isin(drop_topics)].tolist()
df = df.drop(drop_index)
df

In [None]:
# Write the remaining rows (all columns of the document-info table) without the index.
df.to_csv('clean_comedy_ru.csv', index=False)                      #final cleaned dataset