In [None]:
!pip install pandas==2.1.4
!pip install numpy==1.26.4
!pip install torch==2.0.1
!pip install transformers==4.38.2
!pip install sentence-transformers==2.5.1
!pip install detoxify==0.5.2
!pip install bertopic==0.16.2
!pip install scikit-learn==1.5.0
!pip install plotly==5.22.0
!pip install matplotlib==3.9.0
!pip install umap==0.1.1

In [None]:
import os
import numpy as np
import pandas as pd
import re
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import torch
from tqdm import trange

# Initial preprocessing #

In [None]:
# Directory (relative to the working directory) whose files concatenate_and_drop_duplicates reads.
LOCAL_PATH = 'raw_data'
# The same directory, anchored to the working directory at the moment this cell runs.
GLOBAL_PATH = os.path.join(os.getcwd(), LOCAL_PATH)
# Character-length bounds handed to sorting_by_length as min_length / max_length.
MIN_LENGTH, MAX_LENGTH = 30, 150

In [None]:
def concatenate_and_drop_duplicates():
    """Read every file in GLOBAL_PATH, stack them and drop repeated jokes.

    Files whose name ends with ``txt`` are read as one joke per line (the first
    line is skipped); every other file is parsed with ``pd.read_csv`` and its
    ``humor_rating`` column, if present, is discarded. When a file has no
    ``text`` column it is built as ``title + ' ' + selftext``.

    A normalised ``raw_text`` column (lower-cased, runs of non-word characters
    collapsed to one space, stripped) is computed for every file -- including
    the ``txt`` ones -- and used as the deduplication key.

    Returns:
        pd.DataFrame: deduplicated rows with whitespace runs collapsed to a
        single space in all string cells.

    Raises:
        ValueError: if GLOBAL_PATH contains no files.
    """
    print('Concatenation...', end=' ')
    frames = []
    # List and open from the same directory; sorted so that "keep the first
    # duplicate" does not depend on the order the OS returns the files in.
    for name in sorted(os.listdir(GLOBAL_PATH)):
        file_path = os.path.join(GLOBAL_PATH, name)
        if name.endswith('txt'):
            with open(file_path, 'r', encoding='ISO-8859-1') as f:
                lines = f.readlines()[1:]  # first line is skipped
            df = pd.DataFrame({'text': [line.strip() for line in lines]})
        else:
            df = pd.read_csv(file_path).drop(columns='humor_rating', errors='ignore')
        if 'text' not in df.columns:
            df['text'] = df['title'] + ' ' + df['selftext']
        # Built for both branches: without it every txt row would carry a NaN
        # key and drop_duplicates would collapse all of them into one row.
        df['raw_text'] = df['text'].str.lower().replace(r'\W+', ' ', regex=True).str.strip()
        frames.append(df)
    if not frames:
        raise ValueError(f'no input files found in {GLOBAL_PATH!r}')
    # Concatenate once instead of growing the frame inside the loop.
    full_df = pd.concat(frames, ignore_index=True)
    print('End...')
    df = full_df.drop_duplicates(subset='raw_text', ignore_index=True).replace(r'\s+', ' ', regex=True)
    print(f'{len(df)} examples')
    return df


def delete_bad_examples(df):
    """Keep only the jokes that fit the common structure.

    A row survives when its ``text`` consists solely of ASCII letters, spaces
    and the punctuation marks ``? ! . , ; : " '``. Rows with a missing ``text``
    are removed as well. The result is re-indexed from zero.
    """
    total = len(df)
    print('Removing examples with symbols not in [^a-zA-Z ?!.,;:"\']...', end=' ')
    # na=True marks missing texts as "contains a forbidden symbol".
    has_forbidden = df['text'].str.contains('[^a-zA-Z ?!.,;:"\']', na=True)
    kept = df.loc[~has_forbidden].reset_index(drop=True)
    print('End...')
    print(f'{len(kept)}/{total} examples left')
    return kept


def sorting_by_length(df, column_name='raw_text', min_length=None, max_length=None):
    """Order rows by the string length of ``column_name`` and keep a length range.

    Args:
        df: input frame; any index is accepted.
        column_name: string column whose character length is measured.
        min_length: inclusive lower bound; defaults to the shortest length found.
        max_length: inclusive upper bound; defaults to the longest length found.

    Returns:
        pd.DataFrame: rows with a non-missing value in ``column_name`` whose
        length lies in ``[min_length, max_length]``, sorted by ascending length
        and re-indexed from zero.
    """
    initial_len = len(df)
    print('Sorting by lengths...', end=' ')
    # Re-index the lengths positionally so the index surviving the filter below
    # can be passed to .iloc whatever index the caller's frame carries.
    sorted_lengths = df[column_name].str.len().reset_index(drop=True).sort_values().dropna()
    if min_length is None:
        min_length = sorted_lengths.min()
    if max_length is None:
        max_length = sorted_lengths.max()
    sorted_lengths = sorted_lengths[(sorted_lengths >= min_length) & (sorted_lengths <= max_length)]
    df = df.iloc[sorted_lengths.index].reset_index(drop=True)
    print('End...')
    print(f'{len(df)}/{initial_len} examples left')
    return df


def get_setup_and_punchline(df, random_state=None):
    """Split one-piece jokes into a set-up and a punchline.

    Rows where both ``title`` and ``selftext`` are missing are treated as
    "not divided". For each separator in turn (``?``, ``!``, ``.``, a run of
    ``!?.``, ``:``, ``;``, ``,``) a still-undivided joke is split when its
    text -- with trailing punctuation/space removed -- breaks into exactly two
    parts. The first part plus the separator (``.`` for the multi-character
    pattern) becomes ``title``; the second part becomes ``selftext``.

    Args:
        df: frame with ``score``, ``title``, ``selftext`` and ``text`` columns.
            It is not modified.
        random_state: forwarded to ``DataFrame.sample`` for the final shuffle;
            ``None`` (default) keeps the shuffle unseeded.

    Returns:
        pd.DataFrame: shuffled frame with columns ``score``, ``set-up`` and
        ``punchline``; rows that could not be divided, or whose punchline has
        four characters or fewer, are removed.
    """
    initial_len = len(df)
    # Work on a copy: the .loc assignments below would otherwise write into the caller's frame.
    df = df.copy()
    # Columns that are entirely missing are read as float64; make them able to hold strings.
    df[['title', 'selftext']] = df[['title', 'selftext']].astype(object)
    not_divided = df['title'].isna() & df['selftext'].isna()
    print(f'Start: {not_divided.sum()}/{len(df)} examples are not divided')
    text = df['text'].str.rstrip('?!.:;, ')
    # Single-character separators are matched literally by str.split, longer ones as a regex.
    symbols = ['?', '!', '.', '[!?.]+', ':', ';', ',']
    for symbol in symbols:
        n_parts = text.str.split(symbol).str.len()
        mask = not_divided & (n_parts == 2)
        not_divided = not_divided & (n_parts != 2)
        print(f'After split by "{symbol}": {not_divided.sum()} examples are not divided yet')
        # The pieces are taken from the untrimmed text, so the punchline keeps its closing punctuation.
        split = df.loc[mask, 'text'].str.split(symbol)
        closing = symbol if len(symbol) == 1 else '.'
        df.loc[mask, 'title'] = split.map(lambda x: x[0].strip() + closing)
        df.loc[mask, 'selftext'] = split.map(lambda x: x[1].strip())
    df = df.rename(columns={'title': 'set-up', 'selftext': 'punchline'}).dropna(subset=['set-up', 'punchline'])
    df = df.loc[:, ['score', 'set-up', 'punchline']].sample(frac=1, random_state=random_state).reset_index(drop=True)
    df = df[df.punchline.str.len() > 4].reset_index(drop=True)
    print(f'{len(df)}/{initial_len} examples left')
    return df


# Alternative entry point: build the frame from the raw files instead of reading all_data.csv.
#dataset = concatenate_and_drop_duplicates()
dataset = (
    pd.read_csv('all_data.csv')  # concatenated english humour datasets
    .pipe(delete_bad_examples)
    .pipe(sorting_by_length, min_length=MIN_LENGTH, max_length=MAX_LENGTH)
    .pipe(get_setup_and_punchline)
)
# Persist the stage result so the following stages can restart from disk.
dataset.to_csv('initially_preprocessed_data_en.csv', index=False)
dataset

# Deduplication process #

In [None]:
#dataset = pd.read_csv('initially_preprocessed_data_en.csv')    #read the previous stage result
# Joke as a single string: set-up and punchline joined by one space.
dataset['full_joke'] = dataset['set-up'].str.cat(dataset['punchline'], sep=" ")
dataset

In [None]:
BATCH_SIZE = 128


def product(x, y=None, batch_size=None):
    """Return the matrix of dot products between the rows of ``x`` and ``y``.

    ``y`` defaults to ``x``. Without ``batch_size`` the result is simply
    ``x.dot(y.T)``. With ``batch_size`` the rows of ``x`` are processed in
    slices of that size (with a progress bar) and written into a pre-allocated
    float16 matrix of shape ``(x.shape[0], y.shape[0])``.
    """
    other = x if y is None else y
    if batch_size is None:
        return x.dot(other.T)

    out = np.zeros((x.shape[0], other.shape[0]), dtype='float16')
    n_batches = (len(x) + batch_size - 1) // batch_size  # ceiling division
    for step in trange(n_batches):
        start = step * batch_size
        stop = start + batch_size
        out[start:stop] = x[start:stop].dot(other.T)
    return out

In [None]:
# Use an accelerator when one is available; assign `device` by hand to override (cpu, cuda, mps).
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
model = 'all-mpnet-base-v2'  # Sentence BERT model

encoder = SentenceTransformer(model, device=device)
# Punctuation is stripped and the text lower-cased before embedding.
jokes = list(dataset['full_joke'].replace(to_replace=r'[^\w\s]', value='', regex=True).str.lower())
with torch.no_grad():
    x = []
    for i in trange((len(jokes) + BATCH_SIZE - 1) // BATCH_SIZE):
        batch = jokes[i*BATCH_SIZE:(i+1)*BATCH_SIZE]
        x.append(encoder.encode(batch))
# Release the encoder and the text copies before the n x n similarity matrix is built.
# `dataset` is deliberately NOT cleared: a later cell removes the duplicate rows from it.
encoder, jokes, batch = None, None, None

x = np.concatenate(x)
x /= np.linalg.norm(x, axis=1, keepdims=True)  # unit-length rows, so dot products are cosines
x = product(x, batch_size=BATCH_SIZE)  # cosine similarity calculation
# Keep only pairs strictly above the diagonal (indexes_1 < indexes_2), so each pair appears once.
indexes_1, indexes_2 = np.where((x >= 0.7) & (~np.tri(len(x), dtype=bool)))  # threshold 0.7, adjust if necessary
x = x[indexes_1, indexes_2]
data = pd.DataFrame({'indexes_1': indexes_1, 'indexes_2': indexes_2, 'cos': x})
data.to_csv('sbert_duplicates_en.csv', index=False)  # file containing indexes of duplicating jokes
data

In [None]:
COS_THRESHOLD = 0.7  # a pair counts as duplicates when its cosine similarity is strictly above this

# From every group of near-duplicate jokes only the first one is kept.
# Pairs are visited in ascending (indexes_1, indexes_2) order; this relies on
# indexes_1 < indexes_2 in every row (true when `data` comes from the
# upper-triangle selection -- verify if the file was produced differently).
# By the time a joke shows up as indexes_1, every pair that could have removed
# it has already been processed.
pairs = (
    data.loc[data['cos'] > COS_THRESHOLD, ['indexes_1', 'indexes_2']]
    .sort_values(['indexes_1', 'indexes_2'])
)

ind_keep = []    # kept jokes that have at least one later duplicate
dropped = set()  # a set gives O(1) membership checks instead of scanning a list

for first, second in zip(pairs['indexes_1'].tolist(), pairs['indexes_2'].tolist()):
    if first in dropped:
        # `first` was itself removed as a duplicate of an earlier joke, so it no longer acts as an anchor.
        continue
    if not ind_keep or ind_keep[-1] != first:
        ind_keep.append(first)
    # Every partner of a kept joke is removed -- including the first one
    # (previously the first pair of each anchor only registered the anchor).
    dropped.add(second)

ind_drop = sorted(dropped)

In [None]:
len(set(ind_drop))  # number of distinct jokes marked for removal as duplicates

In [None]:
# Remove the rows whose index labels were collected in ind_drop, then persist the result.
dataset = dataset.drop(index=ind_drop)
dataset.to_csv(path_or_buf='dataset_without_dups_en.csv', index=False)  # dataset without duplicates

# Sentiment analysis #

In [None]:
BATCH_SIZE = 2048
# Use an accelerator when one is available; assign `device` by hand to override (cpu, cuda, mps).
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
model = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"  # zero-shot classifier for sentiment analysis

if __name__ == '__main__':
    data = pd.read_csv('dataset_without_dups_en.csv')
    #data = dataset

    classifier = pipeline("zero-shot-classification", model=model, device=device)
    candidate_labels = ["politics", "neutral", "racist", "offending", "drugs", "alcohol"]
    print('model is ready')

    # One label per joke, collected in row order. A flat list (rather than a list of
    # per-batch lists concatenated afterwards) also works when there are no rows at all.
    labels = []
    for i in trange((len(data['full_joke']) + BATCH_SIZE - 1) // BATCH_SIZE):
        inputs = data['full_joke'].iloc[i*BATCH_SIZE:(i+1)*BATCH_SIZE].tolist()
        outputs = classifier(inputs, candidate_labels, multi_label=False)
        # Keep the candidate label with the highest score for each joke.
        labels.extend(output['labels'][int(np.argmax(output['scores']))] for output in outputs)

    data['label'] = labels
    data.to_csv('labeled_dataset_without_dups_en.csv', index=False)  # file containing deduplicated jokes with labels

In [None]:
data = pd.read_csv('labeled_dataset_without_dups_en.csv')  # read previous stage result or comment this line

# Labels whose jokes are removed as inappropriate; only 'neutral' jokes stay.
INAPPROPRIATE_LABELS = ['politics', 'offending', 'racist', 'drugs', 'alcohol']

# Vectorised membership test instead of one `if` per label inside a Python loop.
is_inappropriate = data['label'].isin(INAPPROPRIATE_LABELS)
drop_i = data.index[is_inappropriate].tolist()
data = data.drop(drop_i)
data.to_csv('filtered_en.csv', index=False)  # filtered jokes
data

# Topic modeling #

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [None]:
from umap import UMAP

# Fixed seed for the stochastic parts of the topic model. Topic ids are hard-coded
# further down in the notebook, which only makes sense if a rerun can reproduce
# the same fit (same data and library versions assumed).
SEED = 42

representation_model = KeyBERTInspired()
cluster_model = KMeans(n_clusters=100, random_state=SEED)  # choosing modeling algorithm
# Same settings BERTopic uses for its default dimensionality reduction, plus a seed.
reduction_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine',
                       low_memory=False, random_state=SEED)
topic_model = BERTopic(
    language="english",
    verbose=True,
    representation_model=representation_model,
    umap_model=reduction_model,
    hdbscan_model=cluster_model,  # BERTopic accepts any clustering estimator in this slot
)

In [None]:
df = pd.read_csv('filtered_en.csv')         # read the previous stage's result (jokes that passed the label filter)
#df = data                                  # alternative: reuse the in-memory frame from the filtering cell
df

In [None]:
# Fit BERTopic on the joke texts. `topics` is expected to hold one topic id per
# input document, in the order of df['full_joke'] (per the BERTopic API).
topics, probs = topic_model.fit_transform(df['full_joke'].to_list())  # fitting BERTopic model

In [None]:
# Topic overview table returned by BERTopic, saved to disk.
# NOTE(review): unlike the other stages, to_csv is called without index=False here,
# so the frame's index is written as an extra first column.
freq = topic_model.get_topic_info()
freq.to_csv('BERTopik_eng_kmeans_100.csv')  # file containing 100 topics with examples and key words

In [None]:
# Preview the first ten rows of the topic overview table.
freq.head(10)

In [None]:
import plotly.io as pio
# Make 'iframe' the default plotly renderer for every figure shown from here on.
pio.renderers.default='iframe'
topic_model.visualize_topics().show()  # the bubble clusters representation

In [None]:
# Sentence embeddings of every joke; the plotting cell projects them to 2-D.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
docs = df['full_joke'].tolist()
embeddings = sentence_model.encode(docs, show_progress_bar=True)

In [None]:
import matplotlib.pyplot as plt
import umap
import numpy as np
from adjustText import adjust_text

# 2-D projection of the sentence embeddings, used only for this figure.
umap_model = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.1, metric='cosine')  # another clusters representation
reduced_embeddings = umap_model.fit_transform(embeddings)

document_info = topic_model.get_document_info(docs)
topic_sizes = topic_model.get_topic_freq()
doc_topics = document_info.Topic.to_numpy()  # topic id of every document, in `docs` order

# Centroid of each topic in the projected space; a boolean mask selects the
# topic's documents positionally, independent of the frame's index.
centroids = {}
for topic in topic_sizes.Topic:
    centroids[topic] = np.mean(reduced_embeddings[doc_topics == topic], axis=0)

topic_labels = topic_model.get_topic_info()

fig, ax = plt.subplots(figsize=(14, 10))

# Colour each point by its numeric topic id through the colormap
# (the former `topic_to_color` lookup was never defined and raised NameError).
scatter = ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1],
                     c=doc_topics,
                     cmap='Spectral', s=2, alpha=0.5)

texts = []
for topic, size in topic_sizes.itertuples(index=False):
    if size > 100:  # showing labels only for big clusters
        cluster_center = centroids[topic]
        topic_label = topic_labels[topic_labels.Topic == topic].Name.values[0].split('_')[1]  # only first word from cluster name
        texts.append(ax.text(cluster_center[0], cluster_center[1], topic_label, fontsize=10, ha='center', va='center',
                             bbox=dict(facecolor='white', alpha=0.6, edgecolor='black', boxstyle='round,pad=0.5')))

# Nudge overlapping labels apart and connect each to its original position.
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='black'))

ax.set_title("BERTopic clusters of CleanComedy English")
ax.set_xlabel("UMAP 1")
ax.set_ylabel("UMAP 2")
plt.show()

In [None]:
# NOTE(review): pickle files can execute arbitrary code when loaded -- only load this
# artefact from a trusted source.
topic_model.save("topic_model_en", serialization="pickle")  # saving our topic model

In [None]:
# NOTE(review): these ids were picked by reading the cluster names of one particular
# fit; re-check them against the topic table whenever the topic model is refitted.
drop_topics = [96, 83, 70, 38, 34, 33, 19, 6, 2]  # after reading cluster names, removing jokes from inappropriate ones

# The frame loaded from filtered_en.csv carries no 'Topic' column; the per-document
# topic ids are in `topics` (returned by fit_transform for df['full_joke'], same order).
# The guard keeps the cell re-runnable once rows have been dropped.
if 'Topic' not in df.columns:
    df = df.assign(Topic=topics)

drop_index = df.index[df['Topic'].isin(drop_topics)].tolist()
df = df.drop(drop_index)
df