In [1]:
import os
import shutil
import numpy as np
import matplotlib.pyplot as plt
import random
import json

# Read Binary and Range Data

In [2]:
dataset_dir = 'HumorDB_/Dataset'
dataset_images = []
train_images = []
valid_images = []
test_images = []
train_funny = []
train_not_funny = []
valid_funny = []
valid_not_funny = []
test_funny = []
test_not_funny = []
for fold in ['Train', 'Valid', 'Test']:
    for fold1 in ['Funny', 'Not_Funny']:
        for file in os.listdir(os.path.join(dataset_dir, fold, fold1)):
            dataset_images.append(file)
            if fold == 'Train':
                train_images.append(file)
                if fold1 == 'Funny':
                    train_funny.append(file)
                else:
                    train_not_funny.append(file)
            elif fold == 'Valid':
                valid_images.append(file)
                if fold1 == 'Funny':
                    valid_funny.append(file)
                else:
                    valid_not_funny.append(file)
            else:
                test_images.append(file)
                if fold1 == 'Funny':
                    test_funny.append(file)
                else:
                    test_not_funny.append(file)
print('Dataset size:', len(dataset_images))
print('Train size:', len(train_images), len(train_funny), len(train_not_funny))
print('Valid size:', len(valid_images), len(valid_funny), len(valid_not_funny))
print('Test size:', len(test_images), len(test_funny), len(test_not_funny))

#sanity check
for file in train_images:
    assert file not in valid_images
    assert file not in test_images

for file in valid_images:
    assert file not in train_images
    assert file not in test_images

for file in test_images:
    assert file not in train_images
    assert file not in valid_images


Dataset size: 3545
Train size: 2136 1068 1068
Valid size: 703 351 352
Test size: 706 352 354


In [3]:
def read_binary_ratings(file_path):
    user_ratings = {}
    count_diff = 0
    num_repeat = 0
    all_ratings = []
    with open(os.path.join(file_path)) as f:
        lines = f.readlines()
        i = 4
        while i < len(lines):
            line = lines[i]
            line = line.strip()
            assert ',' in line or i == len(lines) - 1
            line = line.split(',')
            assert line[1] in ['0', '1']
            all_ratings.append(line)
            if line[0].strip() in user_ratings:
                num_repeat += 1
                if user_ratings[line[0].strip()] != int(line[1].strip()):
                    count_diff += 1
            else:
                user_ratings[line[0].strip()] = int(line[1].strip())
            i += 1
    fir_len = len(all_ratings[0])
    for i in range(len(all_ratings)):
        if len(all_ratings[i]) != fir_len:
            print('fix_len', file_path)
            break
    if count_diff > 3:
        print(file_path, count_diff)
    return user_ratings, count_diff, num_repeat

def read_range_ratings(file_path):
    user_ratings = {}
    count_diff = 0
    num_repeat = 0
    with open(os.path.join(file_path)) as f:
        lines = f.readlines()
        i = 4
        while i < len(lines):
            line = lines[i]
            line = line.strip()
            assert ',' in line or i == len(lines) - 1
            line = line.split(',')
            if line[0].strip() in user_ratings:
                num_repeat += 1
                assert np.abs(user_ratings[line[0]] - int(line[1])) <= 4
                if np.abs(user_ratings[line[0].strip()] - int(line[1].strip())) > 2:
                    count_diff += 1
            else:
                user_ratings[line[0].strip()] = int(line[1].strip())
            i += 1
        
    if count_diff > 4:
        print(file_path, count_diff)
    return user_ratings, count_diff, num_repeat

In [4]:
binary_dir = 'HumorDB_/user_binary'
range_dir = 'HumorDB_/user_range'

# for i, file in enumerate(os.listdir(binary_dir)):
#     os.rename(os.path.join(binary_dir, file), os.path.join(binary_dir, f'user_{i}.txt'))

# for i, file in enumerate(os.listdir(range_dir)):
#     os.rename(os.path.join(range_dir, file), os.path.join(range_dir, f'user_{i}.txt'))

In [5]:
all_binary_ratings = {}
all_range_ratings = {}
for file in os.listdir(binary_dir):
    user_ratings, count_diff, num_repeat = read_binary_ratings(os.path.join(binary_dir, file))
    if num_repeat <= 0:
        print(f"Number of repeats for {file} is {num_repeat}")
    
    for key, value in user_ratings.items():
        if key in all_binary_ratings:
            all_binary_ratings[key].append(value)
        else:
            all_binary_ratings[key] = [value]

for file in os.listdir(range_dir):
    user_ratings, count_diff, num_repeat = read_range_ratings(os.path.join(range_dir, file))
    if num_repeat <= 0:
        print(f"Number of repeats for {file} is {num_repeat}")
    
    for key, value in user_ratings.items():
        if key in all_range_ratings:
            all_range_ratings[key].append(value)
        else:
            all_range_ratings[key] = [value]

In [6]:
#check sanity
for key, value in all_binary_ratings.items():
    if key not in dataset_images:
        continue
    if len(value) < 5:
        print(f"Img {key} has less than 5 ratings in binary ratings")
    if len(all_range_ratings[key]) < 5:
        print(f"Img {key} has less than 5 ratings in range ratings")

In [7]:
range_ratings = {}
range_ratings_mean = {}
count = 0
for key, value in all_range_ratings.items():
    new_value = []
    mean = np.mean(value)
    std = np.std(value)
    for v in value:
        if np.abs((v - mean)/std > 1.95):
            continue
        new_value.append(v)
    assert len(new_value) >= 1
    range_ratings[key] = new_value
    range_ratings_mean[key] = np.mean(new_value)

binary_ratings = {}
binary_ratings_mean = {}
for key, value in all_binary_ratings.items():
    new_value = []
    mean = np.mean(value)
    std = np.std(value)
    for v in value:
        if np.abs((v - mean)/std > 1.95):
            continue
        new_value.append(v)
    assert len(new_value) >= 1
    binary_ratings[key] = new_value
    binary_ratings_mean[key] = np.mean(new_value)

  if np.abs((v - mean)/std > 1.95):
  if np.abs((v - mean)/std > 1.95):


In [8]:
#calculate mean and std for binary/range ratings across train/funny, train/not_funny, valid/funny, valid/not_funny, test/funny, test/not_funny

bin_ratings = {}
ran_ratings = {}
for fold in ['Train', 'Valid', 'Test']:
    for fold1 in ['Funny', 'Not_Funny']:
        for file in os.listdir(os.path.join(dataset_dir, fold, fold1)):
            key = f"{fold}_{fold1}"
            if key not in bin_ratings:
                bin_ratings[key] = []
            bin_ratings[key].append(binary_ratings_mean[file])
            if key not in ran_ratings:
                ran_ratings[key] = []
            ran_ratings[key].append(range_ratings_mean[file])

for key, value in bin_ratings.items():
    print(f"{key} binary mean: {np.mean(value)} std: {np.std(value)}")

for key, value in ran_ratings.items():
    print(f"{key} range mean: {np.mean(value)} std: {np.std(value)}")

Train_Funny binary mean: 0.7920070150407228 std: 0.18764270837849592
Train_Not_Funny binary mean: 0.10171177397300994 std: 0.16594740169812525
Valid_Funny binary mean: 0.7829082440193551 std: 0.18574061907748785
Valid_Not_Funny binary mean: 0.10082972582972584 std: 0.16856507918991645
Test_Funny binary mean: 0.7667455808080806 std: 0.18668785801313897
Test_Not_Funny binary mean: 0.11110326428123041 std: 0.17052988007355036
Train_Funny range mean: 5.74312639928552 std: 1.424560486514096
Train_Not_Funny range mean: 3.601980681615513 std: 1.7587706898048678
Valid_Funny range mean: 5.678216720712951 std: 1.4096577689290064
Valid_Not_Funny range mean: 3.645642812049062 std: 1.6491873072297587
Test_Funny range mean: 5.589136526584304 std: 1.3803706978875394
Test_Not_Funny range mean: 3.402394921462718 std: 1.6852166870395475


In [9]:
#sanity check
for file in train_funny+valid_funny+test_funny:
    assert file not in train_not_funny+valid_not_funny+test_not_funny
    assert binary_ratings_mean[file] >= 0.5

for file in train_not_funny+valid_not_funny+test_not_funny:
    assert file not in train_funny+valid_funny+test_funny
    assert binary_ratings_mean[file] < 0.5

In [10]:
# with open('HumorDB_/range_ratings_mean.txt', 'w') as f:
#     for key, value in range_ratings_mean.items():
#         f.write(f"{key},{value}\n")

# with open('HumorDB_/binary_ratings_mean.txt', 'w') as f:
#     for key, value in binary_ratings_mean.items():
#         f.write(f"{key},{value}\n")

# Read Comparison Task

In [11]:
#for comp task

def extract_ratings_words(filename):
    ratings = []
    with open(filename, 'r') as f:
        lines = f.readlines()
        for line in lines[4:]:
            line = line.strip().split(',')
            ratings.append((line[0], line[1], int(line[2]), line[3]))
            # print('Error', filename, key, value)
    return ratings

compare_representatives = ['M319.jpg',
 'M37.jpg',
 'O1058.jpg',
 'M637.jpg',
 'O1709.jpg',
 'O1360.jpg',
 'O1748.jpg',
 'O8.jpg']

compare_reps2ids = {comp_rep:i for i, comp_rep in enumerate(compare_representatives)}
all_vectors = {}
for image in dataset_images:
    all_vectors[image] = np.zeros(len(compare_representatives))

In [12]:
all_done = set()
comp_dir = 'HumorDB_/user_comp'
image_words = {}
for file in os.listdir(comp_dir):
    all_ratings = extract_ratings_words(os.path.join(comp_dir, file))
    user_ratings = {}
    count_diff = 0
    num_repeat = 0
    for rating in all_ratings:
        img1, img2, comp_res, word = rating
        if comp_res == 1:
            if img1 not in image_words:
                image_words[img1] = []
            image_words[img1].append(word)
        else:
            if img2 not in image_words:
                image_words[img2] = []
            image_words[img2].append(word)
        imgs_key = tuple(sorted([img1, img2]))
        if imgs_key[0] != img1:
            comp_res = 2 if comp_res == 1 else 1
        if imgs_key in user_ratings:
            if user_ratings[imgs_key][0] != comp_res:
                count_diff += 1
            num_repeat += 1
        else:
            user_ratings[imgs_key] = (comp_res, word)
    
    if num_repeat <= 0:
        print(f"Number of repeats for {file} is {num_repeat}")
    if count_diff > 3:
        print(file, count_diff)
    
    for key, value in user_ratings.items():
        img1, img2 = key
        comp_res, word = value
        if img1 in compare_representatives and img2 in compare_representatives:
            if comp_res == 1:
                all_vectors[img1][compare_reps2ids[img2]] += 1
                all_vectors[img2][compare_reps2ids[img1]] -= 1
            else:
                all_vectors[img1][compare_reps2ids[img2]] -= 1
                all_vectors[img2][compare_reps2ids[img1]] += 1
            all_done.add((img1, img2))
            all_done.add((img2, img1))
        elif img1 in compare_representatives:
            rep_image = img1
            other_image = img2
            if comp_res == 1:
                all_vectors[other_image][compare_reps2ids[rep_image]] -= 1
            else:
                all_vectors[other_image][compare_reps2ids[rep_image]] += 1
            all_done.add((other_image, rep_image))
        elif img2 in compare_representatives:
            rep_image = img2
            other_image = img1
            if comp_res == 1:
                all_vectors[other_image][compare_reps2ids[rep_image]] += 1
            else:
                all_vectors[other_image][compare_reps2ids[rep_image]] -= 1
            all_done.add((other_image, rep_image))
        else:
            #dummy data ignore
            continue

10.csv 5
14.csv 4
5eac2a8bf51f94237e265659.csv 5
5f52db48fd3fc15e1711a91a.csv 4
6101327737f1cdc226011570.csv 4
6138b05474d6dbffc96ea42f.csv 4
653be97654266dc6f45b80ca.csv 4
99.csv 4


In [13]:
#sanity check
for image in dataset_images:
    for rep_image in compare_representatives:
        if (image, rep_image) not in all_done and (rep_image, image) not in all_done:
            print(f"Image {image} and {rep_image} not compared")

# Find Words Common

In [14]:
#extract common words
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from collections import Counter
from itertools import tee, zip_longest

def common_words(strings, percentage):
    # Tokenize and tag parts of speech for each string
    tagged_words = []
    for string in strings:
        words = word_tokenize(string)
        tagged_words.extend(pos_tag(words))

    # Filter out common nouns and verbs
    common_nouns_verbs = [word for word, pos in tagged_words if pos.startswith('NN') or pos.startswith('VB')]
    word_counts = nltk.FreqDist(common_nouns_verbs)
    threshold = len(strings) * percentage / 100
    common = [word for word, count in word_counts.items() if count >= threshold]

    return common

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

def ngrams(tokens, n):
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def common_phrases(strings, percentage, n=2):
    all_tokens = ' '.join(strings).split()

    # Generate n-grams
    all_ngrams = ngrams(all_tokens, n)

    # Count the occurrences of each n-gram
    ngram_counts = Counter(all_ngrams)
    threshold = len(strings) * percentage / 100


    common_phrases = [phrase for phrase, count in ngram_counts.items() if count >= threshold]

    return common_phrases

#sanity check
strings = ["hello world", "hello everyone", "hello there", "world is hello"]
percentage = 30  
print(common_phrases(strings, percentage))
print(common_words(strings, percentage))

[]
['hello', 'world']


In [15]:
image_common_words = {}
count = []
non_count = []
total = []
commoned = 0
for image, words in image_words.items():
    percentage = 30
    common = common_words(words, percentage)
    if len(common) > 0:
        image_common_words[image] = common
        count.append(image)
        commoned += len(words)
    else:
        non_count.append(image)
    total.append(image)
print(len(image_common_words), len(count), len(total), commoned, len(non_count))

2863 2863 3455 34402 592


In [16]:
# with open('image_common_words.txt', 'w') as f:
#     for key, value in image_common_words.items():
#         f.write(f"{key}, {' '.join(value)}\n")
len(image_words['M37.jpg']), len(image_common_words['M37.jpg']), image_common_words['M37.jpg']

(1755, 1, ['surgery'])

In [17]:
#good to check
funny = 0
not_funny = 0
for image in dataset_images:
    if np.mean(binary_ratings[image]) >= 0.5:
        if image in image_common_words:
            funny += 1
    else:
        if image in image_common_words:
            not_funny += 1
funny, not_funny

(1476, 1387)

In [18]:
#for comp task map to either 1, -1, 0
for image, vector in all_vectors.items():
    for i in range(len(vector)):
        if vector[i] > 0:
            vector[i] = 1
        elif vector[i] < 0:
            vector[i] = -1

In [19]:
labels_dict = {}
for image, vector in all_vectors.items():
    for idx, v in enumerate(vector):
        rep_image = compare_representatives[idx]
        labels_dict[f"{image}_{rep_image}"] = v
json.dump(labels_dict, open('labels_dict.json', 'w'))

# Cluster using Comparison Ratings

In [20]:
from gph.python import ripser_parallel

# import utils
import numpy as np
from gtda.homology._utils import _postprocess_diagrams

# to generate dataset
from sklearn import datasets

# plotting
import matplotlib.pyplot as plt
from gtda.plotting import plot_diagram, plot_point_cloud
import json

In [21]:
#make numpy array
#for mapping back ids
all_vectors_np_ids = {}
num_iter = 0
for image, vector in all_vectors.items():
    all_vectors_np_ids[num_iter] = image
    num_iter += 1
all_vectors_np = [[] for i in range(len(all_vectors_np_ids))]
for i, image in all_vectors_np_ids.items():
    all_vectors_np[i] = all_vectors[image]
all_vectors_np = np.array(all_vectors_np)
print(all_vectors_np.shape, len(all_vectors_np_ids))

(3545, 8) 3545


In [22]:
num_zeros = 0
total_possible_ratings = all_vectors_np.shape[0] * all_vectors_np.shape[1]
total_correct= 0
conflict_ratings = 0
for image, vector in all_vectors.items():
    for idx in range(len(vector)):
        rep_image = compare_representatives[idx]
        other_better = int(binary_ratings_mean[image] > binary_ratings_mean[rep_image])
        if vector[idx] == 0:
            num_zeros += 1
            continue
        actual_rating = int(vector[idx] > 0)
        if vector[idx] > 0:
            if binary_ratings_mean[image] < 0.5 and binary_ratings_mean[rep_image] >= 0.5:
                conflict_ratings += 1
        elif vector[idx] < 0:
            if binary_ratings_mean[image] >= 0.5 and binary_ratings_mean[rep_image] < 0.5:
                conflict_ratings += 1
        else:
            raise ValueError
        if actual_rating == other_better:
            total_correct += 1
num_zeros, total_correct, total_possible_ratings, conflict_ratings

(2746, 20234, 28360, 1551)

In [23]:
# compute the persistence diagram
dgm = ripser_parallel(all_vectors_np, maxdim=2, n_threads=8)

print("Processed dgm")
# comnvert to gtda format
dgm_gtda = _postprocess_diagrams([dgm["dgms"]], "ripser", (0, 1, 2), np.infty, True)[0]
print("dgm data")
# plot
plot_diagram(dgm_gtda, homology_dimensions=(0, 1, 2))

Processed dgm
dgm data


In [42]:
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import rbf_kernel

# Example data
X = all_vectors_np

# Normalize data
gamma = 0.1
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# similarity_matrix = rbf_kernel(X_scaled, gamma=gamma)

# Apply spectral clustering
sc = SpectralClustering(n_clusters=3, affinity='rbf', gamma=gamma, assign_labels='cluster_qr')
labels = sc.fit_predict(X_scaled)

# Evaluate clustering quality
silhouette_avg = silhouette_score(X_scaled, labels)
davies_bouldin = davies_bouldin_score(X_scaled, labels)
calinski_harabasz = calinski_harabasz_score(X_scaled, labels)
print(f"Silhouette Score: {silhouette_avg:.3f}")
print(f"Davies-Bouldin Index: {davies_bouldin:.3f}")
print(f"Calinski-Harabasz Index: {calinski_harabasz:.3f}")


Silhouette Score: 0.202
Davies-Bouldin Index: 1.665
Calinski-Harabasz Index: 1032.719
