## Deep Watching


In [None]:
from collections import defaultdict
import json
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to the figures created below.
plt.style.use('ggplot')

In [None]:
# SET THE FILE HERE
# Path of the JSON file that is opened and parsed into `data` below.
FILE = "out-bandera_test_best.json"

In [None]:
# Parse the detection results.  The cells below use `data` as a mapping
# video_id -> list of frames -> list of detected objects, where every object is
# a dict providing 'label', 'percentage' and 'bbox'.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform default.
with open(FILE, 'r', encoding='utf-8') as fi:
    data = json.load(fi)

In [None]:
# available classes
#
# Each entry pairs the label used in the detection results with the English
# name used for table columns and plot labels.  Keeping both in one table makes
# the positional alignment between CLASSES and CLASSES_EN explicit.
_CLASS_NAME_PAIRS = (
    ("georgsband", "st. george"),
    ("hakenkreuz", "swastika"),
    ("hammer&sichel", "hammer&sickle"),
    ("adler_wappen", "eagle"),
    ("ukraine", "ukraine"),
    ("flagge_ukr", "flag_ukr"),
    ("falanga", "falanga"),
    ("flagge_upa", "flag_upa"),
    ("oun", "oun"),
    ("swoboda", "swoboda"),
    ("ss_rune", "ss"),
    ("Wolfsangel", "wolfsangel"),
)

# Labels exactly as they are compared against obj['label'] in the cells below.
CLASSES = [original for original, _ in _CLASS_NAME_PAIRS]

# English display names, in the same order as CLASSES.
CLASSES_EN = [english for _, english in _CLASS_NAME_PAIRS]

# Absolute Count of found Labels

In [None]:
# Tally, for every video, how many detections of each known class occur across
# all of its frames.  Objects whose label is not listed in CLASSES are ignored.
video_ids = []
count_rows = []
for video_id, frames in data.items():
    # Start every class at 0 so that videos without a symbol still get a value.
    counts = dict.fromkeys(CLASSES, 0)
    for frame in frames:
        for obj in frame:
            if obj['label'] in counts:
                counts[obj['label']] += 1
    video_ids.append(video_id)
    count_rows.append(counts)

# Build the frame in a single step: DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every iteration.  Because every row is dense,
# the columns keep an integer dtype and no NaN filling is needed.
absolute_counts = pd.DataFrame(count_rows, index=video_ids, columns=CLASSES)
# rename symbols (CLASSES and CLASSES_EN are aligned by position)
absolute_counts.columns = CLASSES_EN
# Displayed as the cell output: videos ordered by their 'ss' count.
absolute_counts.sort_values(by=['ss'], ascending=False)

In [None]:
# One bar chart per video: absolute number of detections per class.
# Only the class columns are selected, so any extra column that may have been
# attached to `absolute_counts` (e.g. a 'date' column) cannot end up being
# drawn as a bar when this cell is re-run.
for video_id, row in absolute_counts[CLASSES_EN].iterrows():
    class_names = list(row.index)
    bar_positions = np.arange(len(class_names))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(x=bar_positions, height=row.tolist(), width=0.5)

    ax.set_xlabel("Class Name")
    ax.set_ylabel("Absolute Count")
    ax.set_title(f"{video_id}")

    ax.set_xticks(bar_positions)
    ax.set_xticklabels(class_names, rotation=15, ha='right')

    ax.set_yscale('linear')

    plt.show()
    # Close this figure explicitly so figures do not pile up over many videos.
    plt.close(fig)

# Mean Percentage of Visible Labels

In [None]:
# For every video, average the 'percentage' value of all detections of each
# known class.  Classes that never occur in a video end up as 0.0.
video_ids = []
mean_rows = []
for video_id, frames in data.items():
    percs = defaultdict(list)
    for frame in frames:
        for obj in frame:
            if obj['label'] in CLASSES:
                percs[obj['label']].append(obj['percentage'])
    video_ids.append(video_id)
    mean_rows.append({label: np.mean(values) for label, values in percs.items()})

# Build the frame in a single step: DataFrame.append was removed in pandas 2.0
# and copied the whole frame on every iteration.  Passing `columns` fixes the
# column order and yields NaN for classes missing from a row.
percentages = pd.DataFrame(mean_rows, index=video_ids, columns=CLASSES, dtype=float)
percentages = percentages.fillna(value=0.0)
# rename symbols (CLASSES and CLASSES_EN are aligned by position)
percentages.columns = CLASSES_EN

# bar plot of the per-class mean over all videos
# make sure colors don't repeat
colors = [plt.cm.tab20(i) for i in np.linspace(0, 1, len(CLASSES))]
mean_ax = percentages.mean(axis=0).plot.bar(figsize=(14, 10), color=colors)
mean_ax.set_xlabel("Class Name")
mean_ax.set_ylabel("Mean visible percentage over all videos")

# Displayed as the cell output: videos ordered by their 'st. george' value.
percentages.sort_values(by='st. george', ascending=False)

In [None]:
# One bar chart per video: mean visible percentage per class.
# Only the class columns are selected, so any extra column that may have been
# attached to `percentages` (e.g. a 'date' column) cannot end up being drawn as
# a bar when this cell is re-run.
for video_id, row in percentages[CLASSES_EN].iterrows():
    class_names = list(row.index)
    bar_positions = np.arange(len(class_names))

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.bar(x=bar_positions, height=row.tolist(), width=0.5)

    ax.set_xlabel("Class Name")
    # Fixed typo in the axis label ("visibile").
    ax.set_ylabel("Mean visible percentage")
    ax.set_title(f"{video_id}")

    ax.set_xticks(bar_positions)
    ax.set_xticklabels(class_names, rotation=15, ha='right')

    plt.show()
    # Close this figure explicitly so figures do not pile up over many videos.
    plt.close(fig)

# Found labels per frame

In [None]:
from matplotlib.ticker import MaxNLocator

In [None]:
# One scatter plot per video: for every detection, the frame number (1-based)
# against the detection's 'percentage' value.

# Fixed colour per class, so a symbol keeps the same colour in every figure
# instead of depending on which class happens to be detected first in a video.
class_colors = {
    label: tuple(rgba)
    for label, rgba in zip(CLASSES, plt.cm.tab20(np.linspace(0, 1, len(CLASSES))))
}

for video_id, frames in data.items():
    per_label = defaultdict(list)
    for frame_number, frame in enumerate(frames, start=1):
        for obj in frame:
            if obj['label'] in CLASSES:
                per_label[obj['label']].append((frame_number, obj['percentage']))

    fig, ax = plt.subplots(figsize=(14, 10))

    # Iterate in CLASSES order so the legend order is stable across videos.
    for label in CLASSES:
        if label not in per_label:
            continue
        frame_numbers, visible = zip(*per_label[label])
        ax.plot(frame_numbers, visible, 'o', label=label, color=class_colors[label])

    ax.set_xlabel("Frame Number")
    ax.set_ylabel("Percentage within frame")
    ax.set_title(f"{video_id}")
    # Only draw a legend when something was plotted; an empty legend just warns.
    if per_label:
        ax.legend()

    ax.set_ylim(0, 1)

    plt.show()
    plt.close(fig)

## Symbols per Frame vs Frame Difference

In [None]:
# load frame differences
vid_metadata = pd.read_json("all_vids.json", orient="split")

# One figure per video: the 'frame_diffs' series from the metadata as a line,
# overlaid with every detection (frame number vs. 'percentage').
for video_id, frames in data.items():
    per_label = defaultdict(list)
    for frame_number, frame in enumerate(frames, start=1):
        for obj in frame:
            if obj['label'] in CLASSES:
                per_label[obj['label']].append((frame_number, obj['percentage']))

    # A video without a metadata row used to abort the whole loop with an
    # IndexError; it is now plotted without the frame-difference line.
    matches = vid_metadata.loc[vid_metadata['id'] == video_id, 'frame_diffs']
    if matches.empty:
        print(f"No metadata found for video {video_id}; plotting detections only.")
        diffs = []
    else:
        diffs = matches.iloc[0]

    fig, ax = plt.subplots(figsize=(14, 10))

    # plot frame diffs
    # NOTE(review): the diffs are drawn at x = 0..len-1 while detections use
    # 1-based frame numbers - confirm which frame each diff entry refers to.
    if len(diffs):
        ax.plot(range(len(diffs)), diffs, label="frame difference")

    for label, points in per_label.items():
        frame_numbers, visible = zip(*points)
        ax.plot(frame_numbers, visible, 'o', label=label)

    ax.set_xlabel("Frame Number")
    ax.set_ylabel("Percentage within frame")
    ax.set_title(f"{video_id}")
    # Only draw a legend when something was plotted; an empty legend just warns.
    if len(diffs) or per_label:
        ax.legend(loc='upper right')

    # Small margin so points at exactly 0 or 1 are not cut off.
    ax.set_ylim(-0.01, 1.01)

    plt.show()
    plt.close(fig)

In [None]:
vid_metadata = pd.read_json("all_vids.json", orient="split")

# Sum the durations of all analysed videos in seconds.
total = 0
for video_id in data:
    metadata_rows = vid_metadata.loc[vid_metadata['id'] == video_id]
    # The last whitespace-separated token of the rendered column is parsed as
    # "<hours>:<minutes>:<seconds>".
    duration = metadata_rows['duration'].to_string().split(" ")[-1]
    parts = duration.split(":")
    total += float(parts[2]) + 60 * float(parts[1]) + 3600 * float(parts[0])

# total seconds, total hours, leftover minutes, leftover seconds
print(total, total/3600, total%3600/60, total%3600%60)
# number of videos
print(len(data))

## Corpus Map using Mean Percentage

In [None]:
from sklearn.manifold import Isomap
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# plot Isomap
def plot(data, title="Isomap", clrs=None, max_val=0, words=None, labels=None,
         xlim=(-0.15, 0.075), ylim=(-0.15, 0.16)):
    """Draw a labelled 2-D scatter plot ("corpus map") of the given points.

    Parameters
    ----------
    data : sequence of points
        ``data[i][0]`` and ``data[i][1]`` are the x / y coordinates of point i.
    title : str
        Figure title.
    clrs : sequence, optional
        One entry per point: either a matplotlib colour string, or an integer
        (e.g. a cluster id) that selects a colour from the built-in palette.
        Integers wrap around when they exceed the palette size.
        Defaults to 'blue' for every point.
    max_val : number
        Unused; kept so existing calls that pass it keep working.
    words : sequence of numbers, optional
        Marker size per point (passed to ``scatter`` as ``s``); it also scales
        the offset of the text label.  Defaults to 250 for every point.
    labels : sequence, optional
        Text per point; only the part after the last ":" is drawn.
        Defaults to empty labels.
    xlim, ylim : (float, float)
        Axis limits.  The defaults are the values that used to be hard-coded.
    """
    palette = ['lightblue', 'lightgreen', 'orange', 'red', 'darkblue', 'darkgreen', 'violet', 'purple', 'hotpink', 'lemonchiffon', 'paleturquoise',  'greenyellow','pink', 'magenta', 'darksalmon']
    n_points = len(data)
    if clrs is None:
        clrs = ['blue'] * n_points
    if labels is None:
        labels = [""] * n_points
    if words is None:
        words = [250] * n_points

    fig = plt.figure(figsize=(32, 24))
    ax1 = fig.add_subplot(111)
    # NOTE: this changes the global font size, so it also affects figures
    # created after this call.
    matplotlib.rcParams.update({'font.size': 20})
    plt.title(title)
    plt.xlim(*xlim)
    plt.ylim(*ylim)
    # Booleans are required here: the strings 'off' are truthy in current
    # matplotlib versions and would leave the ticks switched on.
    ax1.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False)

    for counter in range(n_points):
        point_x, point_y = data[counter][0], data[counter][1]

        # Colour strings are used as-is (previously the default 'blue' was used
        # as a list index and raised a TypeError); integers index the palette.
        color = clrs[counter]
        if not isinstance(color, str):
            color = palette[int(color) % len(palette)]

        # vmin/vmax are not passed: they only apply to colour-mapped numeric
        # data and merely triggered a warning for a plain colour.
        ax1.scatter(point_x, point_y, s=words[counter], marker='o', c=color)

        label_scale = words[counter]/(70*2500)
        plt.annotate(str(labels[counter]).split(":")[-1],
                     xy=(point_x + label_scale, point_y + label_scale))
    plt.tight_layout()
    plt.show()

# cluster the videos by their mean visible percentage per symbol
cluster_num = 12

# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
# The class columns are selected explicitly so that only the numeric symbol
# features are clustered, even if other columns were attached to `percentages`.
matrix = percentages[CLASSES_EN].values
km = KMeans(n_clusters=cluster_num, random_state=5)
clusters = km.fit_predict(matrix)
print("Silhouette score:", silhouette_score(matrix, clusters))

# plot corpus map using Isomap; points are coloured by their k-means cluster
imap = Isomap()
corpus_map = imap.fit_transform(matrix)
plot(corpus_map, "Corpus Map", max_val=cluster_num, clrs=clusters, labels=percentages.index)

In [None]:
## absolute count over time

# Look up each video's 'date' metadata value directly instead of parsing it out
# of the printed representation of a Series.  Videos without a metadata row get
# NaT and are dropped below.
dates = []
for video_id in absolute_counts.index:
    video_dates = vid_metadata.loc[vid_metadata['id'] == video_id, 'date']
    dates.append(pd.to_datetime(video_dates.iloc[0]) if len(video_dates) else pd.NaT)

# Work on a copy indexed by date.  `absolute_counts` itself is left untouched:
# attaching a 'date' column to it breaks cells that sum or plot all of its
# columns when they are (re-)run afterwards.
new_counts = absolute_counts[CLASSES_EN].copy()
new_counts.index = pd.DatetimeIndex(dates, name='date')
new_counts = new_counts[new_counts.index.notna()]

# make sure colors don't repeat
colors = [plt.cm.tab20(i) for i in np.linspace(0, 1, len(CLASSES))]

# group by month
new_counts = new_counts.groupby(pd.Grouper(freq='M')).sum()
# Months with a count of 0 are hidden in the combined plot.
new_counts.replace(0, np.nan).plot(figsize=(14,10), color=colors, marker='o', linestyle="")

# one line plot per symbol
for column in new_counts.columns:
    new_counts.plot(y=column)

In [None]:
# percentage over time

# Look up each video's 'date' metadata value directly instead of parsing it out
# of the printed representation of a Series.  Videos without a metadata row get
# NaT and are dropped below.
dates = []
for video_id in percentages.index:
    video_dates = vid_metadata.loc[vid_metadata['id'] == video_id, 'date']
    dates.append(pd.to_datetime(video_dates.iloc[0]) if len(video_dates) else pd.NaT)

# Work on a copy indexed by date.  `percentages` itself is left untouched:
# attaching a 'date' column to it breaks cells that use all of its columns as
# numeric features when they are (re-)run afterwards.
new_percs = percentages[CLASSES_EN].copy()
new_percs.index = pd.DatetimeIndex(dates, name='date')
new_percs = new_percs[new_percs.index.notna()]

# make sure colors don't repeat
colors = [plt.cm.tab20(i) for i in np.linspace(0, 1, len(CLASSES))]

# group by month
new_percs = new_percs.groupby(pd.Grouper(freq='M')).mean()

# all symbols in one figure
ax = new_percs.plot(figsize=(14,10), color=colors, marker='o', linestyle="")

# the three Ukraine-related symbols only
new_percs.plot(figsize=(14,10),marker='o', linestyle="",y=['flag_upa', 'flag_ukr', 'ukraine'])

# one line plot per symbol; months without any video count as 0
new_percs_filled = new_percs.fillna(0)
for column in new_percs_filled.columns:
    new_percs_filled.plot(figsize=(14,10),y=column)

In [None]:
import matplotlib.patches as patches

# plot symbol locations
#
# For every class, all bounding boxes of all videos are drawn as translucent
# rectangles in one figure, with coordinates divided by the video's width and
# height so that videos of different resolution share the same axes.

# class label -> video_id -> list of bounding boxes
# (replaces a DataFrame grown with DataFrame.append, which pandas 2.0 removed)
boxes_by_class = {label: {} for label in CLASSES}
for video_id, frames in data.items():
    for frame in frames:
        for obj in frame:
            if obj['label'] in boxes_by_class:
                boxes_by_class[obj['label']].setdefault(video_id, []).append(obj['bbox'])

# video_id -> (width, height), looked up once per video.  Videos without a
# metadata row are reported instead of being skipped silently.
frame_sizes = {}
for video_id in data:
    size = vid_metadata.loc[vid_metadata['id'] == video_id, ['width', 'height']]
    if size.empty:
        print(f"No metadata found for video {video_id}; its boxes are not drawn.")
        continue
    frame_sizes[video_id] = (float(size['width'].iloc[0]), float(size['height'].iloc[0]))

for column in CLASSES:
    fig1, ax1 = plt.subplots(figsize=(16, 9))
    ax1.set_title(column)
    # Flip the y axis so that y grows downwards.
    ax1.invert_yaxis()

    for video_id, video_boxes in boxes_by_class[column].items():
        if video_id not in frame_sizes:
            continue
        width, height = frame_sizes[video_id]
        # Each bbox is unpacked as (x1, y1, x2, y2).
        for x1, y1, x2, y2 in video_boxes:
            x1 /= width
            x2 /= width
            y1 /= height
            y2 /= height
            ax1.add_patch(patches.Rectangle((x1, y1), x2-x1, y2-y1, alpha=0.03))

    # Show the figure once per class, after the boxes of ALL videos were added.
    # Previously show/close ran inside the per-video loop, so the figure was
    # displayed and closed after the first video and later boxes never appeared.
    plt.show()
    plt.close(fig1)

In [None]:
# symbol cooccurrences
import matplotlib.colors as clrs


def _show_cooccurrence(matrix, title):
    """Draw `matrix` as a log-scaled heat map labelled with the English class names."""
    positive = matrix[matrix > 0]
    if positive.size == 0:
        print(f"{title}: no co-occurrences found, nothing to plot.")
        return

    fig, ax = plt.subplots(figsize=(18, 10))
    ax.grid(False)
    # LogNorm requires vmin <= vmax.  0.1 is kept as the lower bound for raw
    # counts, but it is lowered when the largest value is itself below 0.1,
    # which can happen after normalisation.
    vmin = min(0.1, positive.min())
    image = ax.imshow(matrix, norm=clrs.LogNorm(vmin=vmin, vmax=matrix.max()))
    fig.colorbar(image, ax=ax)
    ax.set_xticks(range(len(CLASSES)))
    ax.set_xticklabels(CLASSES_EN, rotation=90)
    ax.set_yticks(range(len(CLASSES)))
    ax.set_yticklabels(CLASSES_EN)
    ax.set_title(title)
    plt.show()
    plt.close(fig)


# cooc_counts[a, b]: number of (detection of class a, detection of class b)
# pairs found within the same frame, for a != b.  The diagonal stays 0.
cooc_counts = np.zeros(shape=(len(CLASSES), len(CLASSES)))

for frames in data.values():
    for frame in frames:
        objects = [CLASSES.index(obj['label']) for obj in frame if obj['label'] in CLASSES]
        for a in objects:
            for b in objects:
                # Compare by value; `is not` on ints only works by accident of
                # CPython's small-integer caching.
                if a != b:
                    cooc_counts[a, b] += 1

_show_cooccurrence(cooc_counts, "Symbol co-occurrences (absolute)")

# adjust for total occurrences: divide every row by the total number of
# detections of the row's class.  Only the class columns are summed, and only
# once, so extra columns on `absolute_counts` cannot break the sum.
totals = absolute_counts[CLASSES_EN].sum().values.astype(float)
row_totals = totals[:, np.newaxis]
# Rows whose class was never detected are left unchanged.
cooc = np.divide(cooc_counts, row_totals, out=cooc_counts.copy(), where=row_totals > 0)

# plot it
_show_cooccurrence(cooc, "Symbol co-occurrences (relative to the row symbol's total count)")