In [None]:
import os
import mechanicalsoup
import requests
import collections
import pandas as pd
from bs4 import BeautifulSoup
import feedparser

# Base URL of the site; the landing page at this address is fetched further down
# and it is also prefixed to the relative genre links found on that page.
START_URL = 'http://www.imsdb.com'
# RSS feed URL template; the `{}` placeholder is filled with a genre name.
RSS_GENRE_LINK = 'http://www.imsdb.com/feeds/genre.php?genre={}'

In [None]:
# Fetch the landing page; the parsed HTML is read from `page.soup` below.
b = mechanicalsoup.Browser()
page = b.get(START_URL)
# Fail fast on an HTTP error status instead of silently scraping links
# out of an error page (which would just yield empty link lists later on).
page.raise_for_status()

In [None]:
# Anchors whose href starts with "/alphabetical" (not used again in the visible cells).
links_alphabetical = page.soup.select('a[href^="/alphabetical"]')
# Anchors whose href starts with "/genre"; these drive the genre table built next.
links_genres = page.soup.select('a[href^="/genre"]')

In [None]:
# Build one row per genre link: the link text as genre name plus its absolute URL.
data = collections.defaultdict(list)
for link in links_genres:
    genre, url = link.text, START_URL + link.attrs['href']
    print('Retrieving genre: {}'.format(genre))
    data['genre'].append(genre)
    data['url'].append(url)
df_genres = pd.DataFrame(data)

In [None]:
# Download the RSS feed of every genre (in sorted genre order) and tag each
# feed entry with the genre whose feed it came from.
data = []
for genre in sorted(df_genres['genre'].unique()):
    url = RSS_GENRE_LINK.format(genre).lower()
    result = feedparser.parse(url)
    entries = result['entries']
    for entry in entries:
        entry['genre'] = genre
    data.extend(entries)

# One row per (feed entry, genre) pair.
df_scripts = pd.DataFrame(data)
df_scripts.head()

In [None]:
# Keep every column except those ending in '_detail' and the 'links' column.
columns = [x for x in df_scripts.columns if not (x.endswith('_detail') or x == 'links')]
df_scripts = df_scripts.loc[:, columns]

In [None]:
# For every distinct link, count its distinct genres, then tally how many
# links have 1, 2, 3, ... genres.
data = collections.Counter(
    len(df_.genre.unique()) for link, df_ in df_scripts.groupby('link')
)
(
    pd.DataFrame(list(data.items()), columns = ['genre_count', 'occurrences'])
    .set_index('genre_count')
    .sort_index()
)

In [None]:
# One sorted list of distinct genres per distinct link (groups come in sorted link order).
genres = [sorted(df_.genre.unique()) for link, df_ in df_scripts.groupby('link')]

In [None]:
import scipy.sparse
import numpy as np
import matplotlib.pyplot as plt

def get_flattened_list(l):
    """Concatenate the items of every iterable in ``l`` into a single flat list."""
    return [item for group in l for item in group]

def co_occurrences(els):
    """Count how often each pair of tags appears together in the same tag list.

    Parameters
    ----------
    els : iterable of sequences
        One non-empty sequence of tags per item. The order of tags inside a
        sequence does not matter.

    Returns
    -------
    coo_mat : numpy.matrix
        Symmetric ``(n_tags, n_tags)`` matrix. Entry ``[i, j]`` is the number of
        tag pairs (i, j) seen across all sequences, i.e. the number of sequences
        containing both tags as long as a sequence has no repeated tag.
    tag_2_idx : dict
        Tag -> row/column index (tags are indexed in sorted order).
    idx_2_tag : dict
        Row/column index -> tag.

    Raises
    ------
    AssertionError
        If any tag sequence is empty.
    """
    # Materialise the input: it is traversed twice (vocabulary, then pairs).
    els = [list(tags) for tags in els]
    unique_elements = sorted({tag for tags in els for tag in tags})
    tag_2_idx = {t: idx for idx, t in enumerate(unique_elements)}
    idx_2_tag = {idx: t for t, idx in tag_2_idx.items()}
    data, rows, cols = [], [], []
    for tags in els:
        assert len(tags)
        for i, tag1 in enumerate(tags[:-1]):
            tag1_idx = tag_2_idx[tag1]
            for tag2 in tags[i + 1:]:
                tag2_idx = tag_2_idx[tag2]
                data.append(1)
                # Always store the pair in the upper triangle. Otherwise an
                # unsorted tag list would put (a, b) and (b, a) into mirrored
                # cells and the element-wise maximum below would undercount.
                rows.append(min(tag1_idx, tag2_idx))
                cols.append(max(tag1_idx, tag2_idx))
    size = len(unique_elements)
    # Duplicate (row, col) entries are summed when the COO matrix is densified.
    coo_mat = scipy.sparse.coo_matrix((data, (rows, cols)), shape=(size, size)).todense()
    # Mirror the upper triangle into the lower one to make the matrix symmetric.
    coo_mat = np.maximum(coo_mat, coo_mat.T)
    return coo_mat, tag_2_idx, idx_2_tag

# Genre co-occurrence matrix plus the genre <-> matrix-index lookups reused by later cells.
coo_mat, tag_2_idx, idx_2_tag = co_occurrences(genres)

## Most common multi-genre combinations

In [None]:
# Count every distinct comma-joined genre combination, most frequent first.
df_ = (
    pd.DataFrame(
        list(collections.Counter(','.join(x) for x in genres).items()),
        columns=['genres', 'occurrences'],
    )
    .set_index('genres')
    .sort_values('occurrences', ascending=False)
)
# NOTE: `count(',') > 1` requires at least two commas, so only combinations of
# three or more genres are shown; two-genre pairs are filtered out.
df_[df_.index.map(lambda x: x.count(',') > 1)].head(10)

## Correlations between genres

In [None]:
# Heatmap of the lower triangle of the (symmetric) genre co-occurrence matrix.
fig, ax = plt.subplots(figsize=(10, 10))
img = ax.imshow(np.tril(coo_mat), cmap='magma_r')

# Genre names ordered by their matrix index, so tick i is labelled with genre i.
labels = sorted(tag_2_idx, key=tag_2_idx.get)
label_indices = list(range(len(labels)))

ax.set_xticks(label_indices)
ax.set_xticklabels(labels, rotation=45)
ax.set_yticks(label_indices)
ax.set_yticklabels(labels)

fig.colorbar(img)
fig.tight_layout()

## Scripts per genre

In [None]:
# Number of rows in df_scripts per genre (a Series indexed by genre name).
df_genre_occurrences = df_scripts.genre.value_counts()
# Same counts as a plain list, aligned with the heatmap's `labels` order.
genre_occurrences = [df_genre_occurrences[label] for label in labels]
fig, ax = plt.subplots(figsize=(16, 5))
# Plot the Series directly. Going through `.to_frame().genre` depends on the
# name pandas gives the counts column, which is 'count' (not 'genre') since
# pandas 2.0 and would raise an AttributeError there.
df_genre_occurrences.sort_index().plot(kind='bar', ax=ax)
fig.tight_layout()

## Correlations per occurrences per genre

In [None]:
# Column sums of the co-occurrence matrix: total pairings each genre takes part in.
correlations = np.asarray(coo_mat).sum(axis=0)
fig, ax = plt.subplots(figsize = (16, 5))
df_ = (
    pd.DataFrame(
        list(zip(genre_occurrences, correlations, labels)),
        columns=['occurrences', 'correlations', 'label'],
    )
    .set_index('label')
)
# Normalise by how often the genre occurs at all.
df_['relative_correlation'] = df_['correlations'] / df_['occurrences']
df_['relative_correlation'].plot(kind='bar', ax=ax)
fig.tight_layout()

## Create single labels for multi-genre elements

In [None]:
import sklearn
import sklearn.cluster

def get_genre_vectors(genres, tag_index=None):
    """Encode each genre list as a multi-hot row vector.

    Parameters
    ----------
    genres : sequence of sequences
        One sequence of genre names per document.
    tag_index : dict, optional
        Genre name -> column index. Defaults to the notebook-level
        ``tag_2_idx`` mapping when omitted.

    Returns
    -------
    numpy.ndarray
        Dense ``(len(genres), len(tag_index))`` array with a 1 in column ``j``
        of row ``i`` when document ``i`` carries genre ``j``. A plain ndarray
        is returned (not ``numpy.matrix``) because current scikit-learn
        estimators reject ``numpy.matrix`` input.

    Raises
    ------
    KeyError
        If a genre is missing from the mapping.
    """
    if tag_index is None:
        tag_index = tag_2_idx
    data, rows, cols = [], [], []
    for idx, tags in enumerate(genres):
        data += [1] * len(tags)
        rows += [idx] * len(tags)
        cols += [tag_index[tag] for tag in tags]
    shape = (len(genres), len(tag_index))
    return scipy.sparse.coo_matrix((data, (rows, cols)), shape=shape).toarray()

# Encode every document's genre list as one row vector.
document_genre_vectors = get_genre_vectors(genres)

NUM_CLUSTERS = 4
# sklearn.cluster estimator names noted by the author:
# Birch, FeatureAgglomeration, KMeans, MiniBatchKMeans
clf = sklearn.cluster.Birch(n_clusters=NUM_CLUSTERS).fit(document_genre_vectors)
X = clf.transform(document_genre_vectors)  # transformed features, reused by later cells
Y = clf.predict(document_genre_vectors)    # cluster id per document (the "new label")

# Bar chart of how many documents fall into each new label.
fig, ax = plt.subplots(figsize=(10, 3))
pd.Series(np.bincount(Y)).plot.bar(ax=ax)
ax.set_xlabel('New label')
ax.set_ylabel('# documents')

In [None]:
# Import the submodules explicitly: `import sklearn` / `import sklearn.cluster`
# do not guarantee that these attributes exist on the `sklearn` package.
import sklearn.metrics
import sklearn.model_selection
import sklearn.svm

# Check how well a linear classifier recovers the cluster labels Y from the
# transformed features X. Fixed seeds keep the split and the fit reproducible.
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(
    X, Y, stratify=Y, random_state=42)
classifier = sklearn.svm.LinearSVC(random_state=42)
classifier.fit(X_train, Y_train)
Y_train_pred = classifier.predict(X_train)
Y_test_pred = classifier.predict(X_test)
# Prints "<number of samples> <macro F1>" for the train split, then the test split.
for true, pred in [(Y_train, Y_train_pred), (Y_test, Y_test_pred)]:
    print(len(true), sklearn.metrics.f1_score(true, pred, average='macro'))

In [None]:
import sklearn.manifold

# Project X to two dimensions for plotting. t-SNE is stochastic, so the seed is
# fixed to get the same layout on every Restart & Run All.
tsne = sklearn.manifold.TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

In [None]:
# Scatter plot of the 2-D t-SNE embedding, one colour per new label.
cmap = plt.get_cmap('Set1')
x, y = X_tsne[:,0], X_tsne[:,1]
# Per-point colours; not used by the plotting loop below, kept for later cells.
colors = [cmap.colors[new_label] for new_label in Y]
fig, ax = plt.subplots(figsize=(10, 10))
# Pass a real boolean: the string 'off' is truthy and would switch the grid ON.
ax.grid(False)

# Iterate over the labels actually present, so no points are skipped when the
# label ids are not exactly 0..k-1.
for label in sorted(set(Y)):
    color = cmap.colors[label]
    # `color=` (not `c=`): a bare RGB tuple passed as `c` is ambiguous and gets
    # value-mapped when the cluster happens to contain 3 or 4 points.
    ax.scatter(x[Y == label], y[Y == label], color=color, label=label)
ax.legend()
ax.set_xticks([])
ax.set_yticks([]);

In [None]:
def get_samples_for_classes(genres, new_labels, num_samples=5):
    """Draw random example genre lists for every new label.

    Parameters
    ----------
    genres : sequence of sequences
        One genre list per document.
    new_labels : array-like of int
        Label of each document, aligned with ``genres``.
    num_samples : int, optional
        Number of examples drawn per label. Drawing is done with replacement
        via ``np.random.choice``, so an example may repeat.

    Returns
    -------
    list of numpy.ndarray
        One 1-D object array per distinct label (in ascending label order),
        each holding ``num_samples`` genre lists.
    """
    # Accept plain lists as well; `list == int` would not compare element-wise.
    new_labels = np.asarray(new_labels)
    # Fill an object array element by element: genre lists have different
    # lengths and `np.array(genres)` raises for such ragged input on NumPy >= 1.24.
    genre_lists = np.empty(len(genres), dtype=object)
    for idx, tags in enumerate(genres):
        genre_lists[idx] = tags
    samples = []
    # Use the labels actually present, so non-contiguous ids never yield an
    # empty candidate set (which np.random.choice rejects).
    for label in np.unique(new_labels):
        elements = np.where(new_labels == label)[0]
        choice = np.random.choice(elements, size=num_samples)
        samples.append(genre_lists[choice])
    return samples
    
# Print the sampled genre lists of every new label, one sample per line with
# each genre padded to a 15-character column.
for samples in get_samples_for_classes(genres, Y):
    print('New class:')
    for sample in samples:
        print('\t' + ''.join('{:15}'.format(x) for x in sample))
        