# Data Clustering

The data is high-dimensional, which makes it harder to interpret. It would therefore be convenient to group some features into different sub-categories. For example, we could cluster the genres into a small number of representative genre groups.

- actors 
- characters
- genre

## Packages

In [182]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tqdm.notebook as tqdm
import re
from scipy import sparse

## Load Data

In [3]:
# Load the pickled tables produced by earlier steps of the project.
# SECURITY NOTE: pd.read_pickle can execute arbitrary code while unpickling,
# so these files must come from a trusted source (here: our own pipeline).
# NOTE(review): the double slash in the first path below is presumably a typo;
# POSIX file systems treat it as a single separator — confirm on other platforms.
country_df = pd.read_pickle("../../data/post_processing//country_df.pkl")
comes_from_df = pd.read_pickle("../../data/post_processing/comes_from_df.pkl")
genre_df = pd.read_pickle("../../data/post_processing/genre_df.pkl")
is_of_type_df = pd.read_pickle("../../data/post_processing/is_of_type_df.pkl")
language_df = pd.read_pickle("../../data/post_processing/language_df.pkl")
spoken_languages_df = pd.read_pickle("../../data/post_processing/spoken_languages_df.pkl")
character_df = pd.read_pickle("../../data/post_processing/character_df.pkl")
actor_df = pd.read_pickle("../../data/post_processing/actor_df.pkl")
movie_df = pd.read_pickle("../../data/post_processing/movie_df.pkl")
belongs_to_df = pd.read_pickle("../../data/post_processing/belongs_to_df.pkl")
play_df = pd.read_pickle("../../data/post_processing/play_df.pkl")
appears_in_df = pd.read_pickle("../../data/post_processing/appears_in_df.pkl")
# Unlike the tables above, this one is read from the "generated" directory.
wikipedia_imdb_mapping_table = pd.read_pickle("../../data/generated/wikipedia_imdb_mapping_df.pkl")

## Genre Clustering

Bag-of-words (BOW) creation pipeline:
- casefolding
- remove stopwords
- add single words
- add bigrams

In [235]:
# Name of the column holding the genre name; the functions below read genre
# names from this column.
GENRE_ID_COL_NAME = "genre_name"
# Name of the column used to group genre rows per movie.
MOVIE_ID_COL_NAME = "movie_id"
# Generic words and connectors passed as stop words when processing genre names.
custom_stopwords = {"movie","movies","film","films","cinema","&","and","in","of"}

def add_bigrams(words_list: list) -> list:
    """
    Extend a list of words with the bigrams formed by neighbouring words.

    :param words_list: List of words as strings.

    :return: New list holding the original words followed by every pair of
        consecutive words joined by a single space, in order of appearance.

    """
    # Pair each word with its successor; the last word has no successor.
    adjacent_pairs = zip(words_list, words_list[1:])
    joined_pairs = [first + " " + second for first, second in adjacent_pairs]
    return words_list + joined_pairs

def process_genre_name(name: str, stop_words: set) -> list:
    """
    Apply basic processing steps to the genre name:
        - Split on whitespace, "/" and "-"
        - Casefolding
        - Removal of empty tokens and stopwords
        - Bigram addition

    :param name: String for the genre name
    :param stop_words: Set of words considered as stopwords. Matching is
        case-insensitive: both the tokens and the stopwords are casefolded.

    :return: List of processed words and bigrams.

    """
    # Casefold the stopwords as well, so matching does not depend on how the
    # caller capitalised them.
    folded_stop_words = {stop_word.casefold() for stop_word in stop_words}
    # Raw string: "\s" inside a regular literal is an invalid escape sequence.
    raw_tokens = re.split(r"\s|/|-", name)
    words_list = []
    for raw_token in raw_tokens:
        # Casefold BEFORE the stopword test, otherwise capitalised stopwords
        # such as "Film" are never removed.
        token = raw_token.casefold()
        # Consecutive separators (e.g. "Action / Adventure") yield empty
        # tokens; keeping them would create empty words and broken bigrams.
        if token and token not in folded_stop_words:
            words_list.append(token)
    return add_bigrams(words_list)

def generate_vocabulary(genre_dataframe: pd.DataFrame, stop_words: set) -> set:
    """
    Generate the vocabulary out of the given genre DataFrame.

    :param genre_dataframe: Pandas DataFrame containing the data for the genre names.
        The genre names are read from the GENRE_ID_COL_NAME column after
        resetting the index, so they may live either in the index or in a column.
    :param stop_words: Set of words considered as stopwords.

    :return: Set of words and bigrams contained in the vocabulary. An empty
        DataFrame yields an empty set.

    """
    genre_names = genre_dataframe.reset_index()[GENRE_ID_COL_NAME]
    # Grow the set incrementally instead of summing lists: summing copies the
    # accumulated list at every step (quadratic) and does not return a list
    # when there are no rows.
    vocabulary = set()
    for genre_name in genre_names:
        vocabulary.update(process_genre_name(genre_name, stop_words))
    return vocabulary

def generate_BOW_matrix(movie_genre_dataframe: pd.DataFrame, genre_dataframe: pd.DataFrame,
                        stop_words: set) -> tuple:
    """
    Generate the BOW matrix using the following pipeline:
        - Process genre names
        - Create vocabulary
        - Create BOW matrix

    :param movie_genre_dataframe: Pandas DataFrame containing the association between genres and movies.
    :param genre_dataframe: Pandas DataFrame containing the data for the genre names.
    :param stop_words: Set of words considered as stopwords.

    :return: Return the vocabulary, the ordered list of movie ids for the BOW matrix, and the BOW matrix.
        The matrix has one row per movie id (sorted) and one column per
        vocabulary entry; columns follow the iteration order of the returned
        vocabulary set, i.e. ``list(vocabulary)``.

    """
    # Create Vocabulary.
    vocabulary = generate_vocabulary(genre_dataframe, stop_words)
    # Fix the column of every vocabulary entry once, instead of rebuilding the
    # vocabulary list (and the set of words) for every single comparison.
    column_of_word = {word: column for column, word in enumerate(vocabulary)}
    n_columns = len(column_of_word)

    def encode_genre_name(name: str) -> np.ndarray:
        """Return the 0/1 indicator vector of the vocabulary entries found in a genre name."""
        indicator = np.zeros(n_columns, dtype=int)
        for word in process_genre_name(name, stop_words):
            column = column_of_word.get(word)
            # Words absent from the vocabulary are ignored.
            if column is not None:
                indicator[column] = 1
        return indicator

    # Create one-hot encoding of words for each genre-movie pair.
    movie_genre_words_df = movie_genre_dataframe.copy()
    movie_genre_words_df[GENRE_ID_COL_NAME] = movie_genre_words_df[GENRE_ID_COL_NAME].apply(
            encode_genre_name)
    # Aggregates by summation into a BOW representation the different movies.
    grouped_movie_genre_df = movie_genre_words_df.groupby(MOVIE_ID_COL_NAME)[
            GENRE_ID_COL_NAME].apply(sum)
    grouped_movie_genre_df = grouped_movie_genre_df.sort_index()
    # Stack the per-movie vectors into a 2-D array (np.vstack rejects an empty
    # list, hence the explicit empty case).
    if len(grouped_movie_genre_df) == 0:
        BOW_matrix = np.zeros((0, n_columns), dtype=int)
    else:
        BOW_matrix = np.vstack(list(grouped_movie_genre_df))
    return vocabulary, grouped_movie_genre_df.index, BOW_matrix

In [225]:
# Build the genre bag-of-words representation: is_of_type_df is passed as the
# movie-genre association table and genre_df as the table of genre names.
vocabulary, movie_ids, BOW_matrix = generate_BOW_matrix(is_of_type_df, genre_df, custom_stopwords)