# Milestone 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json

## Load tables

In [None]:
# Column names for each headerless, tab-separated file.
# Note: the character table names its movie key "WikiID" (capital W), while
# the plot and metadata tables use "wikiID".
PLOT_COLUMNS = ["wikiID", "plot"]
META_COLUMNS = [
    "wikiID", "freeID", "name", "release_date", "revenue",
    "runtime", "languages", "countries", "genres",
]
CHAR_COLUMNS = [
    "WikiID", "freeID", "release_date", "char_name", "actor_DOB",
    "actor_gender", "actor_height", "actor_ethnicity", "actor_name",
    "Actor_age", "freeID_char_map", "FreeID_char", "FreeID_actor",
]
CHAR_NAME_COLUMNS = ["char_name", "freeID_char_map"]
TROPE_COLUMNS = ["trope", "details"]

df_plot = pd.read_csv(
    "data/MovieSummaries/plot_summaries.txt", sep="\t", header=None, names=PLOT_COLUMNS
)

df_meta = pd.read_csv(
    "data/MovieSummaries/movie.metadata.tsv", sep="\t", header=None, names=META_COLUMNS
)

df_char = pd.read_csv(
    "data/MovieSummaries/character.metadata.tsv", sep="\t", header=None, names=CHAR_COLUMNS
)

df_char_names = pd.read_csv(
    "data/MovieSummaries/name.clusters.txt", sep="\t", header=None, names=CHAR_NAME_COLUMNS
)

df_tropes = pd.read_csv(
    "data/MovieSummaries/tvtropes.clusters.txt", sep="\t", header=None, names=TROPE_COLUMNS
)

## Look at the data

In [None]:
# Preview the first rows of the plot-summary table (columns: wikiID, plot).
df_plot.head()

In [None]:
# Column dtypes and non-null counts of the plot-summary table.
df_plot.info()

In [None]:
# Preview the first rows of the movie-metadata table.
df_meta.head()

In [None]:
# Column dtypes and non-null counts of the movie-metadata table.
df_meta.info()

In [None]:
# Preview the first rows of the character-metadata table.
df_char.head()

In [None]:
# Column dtypes and non-null counts of the character-metadata table.
df_char.info()

In [None]:
# Preview the character-name clusters (columns: char_name, freeID_char_map).
df_char_names.head()

In [None]:
# Column dtypes and non-null counts of the character-name clusters.
df_char_names.info()

In [None]:
# Preview the raw trope table; "details" still holds one JSON string per row.
df_tropes.head()

In [None]:
# Parse the JSON string stored in "details" and spread its keys into columns.
trope_details = pd.json_normalize(df_tropes["details"].map(json.loads).tolist())

# Swap the raw JSON column for the parsed columns and rename the "id" key so
# it matches the "freeID_char_map" column name used by the character tables.
df_tropes = (
    df_tropes
    .drop(["details"], axis=1)
    .join(trope_details)
    .rename(columns={"id": "freeID_char_map"})
)

In [None]:
# Preview the trope table after the JSON "details" column has been expanded.
df_tropes.head()

In [None]:
# Column dtypes and non-null counts of the expanded trope table.
df_tropes.info()

In [None]:
# Same preview as two cells above (df_tropes has not changed in between).
df_tropes.head()

## Join the dataframes

The dataframes are linked by the Wikipedia ID. There are also more movies with metadata (81741 movies) than with a plot summary (42303 movies), so we only keep the metadata of the movies whose plot we know.

In [None]:
# Inner join: keep only the movies present in both the metadata and the plot tables.
df_full = pd.merge(df_meta, df_plot, on="wikiID", how="inner")
df_full.head()

Only a small fraction of the characters have been labelled with a trope (500), compared to the number of unlabelled ones (450669). Here is the dataframe containing the characters that have a trope:

In [None]:
# Characters that have a trope label: inner join on the character/actor map ID.
df_inner_char = pd.merge(df_char, df_tropes, on="freeID_char_map", how="inner")

In [None]:
# Drop the "char" and "actor" columns that came from the trope JSON.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError on the missing columns.
df_inner_char = df_inner_char.drop(columns=["char", "actor"], errors="ignore")
df_inner_char.head()

In [None]:
from xml.dom import minidom

# parse an xml file by name
# NOTE(review): the CoreNLP section below opens files from this same directory
# through gzip -- confirm that this uncompressed file actually exists.
file = minidom.parse('data/corenlp_plot_summaries/3217.txt.xml')
# Printing the Document object only shows its repr, not the XML content.
print(file)

## Exploratory data analysis

### Genres, languages and countries

In [None]:
def fcount(df):
    """Count how often each (id, name) pair occurs in a column of JSON dicts.

    Parameters
    ----------
    df : pandas.Series
        Each element is a JSON object string mapping an id to a name
        (the notebook passes the "genres", "languages" and "countries"
        columns of the movie metadata). An empty object ``{}`` contributes
        nothing. Names are used as dict keys, so they must be hashable
        (JSON strings and numbers are).

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``name`` and ``count``, one row per distinct
        (id, name) pair, sorted by ``count`` in descending order with a
        fresh 0..n-1 index.
    """
    # A dict keeps first-seen insertion order, so rows enter the frame in the
    # same order as before; lookups are O(1) instead of a list scan per item,
    # and every JSON string is parsed only once.
    counts = {}
    for raw in df:
        for pair in json.loads(raw).items():
            counts[pair] = counts.get(pair, 0) + 1

    new = pd.DataFrame(data={
        'id': [fid for fid, _ in counts],
        'name': [fname for _, fname in counts],
        'count': list(counts.values())
    })

    return new.sort_values(by="count", ascending=False, ignore_index=True)

In [None]:
# Occurrence count of every genre over all movies in the metadata table.
df_genre = fcount(df_meta["genres"])
df_genre.head()

In [None]:
# Occurrence count of every language over all movies in the metadata table.
df_lang = fcount(df_meta["languages"])
df_lang.head()

In [None]:
# Occurrence count of every country over all movies in the metadata table.
df_country = fcount(df_meta["countries"])
df_country.head()

### Release date distribution

In [None]:
# Convert the release dates to datetimes; values that cannot be parsed become NaT.
# NOTE(review): if the raw column mixes several date formats, check how many
# values end up as NaT here -- confirm against df_meta.info().
df_meta['release_date'] = pd.to_datetime(df_meta['release_date'], errors='coerce')

# Number of movies per release year.
movies_per_year = df_meta.groupby(df_meta["release_date"].dt.year)['wikiID'].count()
movies_per_year.plot()
plt.show()

### Runtime distribution

In [None]:
# Largest runtime value found in the metadata.
print(df_meta['runtime'].max())

This maximum is strangely large for a movie runtime, so the histogram below is restricted to runtimes below 300.

In [None]:
n_bins = 300

# Only runtimes below 300 are plotted; larger values are left out of the figure.
short_runtimes = df_meta.loc[df_meta['runtime'] < 300, 'runtime']

fig, ax = plt.subplots()
ax.hist(short_runtimes, bins=n_bins)
ax.set_title('Distribution of runtime')

plt.show()

### Distribution of number of words in plot description

In [None]:
# Work on a copy of the raw plot summaries and add the number of
# whitespace-separated words of each plot as a new column.
df_plot_copy = df_plot.copy()
df_plot_copy['nb_words'] = df_plot_copy['plot'].map(lambda text: len(text.split()))
df_plot_copy.head(3)

In [None]:
n_bins = 1000

# Histogram of the plot lengths (in words) over all plot summaries.
fig, ax = plt.subplots()
ax.hist(df_plot_copy['nb_words'], bins=n_bins)
ax.set_title('Distribution of number of words per plot description')

plt.show()

Only keep the plots with fewer than 2000 words (which interval?)

In [None]:
threshold = 2000
# Keep only the plots strictly shorter than the threshold (in words).
is_short_enough = df_plot_copy['nb_words'].lt(threshold)
df_plot_copy = df_plot_copy[is_short_enough]

In [None]:
n_bins = 1000

# Same histogram as before, now on the plots that passed the length threshold.
fig, ax = plt.subplots()
ax.hist(df_plot_copy['nb_words'], bins=n_bins)
ax.set_title('Distribution of number of words per plot description')

plt.show()

What would be the minimum number of words needed for a meaningful topic extraction?  

* Characters per film distribution: how many characters have been labelled for each film?

In [None]:
# Number of character rows per movie (movies are identified by "freeID").
# size() counts rows, so no column selection is needed before grouping.
df_char_copy = (
    df_char
    .groupby(['freeID'])
    .size()
    .reset_index(name='counts')
)
df_char_copy.head()

In [None]:
n_bins = 130

# Histogram of the number of character rows per film.
fig, ax = plt.subplots()
ax.hist(df_char_copy['counts'], bins=n_bins)
ax.set_title('Distribution of number of characters labelised per film')

plt.show()

In [None]:
n_bins = 200
# Outliers are drawn as red squares.
red_square = {'markerfacecolor': 'r', 'marker': 's'}

# Horizontal box plot of the number of character rows per film.
fig, ax = plt.subplots()
ax.boxplot(df_char_copy['counts'], vert=False, flierprops=red_square)
ax.set_title('Number of characters labelised per film')
plt.show()

quartiles = df_char_copy['counts'].quantile([0.25, 0.5, 0.75])
print("1rst quartile, median and 3rd quartile values: ")
print(quartiles)


"Hemingway & Gellhorn" has more than 115 characters labelised! 

In [None]:
# Films with more than 100 character rows (at most the first 10 are shown).
df_char_copy.loc[df_char_copy['counts'] > 100].head(10)

Preprocessing of metadata dataset : remove films without country, language, genre, runtime, release_date. 
Also remove films with less than nb_min_actors (=1) labeled actors.  

In [None]:
nb_min_actors = 1
# Remove NaNs: keep only the rows where BOTH the character name and the actor
# name are known. (The previous filter used isna() | isna(), which kept exactly
# the rows that were supposed to be removed.)
df_char_noNaN = df_char.loc[df_char['char_name'].notna() & df_char['actor_name'].notna()]
# Number of fully labelled character rows per movie.
df_clean_char_noNan = df_char_noNaN.groupby(['freeID']).size().reset_index(name='actor count')
# Remove the films with fewer than nb_min_actors labelled actors, i.e. keep
# the films with at least nb_min_actors (>=, not >, so that a film with
# exactly nb_min_actors is kept).
df_clean_char = df_clean_char_noNan.loc[df_clean_char_noNan['actor count'] >= nb_min_actors]

print('number of movies remaining after character dataset preprocessing: ', df_clean_char.shape[0])

In [None]:
# Keep only the movies that survived the character preprocessing.
df_inner_meta = df_meta.merge(df_clean_char, how='inner', on='freeID')

# A movie is kept when its country, language and genre dicts are not the
# empty JSON object and its release date is not null.
has_country = df_inner_meta['countries'] != '{}'
has_language = df_inner_meta['languages'] != '{}'
has_genre = df_inner_meta['genres'] != '{}'
has_release_date = df_inner_meta['release_date'].notnull()

# reset_index() keeps the old row labels in a new "index" column.
df_clean = df_inner_meta.loc[
    has_country & has_language & has_genre & has_release_date
].reset_index()

print('number of movies remaining after movie metadata preprocessing: ', df_clean.shape[0])

In [None]:
# Keep only the movies that also have a plot summary.
# The guard makes the cell safe to re-run: merging a second time would
# otherwise produce duplicated "plot_x" / "plot_y" columns.
if 'plot' not in df_clean.columns:
    df_clean = df_clean.merge(df_plot, how='inner', on='wikiID')

# The label previously repeated the message of the metadata step above.
print('number of movies remaining after keeping only movies with a plot summary: ', df_clean.shape[0])

In [None]:
# Non-null counts per column, used to decide which attributes to keep.
df_clean.info()

By observing the non-null counts, on the runtime and revenue columns, we decided to eliminate these attributes. 

In [None]:
# Drop the sparsely filled revenue/runtime columns and the leftover "index"
# column created by reset_index(). Reassigning with errors='ignore' (instead of
# inplace=True) keeps the cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
df_clean = df_clean.drop(columns=['revenue', 'runtime', 'index'], errors='ignore')

Right formatting for next data analysis steps:

In [None]:
# Replace each JSON dict string by the list of its values (the readable
# names); the keys (ids) are discarded.
for column in ('countries', 'languages', 'genres'):
    df_clean[column] = df_clean[column].apply(
        lambda encoded: list(json.loads(encoded).values())
    )

Preprocessed dataset:

In [None]:
# Final check of dtypes and non-null counts of the preprocessed dataset.
df_clean.info()

Every column has the same number of non-null values, so there are no null values left in the dataset.

## CoreNLP

In [None]:
import gzip
import os

path = "data/corenlp_plot_summaries/"
# os.listdir returns the entries in arbitrary order, so files[0] is simply
# "some file" of the directory, not a specific one.
files = os.listdir(path)
print(files[0])

# Read and decode one gzip-compressed CoreNLP file. The context manager
# closes the file handle, which was previously left open.
with gzip.open(path+files[0], 'rb') as f:
    test = f.read().decode()

In [None]:
from bs4 import BeautifulSoup

# Parse the decoded CoreNLP output as XML.
Bs_data = BeautifulSoup(test, "xml")

# Print the text of the first NER tag found inside the sentence with id "20".
sentence = Bs_data.find("sentence", attrs={'id': '20'})
ner_tag = sentence.find('NER')
print(ner_tag.text)

## Choice of attributes based on data exploratory analysis

Scraped attributes:
- Director
- Color
- Runtime, to complete the dataset?
 
Attribute to remove from the final dataset:
- Revenue

Filtered?
- Title
- Existing plot + plot length
- Genre

Not filtered?
- Runtime: replace aberrant values with a missing value (NaN), but do not filter
- Release date: replace aberrant values with a missing value (NaN), but do not filter
- Revenue
- Language
- Country
- Actors
- Characters
- Director
- Color

Kept for similarity?
- Plot
- Title
- Runtime: a feature film / short film category that replaces the raw runtime
- Release date: year
- Country (small weight)
- Language (small weight)
- Actors
- Characters
- Genres
- Director
- Color (small weight)

Not kept for similarity?
- Revenue

## Methods to study similarities accross chosen attributes

### 1. Mathematical similarity definition

#### 1.1 Cosine similarity
$$
cosine \: similarity(A,B)=S_c(A,B)=cos(\theta)=\frac{A \cdot B}{\|A\| \|B\|}=\frac{\sum_{i=1}^{n}A_iB_i}{\sqrt{\sum_{i=1}^{n} A_i^2}\sqrt{\sum_{i=1}^{n} B_i^2}}
$$
Where $A,B \in \mathbb{R}^n$ and $S_c(A,B) \in [-1,1]$: -1 means that the two vectors point in exactly opposite directions, 1 means that they point in exactly the same direction, and 0 means that they are orthogonal, which indicates decorrelation.

#### 1.2 Centered Cosine similarity
$$
centered \: cosine \: similarity(A,B)=\frac{(A-\overline{A}).(B-\overline{B})}{\|A-\overline{A}\| \|B-\overline{B}\|}
$$
Where $A$ and $B$ have been centered beforehand by subtracting their mean.

#### 1.3 Associated distance with cosine similarity
- **Angular distance**

if $A_i,B_i \in \mathbb{R}$ 
$$
angular \: distance=D_\theta=\frac{arccos(S_c(A,B))}{\pi}=\frac{\theta}{\pi}
$$
$$
angular \: similarity=S_\theta=1-D_\theta=1-\frac{\theta}{\pi}
$$
if $A_i,B_i \in \mathbb{R}$ and $A_i,B_i\geq 0$
$$
angular \: distance=D_\theta=\frac{2.arccos(S_c(A,B))}{\pi}=\frac{2\theta}{\pi}
$$
$$
angular \: similarity=S_\theta=1-D_\theta=1-\frac{2\theta}{\pi}
$$
The angular distance is a formal distance metric; however, the arccos computation makes it more computationally expensive and slower.

- **Cosine distance**

$$
cosine \: distance=D_c=1-S_c(A,B)
$$
The cosine distance is not a proper distance metric (it does not respect the triangle inequality or Schwarz inequality), but it is less computationally expensive.


- **L2-normalized Euclidean distance**

From the L2 norm defined as follows: $ \|x\|_2=\sqrt{\sum x_i^2}=\sqrt{x.x} $ and the Euclidean distance defined as follows: 
$ d(A,B)=\|A-B\|_2=\sqrt{\sum _{i=1}^{n} (A_i-B_i)^2 } $, we get the L2-normalized Euclidean distance:
$$
L2-normalized \: Euclidean \: distance=\sqrt{\sum _{i=1}^{n} (A_i'-B_i')^2} \quad \textrm{where} \quad A'=\frac{A}{\|A\|_2} \quad \textrm{and} \quad B'=\frac{B}{\|B\|_2}
$$

The cosine similarity and its associated distances reflect a relative rather than an absolute comparison of vectors. For example, the vectors $A$ and $\alpha A$ with $\alpha > 0$ are maximally similar. This similarity is therefore appropriate for data where frequency is more important than absolute value. It can be very useful for text comparison, where we could compare the frequency of terms in documents.  

#### 1.4 Soft cosine similarity
$$
soft \: cosine \: similarity(A,B)=\frac{\sum_{i,j}^{n}s_{ij}A_iB_j}{\sqrt{\sum_{i,j}^{n} s_{ij}A_iA_j}\sqrt{\sum_{i,j}^{n} s_{ij}B_iB_j}}
$$
where $s_{ij}$=similarity($feature_i$,$feature_j$). For example if $s_{ii}$=1 and $s_{ij}$=0 $\forall i\neq j$ then there is no similarity between features, then the soft cosine similarity is equal to the cosine similarity. In the case where features are words, the matrix $S$ has to define the similarity between words.

### 2. Algorithm to study similarity between movies attributes

#### tf-idf : Term frequency-inverse document frequency
**Term frequency and Inverse document frequency**

$$
tf(t,d)=\frac{f_{t,d}}{ \sum_{t' \in d} f_{t',d} } \quad \textrm{and} \quad idf(t,D)=log(\frac{N}{|\{d\in D:t\in d\}|})
$$
where $f_{t,d}$ = raw frequency = number of times a term $t$ occurs in document $d$, and $ \sum_{t' \in d} f_{t',d}$ = total number of terms in $d$, obtained by summing the occurrences of every term $t'$
<br>
where $N$=$|D|$= number of documents in corpus $D$, $|d\in D:t\in d|$= number of documents where t appears (where $tf(t,d)\neq 0$)

To prevent bias towards longer documents, the term frequency can be computed as follow:
$$
tf(t,d)=0.5+0.5\frac{f_{t,d}}{max(f_{t',d}:t'\in d)}
$$
To avoid dividing by zero, the idf denominator can be adjusted as follows:
$$
idf(t,D)=log(\frac{N}{1+|d\in D:t\in d|})
$$
**Term frequency-inverse document frequency**

From the term frequency and the inverse document frequency we can compute the tf-idf:
$$
\text{tf-idf}(t,d,D)=tf(t,d) \cdot idf(t,D)
$$

The tf-idf reflects how important a word is to a document in a corpus. It increases proportionally with the number of times a word appears in a document, but it is offset by the number of documents in the corpus that contain the word. It is high when the term frequency in document $d$ is high **and** the document frequency of the term in the whole corpus is low. This compensates for the fact that some words (common terms) appear more frequently in general, and tends to filter out those common words.

**Implementation**

tfidf can be implemented using the `GenSim` opensource library. After obtaining tf-idf associated for each word in each text, we can compute the cosine similarity between those texts. Since we compare to what extent different movies contain the exact same words, we need to add a lemmatizer filtering to take care of the same words being in the singular or plural form.
xxx

**Movie Attribute similarity that can be studied**
- Movie genre
- Movie Language
- Actor
- Characters 
- Movie Country??? xxx
- Director??? xxx 

Here we can use this method for these attributes because we want to find which of those attributes contain the exact same words and study the similarity based on this analysis. 

#### GloVe algorithm
**Definition**
The `GloVe` algorithm is an open-source Stanford algorithm that captures similarity between words by learning vector representations for words and mapping them into a meaningful space.

**Implementation** 
We can apply soft cosine similarity computation to study similarity between texts. The matrix S from the soft cosine definition can be calculated with the library `GenSim` using the `GloVe` algorithm. We would obtain a dataset containing a vector for all the words in our corpus and we could compute the matrix S associating a similarity matrix to all of these words. Then with the soft cosine similarity we could analyse the similarity between the documents in our corpus. 

**Movie Attribute similarity that can be studied**
- Title

#### Preprocessing method on a document corpus before topic extraction

Here is a preprocessing approach that can be implemented before using a topic extraction analysis
1. Removing the stop words from the documents, i.e. the most common words occurring in texts, which carry no additional meaning. It can be done with `Java` using `MySQL`.
2. Removing the numbers apart from years, the non-ASCII characters, and the most commonly occurring names (ex: James, Robert, John)
3. Handling the plural and singular forms of the same word by lemmatizing.
4. We can filter the words using tf-idf. We can compute the tf-idf for each word of each plot and keep the words with highest tf-idf score. The threshold for a word to be kept has to be defined.

#### LDA: Latent Dirichlet Allocation

<div>
<img src="images/LDA_example.png" width="800"/>
</div>

**Definition**

LDA is an algorithm that can be used for topic extraction in texts. It is an unsupervised machine-learning model that takes documents as input and finds topics as output. A topic is represented as a weighted list of words. The model also says in what percentage each document talks about each topic. 

**Implementation**

LDA can be implemented using the `GenSim` library. When using LDA to analyse topics in a corpus, it needs some preprocessing steps before applying the algorithm to make it more efficient. We detailed one preprocessing approach above.


After this processing we can apply LDA on our database by tuning some parameters:
<br>
$K$: the number of topics we look for
<br>
$\alpha$: K-dimension vector of positive reals that represent the prior weights of topic K in a document which affects the document-topic distribution. 
<br>
$\eta$: V-dimension vector of positive reals that represents the prior weights of each words in topics which affects the topic-word distribution

If we chose a symmetric LDA, the weights $\alpha$ would be the same for all topics and the weights $\eta$ would be the same for all words in a topic. The smaller the $\alpha$, the fewer topics per document; the smaller the $\eta$, the fewer words per topic.

**Movie Attribute similarity that can be studied**
- Plot

#### Doc2Vec topic extraction
**Definition**

Doc2Vec is an unsupervised algorithm that learns fixed-length feature vectors for paragraphs/documents/texts. Then we can compare these vectors to assess the similarity between documents. Doc2vec allows to generate a semantic space which is a spatial space where distance among vectors are indicator of semantic similarity. This semantic space consisting of word and document vectors is a continuous representation of topics, unlike LDA where topics are sampled from a discrete space. It means that the dense areas having high concentration of document can be thought of having similar topics and can be best represented by nearby embedded words.

**Implementation**

It can be implemented using the `GenSim` library with the class `Doc2Vec` that extends the class `Word2Vec`. 


**Movie Attribute similarity that can be studied**
- Plot

#### Release year similarity algorithm to add xxx

### 3. Assessing the global similarity between movies taking into account all attributes

#### Finding weights between attributes, to make the global similarity depending more or less on specific attributes

xxx

#### Method to merge all attributes similarities together

xxx

### 4. Tuning the weights using sequels

How to tell if our similarity function between two movies is working well? First of all, we can't say it for sure, since it is a subjective question. However usually we can still agree on these points:

- Movies that are part of the same series (sequels) should have a high similarity
- Movies with completely different genres should have a low similarity (for example one is an "Adventure/Action" movie and the other a "Romantic Comedy")

Hence we can build two datasets, one of pairs of movies we expect to have a high similarity and the other of pairs of movies we expect to have low similarity. We can then use these datasets to assess if our similarity function is giving the "right" values.

**Movies from sequels**

Before creating pairs of movies that are similar, we should group movies by series of sequels. Unfortunately, this data is not directly accessible in our dataset, but the `name.clusters.txt` file tells us which characters are re-used across movies. It doesn't necessarily mean that the movie is a sequel (for example, with Sherlock Holmes it can be another adaptation featuring the same character), but we still expect these movies to be similar.

First, using the `character.metadata.tsv` and the `freeID_char_map` given for each character, we find the movie in which the character is played.

In [None]:
# Look up the movie (WikiID) of every clustered character through its
# character/actor map ID. Building the lookup once replaces a full boolean
# scan of df_char for every row of df_char_names.
char_to_movie = (
    df_char
    .dropna(subset=["freeID_char_map"])
    .set_index("freeID_char_map")["WikiID"]
)

# As before, each map ID must correspond to exactly one character row;
# fail loudly instead of silently picking one or inserting NaN.
if not char_to_movie.index.is_unique:
    raise ValueError("df_char contains duplicated freeID_char_map values")

movie_ids = df_char_names["freeID_char_map"].map(char_to_movie)
if movie_ids.isna().any():
    raise ValueError("some freeID_char_map values of df_char_names have no movie in df_char")

df_char_names["WikiID"] = movie_ids

In [None]:
# One row per character name, holding the set of movie WikiIDs it appears in.
df_char_grouped = df_char_names.groupby("char_name")["WikiID"].apply(set).to_frame()

In [None]:
# One set of movie WikiIDs per character name, in the row order of df_char_grouped.
starting_list = df_char_grouped["WikiID"].tolist()
print(starting_list)

In [None]:
# Merge the sets of movies that share at least one movie, repeating full
# passes until a pass performs no merge (the number of groups stops changing).
joined_characters = []
same = False

while not same:
    for movie_ids in starting_list:
        merged = False
        # Union this set into every already-built group it overlaps with.
        for position, group in enumerate(joined_characters):
            if group & movie_ids:
                joined_characters[position] = group | movie_ids
                merged = True
        # No overlap with any existing group: it starts a group of its own.
        if not merged:
            joined_characters.append(movie_ids)

    if len(starting_list) == len(joined_characters):
        # Nothing was merged during this pass: the groups are pairwise disjoint.
        same = True
    else:
        # Groups may still overlap each other: run another pass over the result.
        starting_list = joined_characters
        joined_characters = []

print(len(joined_characters))

Find movies from freeId_char_map using the `character.metadata.tsv` dataset.

In [None]:
# NOTE: this rebinds df_char_grouped, which an earlier cell defined as a
# DataFrame of movie-ID sets, to a GroupBy object; the earlier iterrows()
# loop cannot be re-run after this cell.
df_char_grouped = df_char_names.groupby("char_name")

In [None]:
# First non-null value of each column for every character name.
df_char_grouped.first()
#df_char_names.head()
# Planned next steps (not implemented yet):
#   - iterate over characters
#   - iterate over their character IDs
#   - find the movie wikiID for each character ID

Find movies from freeId_char_map

if movie has no plot, remove it from group

if group has only one movie (because of removed before), remove it

**Movies with different genres**

## Visualization method

### t-SNE
**Definition**

**Implementation**

`scikit-learn` library

### Other??

## User Recommendation method

### k-Nearest Neighbors
**Definition**

**Implementation**

`scikit-learn` library
<br>
tensorboard associated with `TensorFlow` library

### Other???