In [514]:
import json

def jsonToDic(file_path):
    '''
    Entrée : chemin vers un fichier (str)
    Sortie : dictionnaire du contenu du fichier (key = Title, value = Plot)
    '''
    with open(file_path) as json_file:
        data = json.load(json_file)
        dic = {}
        for movie in data:
            title = movie['Title']
            plot = movie['Plot']
            dic[title] = plot
        return dic

file_path = ("/Users/chloe/Documents/toto/films.json")
dico_films = jsonToDic(file_path)

# afficher le dictionnaire des films
print(dico_films)

# afficher le résumé d'un film spécifique
print(dico_films["The Shawshank Redemption"])


{'The Shawshank Redemption': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.', 'The Godfather': 'The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.', 'The Godfather: Part II': 'The early life and career of Vito Corleone in 1920s New York is portrayed while his son, Michael, expands and tightens his grip on the family crime syndicate.', 'The Dark Knight': 'When the menace known as the Joker emerges from his mysterious past, he wreaks havoc and chaos on the people of Gotham, the Dark Knight must accept one of the greatest psychological and physical tests of his ability to fight injustice.', '12 Angry Men': 'A jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider the evidence.', "Schindler's List": 'In German-occupied Poland during World War II, Oskar Schindler gradually becomes concerned for his Jewish workfor

In [515]:
#Etape 2 – Chaîne de traitements TAL

import nltk
import string
from nltk.stem import WordNetLemmatizer # package pour la lemmatisation

def nlp(sent) :
    '''
        Entrée : chaîne de caractères (str)
        Sortie : liste de tokens 
    '''
    # nettoyage ponctuations
    sent_without_punct = sent.translate(str.maketrans('', '', string.punctuation))
    
    # segmentation / tokenization
    tokens = nltk.word_tokenize(sent_without_punct, language='english')
    
    # liste des mots vides
    nltk_stop_words = nltk.corpus.stopwords.words('english')
    
    # suppression des mots vides
    tokens_without_stop_words = [t for t in tokens if t not in nltk_stop_words]
    
    # lemmatisation
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens_without_stop_words]
    
    return lemmatized_tokens


In [516]:
# test fonction nlp
 # à compléter
sentences = "The Shawshank Redemption is a 1994 American drama film written and directed by Frank Darabont, based on the 1982 Stephen King novella Rita Hayworth and Shawshank Redemption. It tells the story of banker Andy Dufresne (Tim Robbins), who is sentenced to life in Shawshank State Penitentiary for the murders of his wife and her lover, despite his claims of innocence. Over the following two decades, he befriends a fellow prisoner, contraband smuggler Ellis Boyd 'Red' Redding (Morgan Freeman), and becomes instrumental in a money-laundering operation led by the prison warden Samuel Norton (Bob Gunton). The film received widespread critical acclaim, with many reviewers citing it as one of the greatest films ever made. It was nominated for seven Academy Awards, including Best Picture, and won the awards for Best Adapted Screenplay and Best Supporting Actor (Freeman). It was also selected for preservation in the National Film Registry by the Library of Congress as being 'culturally, historically, or aesthetically significant."
#for mot in sentences.split():
    #print(nlp(mot))
nlp(sentences)

['The',
 'Shawshank',
 'Redemption',
 '1994',
 'American',
 'drama',
 'film',
 'written',
 'directed',
 'Frank',
 'Darabont',
 'based',
 '1982',
 'Stephen',
 'King',
 'novella',
 'Rita',
 'Hayworth',
 'Shawshank',
 'Redemption',
 'It',
 'tell',
 'story',
 'banker',
 'Andy',
 'Dufresne',
 'Tim',
 'Robbins',
 'sentenced',
 'life',
 'Shawshank',
 'State',
 'Penitentiary',
 'murder',
 'wife',
 'lover',
 'despite',
 'claim',
 'innocence',
 'Over',
 'following',
 'two',
 'decade',
 'befriends',
 'fellow',
 'prisoner',
 'contraband',
 'smuggler',
 'Ellis',
 'Boyd',
 'Red',
 'Redding',
 'Morgan',
 'Freeman',
 'becomes',
 'instrumental',
 'moneylaundering',
 'operation',
 'led',
 'prison',
 'warden',
 'Samuel',
 'Norton',
 'Bob',
 'Gunton',
 'The',
 'film',
 'received',
 'widespread',
 'critical',
 'acclaim',
 'many',
 'reviewer',
 'citing',
 'one',
 'greatest',
 'film',
 'ever',
 'made',
 'It',
 'nominated',
 'seven',
 'Academy',
 'Awards',
 'including',
 'Best',
 'Picture',
 'award',
 'Best',

In [517]:
#Etape 2 – Application de la chaîne de traitements TAL au dictionnaire 

def nlp_dico(dictionnaire) :
    '''
        Entrée : dictionnaire
        Sortie : dictionnaire (la chaîne de traitements a été appliquée pour les 'values')
    '''
    processed_dico_films = {}
    for key, value in dictionnaire.items():
        doc = nlp(value)
        processed_dico_films[key] = doc
    return processed_dico_films

file_path = "/Users/chloe/Documents/toto/films.json"
dico_films = jsonToDic(file_path)
processed_dico_films = nlp_dico(dico_films)

# afficher le dictionnaire des films
#print(dico_films)

# afficher le résumé d'un film spécifique
print(processed_dico_films)

{'The Shawshank Redemption': ['Two', 'imprisoned', 'men', 'bond', 'number', 'year', 'finding', 'solace', 'eventual', 'redemption', 'act', 'common', 'decency'], 'The Godfather': ['The', 'aging', 'patriarch', 'organized', 'crime', 'dynasty', 'transfer', 'control', 'clandestine', 'empire', 'reluctant', 'son'], 'The Godfather: Part II': ['The', 'early', 'life', 'career', 'Vito', 'Corleone', '1920s', 'New', 'York', 'portrayed', 'son', 'Michael', 'expands', 'tightens', 'grip', 'family', 'crime', 'syndicate'], 'The Dark Knight': ['When', 'menace', 'known', 'Joker', 'emerges', 'mysterious', 'past', 'wreaks', 'havoc', 'chaos', 'people', 'Gotham', 'Dark', 'Knight', 'must', 'accept', 'one', 'greatest', 'psychological', 'physical', 'test', 'ability', 'fight', 'injustice'], '12 Angry Men': ['A', 'jury', 'holdout', 'attempt', 'prevent', 'miscarriage', 'justice', 'forcing', 'colleague', 'reconsider', 'evidence'], "Schindler's List": ['In', 'Germanoccupied', 'Poland', 'World', 'War', 'II', 'Oskar', 'S

In [518]:
def jaccard_similarity_titre(titre1, titre2):
    '''
        Entrée : deux chaînes de caractères correspondant à des titres
        Sortie : score de similarité de Jaccard entre ces deux listes 
    '''
    T1 = set(titre1)
    T2 = set(titre2)
    intersection = len(T1.intersection(T2))
    union = len(T1.union(T2))
    return float(len(T1.intersection(T2))/len(T1.union(T2)))
    #return float(intersection)/union

titre1 = "The Godfather"
titre2 = "The Godfather: Part II"

def jaccard_similarity_titre(titre1, titre2):
    '''
        Entrée : deux chaînes de caractères correspondant à des titres
        Sortie : score de similarité de Jaccard entre ces deux listes 
    '''
 # à compléter
    film1 = set(titre1)
    film2 = set(titre2)
    intersection = len(film1.intersection(film2))
    union = len(film1.union(film2))
    return float(len(film1.intersection(film2))/len(film1.union(film2)))
    #return float(intersection) / union

film1 = processed_dico_films[titre1]
film2 = processed_dico_films[titre2]

jaccard_similarity_titre(film1,film2)
#jaccard_similarity_titre(titre1,titre2)

0.1111111111111111

In [519]:
#test Jaccard

titre1 = "The Godfather"
titre2 = "The Godfather: Part II"


film1 = processed_dico_films[titre1]
film2 = processed_dico_films[titre2]

set1 = set(film1)
set2 = set(film2)

score = jaccard_similarity(set1, set2)

print(film1)
print(film2)
print("\nle Score de simiarité de Jaccard des films {} et {} est {}".format(titre1,titre2,jaccard_similarity(film1, film2)))

['The', 'aging', 'patriarch', 'organized', 'crime', 'dynasty', 'transfer', 'control', 'clandestine', 'empire', 'reluctant', 'son']
['The', 'early', 'life', 'career', 'Vito', 'Corleone', '1920s', 'New', 'York', 'portrayed', 'son', 'Michael', 'expands', 'tightens', 'grip', 'family', 'crime', 'syndicate']

le Score de simiarité de Jaccard des films The Godfather et The Godfather: Part II est 0.1111111111111111


In [520]:
import math
#Etape 4 – Calcul de la similarité cosinus
#tutoriel : https://github.com/mayank408/TFIDF/blob/master/TFIDF.ipynb

def dico_films_to_vocabulary(dico) :
    '''
        Entrée : dictionnaire
        Sortie : liste de tokens correspondant au vocabulaire de la 'value'
    '''
    vocabulary = []
    for k in dico :
        for t in dico[k] :
            if t not in vocabulary :
                vocabulary.append(t)
    return vocabulary


def calcul_IDF(dico):
    '''
        Entrée : dictionnaire
        Sortie : dictionnaire : (clé = token, value = score IDF)
    '''
     # à compléter
    #  conseil pour la formule idf    = math.log10(nb_docs / nb_docs_having_voc)
  
    nb_docs = len(dico)
    token_counts = {}  # Pour stocker le nombre de documents dans lesquels chaque token apparaît
    for doc in dico.values():
        # On utilise un set pour éviter de compter plusieurs fois un même token dans un même document
        for token in set(doc):
            if token not in token_counts:
                token_counts[token] = 0
            token_counts[token] += 1
    
    idf_scores = {}
    for token, count in token_counts.items():
        idf_scores[token] = math.log10(nb_docs / count)
    
    return idf_scores


vocab = list(calcul_IDF(processed_dico_films).keys())

In [521]:
def calcul_TFIDF(dico):
    '''
        Entrée : dictionnaire
        Sortie : dictionnaire : (clé = token, value = score TFIDF)
    '''
    # à compléter
    #  conseil pour la formule tfidf = tf * idf
    idf_scores = calcul_IDF(dico)
    tfidf_scores = {}
    for doc, tokens in dico.items():
        tf_scores = {}
        for token in tokens:
            if token not in tf_scores:
                tf_scores[token] = 0
            tf_scores[token] += 1
        for token, score in tf_scores.items():
            tf_scores[token] = score / len(tokens)
        for token, score in tf_scores.items():
            tf_scores[token] = score * idf_scores[token]
        tfidf_scores[doc] = tf_scores
    return tfidf_scores

dicTFIDF = calcul_TFIDF(processed_dico_films)
#tfidf_dico_films = calcul_TFIDF(processed_dico_films)
#for film in tfidf_dico_films:
    #print(film, tfidf_dico_films)

In [522]:
def cosine_similarity(list1, list2):
    import numpy as np
    dot = np.dot(list1, list2)
    norm1 = np.linalg.norm(list1)
    norm2 = np.linalg.norm(list2)
    cos = dot / (norm1 * norm2)
    return(cos)

In [523]:
#for vocab:
dico = {}
for film in dicTFIDF:
    print(film)
    dico[film] = {}
    for mot in vocab:
        if mot not in dicTFIDF[film]:
            dico[film][mot] = 0
        else:
            dico[film][mot] = dicTFIDF[film][mot]
    #print(dico[film])

The Shawshank Redemption
The Godfather
The Godfather: Part II
The Dark Knight
12 Angry Men
Schindler's List
The Lord of the Rings: The Return of the King
Pulp Fiction
Fight Club
The Lord of the Rings: The Fellowship of the Ring
Forrest Gump
Star Wars: Episode V - The Empire Strikes Back
Inception
The Lord of the Rings: The Two Towers
One Flew Over the Cuckoo's Nest
Goodfellas
The Matrix
Star Wars: Episode IV - A New Hope
Se7en
It's a Wonderful Life
The Silence of the Lambs
The Usual Suspects
Léon: The Professional
Saving Private Ryan
City Lights
Interstellar
American History X
Modern Times
Casablanca
The Green Mile
Psycho
Raiders of the Lost Ark
The Pianist
Rear Window
The Departed
Whiplash
Terminator 2: Judgment Day
Back to the Future
Gladiator
The Lion King
The Prestige
Apocalypse Now
Memento
The Great Dictator
Sunset Boulevard
Alien
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
Paths of Glory
Django Unchained
The Shining
Witness for the Prosecution
The Dark Kn

In [524]:
tfidf_dico_films = dico

film1 = list(tfidf_dico_films["A Christmas Story"].values())
film2 = list(tfidf_dico_films["The Nightmare Before Christmas"].values())

film3 = list(tfidf_dico_films["The Godfather"].values())
film4 = list(tfidf_dico_films["The Godfather: Part II"].values())

print(cosine_similarity(film1, film2))
print(cosine_similarity(film3, film4))


0.10757874998175106
0.08241160159435393


In [525]:
#Etape 5 – Enfin la recommandation (avec similarité de Jaccard)
def sort_dict_by_values(dico) :
    return sorted(dico.items(), key=lambda x:(-x[1],x[0]))

def recommend_jaccard(liked_film, processed_dico_films) :
    
    # initialisation du dictionnaire de films candidats 
    candidats = {}
     # à compléter
    films = {}
    for film in processed_dico_films:
        if film != liked_film:
            films[film] = jaccard_similarity(processed_dico_films[liked_film], processed_dico_films[film])
    return sort_dict_by_values(films)

        # on vérifie que le candidat n'est pas le film lui même

In [526]:
#Etape 5 – Enfin la recommandation (avec similarité cosinus)
def recommend_cosine(liked_film, tfidf_dico_films):
    vector_liked_film = tfidf_dico_films[liked_film]
    # initialisation du dictionnaire de films candidats 
    candidate_films_with_similarity_scores = {}
    for candidate in processed_dico_films.keys():
        # on vérifie que le candidat n'est pas le film lui même
        if candidate != liked_film:            
            
            tokens_candidate_film = processed_dico_films[candidate]
            candidate_films_with_similarity_scores[candidate] = cosine_similarity(vector_liked_film, tfidf_dico_films[candidate])
    
    return sorted(candidate_films_with_similarity_scores.items(), key=lambda x:(-x[1],x[0]))

In [527]:
# Etape 6 – Démonstration
file = '/Users/chloe/Documents/toto/films.json'
dico_films = jsonToDic(file)
processed_dico_films = {}

# Calcul des recommendations basées sur la similarité Jaccard
jaccard_similarities = []
for i in range(3):
    current_film = list(dico_films.keys())[i]
    current_film_set = set(dico_films[current_film])
    jaccard_similarities.append((current_film, 0))  # Valeur de similarité par défaut
    for film, features in dico_films.items():
        if film != current_film:
            features_set = set(features)
            similarity = len(current_film_set.intersection(features_set)) / len(current_film_set.union(features_set))
            if similarity > jaccard_similarities[i][1]:
                jaccard_similarities[i] = (film, similarity)
print("La recommandation basée sur la mesure de similarité de Jaccard :")
print(jaccard_similarities)

# Calcul des recommendations basées sur la similarité cosinus TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Création d'une matrice TF-IDF
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(dico_films.values())
cosine_similarities = []
for i in range(3):
    current_film = list(dico_films.keys())[i]
    current_index = list(dico_films.keys()).index(current_film)
    current_tf_idf_vector = tf_idf_matrix[current_index]
    cosine_similarities.append((current_film, 0))  # Valeur de similarité par défaut 
    for j in range(len(dico_films)):
        if j != current_index:
            tf_idf_vector = tf_idf_matrix[j]
            similarity = (current_tf_idf_vector * tf_idf_vector.T).A[0][0]
            if similarity > cosine_similarities[i][1]:
                cosine_similarities[i] = (list(dico_films.keys())[j], similarity)
print("La recommandation basée sur la mesure de cosine sur TF-IDF :")
print(cosine_similarities)


La recommandation basée sur la mesure de similarité de Jaccard :
[('Young Frankenstein', 0.9615384615384616), ('Modern Times', 0.875), ('Spider-Man: Homecoming', 0.7105263157894737)]
La recommandation basée sur la mesure de cosine sur TF-IDF :
[('Pulp Fiction', 0.19522920726791662), ('The Godfather: Part II', 0.13640454398569057), ('Spider-Man: Homecoming', 0.18430999957687488)]
