In [1]:
# IMPORTANT : si l'execution déclenche une erreur et demande nltk.download, 
# faire executer ces 2 lignes : 
# nltk.download('punkt')
# nltk.download("stopwords")

In [77]:
import requests
import nltk
import re
import numpy as np

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from string import punctuation

#---------------------------------------------
# FONCTION
# input : director name (case insensitive)
# output : cleaned article retrieved
#---------------------------------------------
def GetWikipediaSummary(director):
    # Récupération de la page exacte wikipedia (après redirection) à partir du nom de l'auteur non normalisé
    director.replace(" ","+")
    mysearchURL = "https://en.wikipedia.org/w/index.php?search="+director
    mysummaryURL = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&exintro&explaintext&titles="
    http = requests.get(mysearchURL)
    #On vire les 30 premiers caractères pour trouver le nom normalisé de l'auteur
    exactURL = mysummaryURL + http.url[30:]
    #Récupération du résumé
    fullresults= requests.get(exactURL).text
    summaryindex = fullresults.find("extract")
    summary =fullresults[summaryindex+10:-5]

    #CLEANUP
    summary = summary.lower()
    
    #On vire les requetes qui n'ont rien donné sur wikipedia
    if summary.find("may refer")!=-1:
        return ""
    if summary.find("normalized")!=-1:
        return ""
    
    #on vire les caractères spéciaux & les chiffres & les slashs
    summary.encode('ascii', 'replace')
    summary=re.sub("[0-9\\\\]+","",summary)
    
    #On vire le nom/prénom du directeur
    for director_part in director.lower().split(" "):
        summary = summary.replace(director_part,"")
    
    return summary

In [28]:
# Le plan
# 1. Récupérer pour les 250 réalisateurs les mieux notés en moyenne
# 2. Appliquer une pondération TF-IDF (Term Frequency / Inverse Document Frequency)
# 3. Lancer un algo de clustering

#Step 1
df = pd.read_csv("top250_acclaimed_directors.csv", sep=";", low_memory=False, names = ["id", "director_name"])
df.set_index("director_name",inplace=True)

In [47]:
#Step 2
for x in range(len(df)):
    summary = GetWikipediaSummary(df.iloc[x].name)
    print("Retrieving ("+str(x)+ "/" + str(len(df)) +") summary for : "+df.iloc[x].name)
    df.loc[df.iloc[x].name,"summary"]=summary
#On vire les entrées sans résumé
df_backup = df.copy()

Retrieving (0/250) summary for : Todd Kessler
Retrieving (1/250) summary for : John Krokidas
Retrieving (2/250) summary for : Josh Boone
Retrieving (3/250) summary for : David Foenkinos
Retrieving (4/250) summary for : Michael Grandage
Retrieving (5/250) summary for : Lake Bell
Retrieving (6/250) summary for : Bruce Timm
Retrieving (7/250) summary for : Charlie Kaufman
Retrieving (8/250) summary for : Houda Benyamina
Retrieving (9/250) summary for : Chris McKay
Retrieving (10/250) summary for : Jonathan Dayton
Retrieving (11/250) summary for : Louis D'Esposito
Retrieving (12/250) summary for : Marco Berger
Retrieving (13/250) summary for : Jan Komasa
Retrieving (14/250) summary for : John Kahrs
Retrieving (15/250) summary for : Frank Pavich
Retrieving (16/250) summary for : Orlando von Einsiedel
Retrieving (17/250) summary for : Garth Davis
Retrieving (18/250) summary for : Rajkumar Hirani
Retrieving (19/250) summary for : Theodore Melfi
Retrieving (20/250) summary for : Adam Curtis
Re

Retrieving (168/250) summary for : Dean DeBlois
Retrieving (169/250) summary for : Barry Cook
Retrieving (170/250) summary for : Peter Jackson
Retrieving (171/250) summary for : Barry Jenkins
Retrieving (172/250) summary for : Kurt Kuenne
Retrieving (173/250) summary for : Jean-Pierre Jeunet
Retrieving (174/250) summary for : Philippe Garrel
Retrieving (175/250) summary for : Wes Anderson
Retrieving (176/250) summary for : Jonás Cuarón
Retrieving (177/250) summary for : Imtiaz Ali
Retrieving (178/250) summary for : Martin McDonagh
Retrieving (179/250) summary for : Chris Evans
Retrieving (180/250) summary for : Haifaa Al-Mansour
Retrieving (181/250) summary for : Shinichiro Watanabe
Retrieving (182/250) summary for : Babak Anvari
Retrieving (183/250) summary for : Dominic Polcino
Retrieving (184/250) summary for : Sebastian Schipper
Retrieving (185/250) summary for : Jorge Furtado
Retrieving (186/250) summary for : Floria Sigismondi
Retrieving (187/250) summary for : Taika Waititi
Retr

In [92]:
df = df_backup.copy()
#on vire toutes les entrées sans résumé
df_final = df[df["summary"]!=""].copy()
#On transforme en liste
liste = list(df["summary"])

In [93]:
#On applique la transformation TFIDF
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words="english")
X = vectorizer.fit_transform(liste)
X

<250x1280 sparse matrix of type '<class 'numpy.float64'>'
	with 7376 stored elements in Compressed Sparse Row format>

In [144]:
#On clusterise
nb_cluster = 3
km = KMeans(n_clusters = nb_cluster ,init="k-means++",max_iter=100,n_init=1,verbose=True)
km.fit(X)
np.unique(km.labels_,return_counts=True)

Initialization complete
Iteration  0, inertia 416.548
Iteration  1, inertia 205.389
Iteration  2, inertia 204.669
Iteration  3, inertia 204.378
Iteration  4, inertia 204.284
Iteration  5, inertia 204.228
Iteration  6, inertia 204.210
Converged at iteration 6: center shift 0.000000e+00 within tolerance 6.547372e-08


(array([0, 1, 2]), array([ 23, 135,  92], dtype=int64))

In [131]:
#On récupère les mots les plus courants dans chaque cluster
#Step 1 : récupération du texte total de tous les clusters
text={}
for i,cluster in enumerate(km.labels_):
    oneDoc = liste[i]
    if cluster not in text.keys():
        text[cluster]=oneDoc
    else:
        text[cluster]+=oneDoc

In [123]:
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest
import nltk

In [145]:
#Step 2 : décompte du nb d'occurence de tous les mots et on garde les 100 plus fréquents par cluster
_stopwords = set (stopwords.words("english")+list(punctuation)+["film"])
keywords={}
counts={}
for cluster in range (nb_cluster):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[cluster]=nlargest(100,freq,key=freq.get)
    counts[cluster]=freq


In [152]:
#Step 3 : on ne garde que les mots d'un cluster qui ne sont pas présents dans les autres clusters
unique_keys={}
for cluster in range(nb_cluster):
    other_clusters=list(set(range(nb_cluster))-set([cluster]))
    keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    unique=set(keywords[cluster])-keys_other_clusters
    unique_keys[cluster]=nlargest(30,unique,key=counts[cluster].get)

In [153]:
unique_keys

{0: ['screenplay',
  'english',
  'original',
  'writing',
  'golden',
  'picture',
  'bafta',
  'history',
  'starring',
  'emmy',
  'greatest',
  'globe',
  'nominations',
  'winning',
  'november',
  'include',
  'bbc',
  'italian',
  'co-wrote',
  'sequel',
  'crime',
  'includes',
  'john',
  'channel',
  'acclaim',
  'black',
  'star',
  'blue',
  'adapted',
  'july'],
 1: ['animation',
  'animated',
  'animator',
  'disney',
  'artist',
  'studios',
  'worked',
  'studio',
  'walt',
  'japanese',
  'anime',
  'california',
  'pixar',
  'marvel',
  'served',
  'storyboard',
  'mickey',
  'direct',
  'movie',
  'list',
  'ranked',
  'ghost',
  'since',
  'ghibli',
  'arts',
  'secret',
  'spongebob',
  'day',
  'voice',
  'highest-grossing'],
 2: ['documentary',
  'french',
  'international',
  'cannes',
  'early',
  'march',
  'video',
  'national',
  'began',
  'editor',
  'december',
  'york',
  'sundance',
  'independent',
  'acclaimed',
  'war',
  'based',
  'starred',
  'pre

In [150]:
df["labels"]=km.labels_

In [151]:
#Ecriture dans un fichier
df.to_csv("temp.csv",encoding='utf-8', sep=";")