In [1]:
# IMPORTANT : si l'execution déclenche une erreur et demande nltk.download, 
# faire executer ces 2 lignes : 
# nltk.download('punkt')
# nltk.download("stopwords")

In [416]:
import requests
import nltk
import re
import numpy as np

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from string import punctuation
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest
import nltk

#---------------------------------------------
# FONCTION
# input : director name (case insensitive)
# output : cleaned article retrieved
#---------------------------------------------
def GetWikipediaSummary(director):
    # Récupération de la page exacte wikipedia (après redirection) à partir du nom de l'auteur non normalisé
    director.replace(" ","+")
    mysearchURL = "https://en.wikipedia.org/w/index.php?search="+director
    mysummaryURL = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&exintro&explaintext&titles="
    http = requests.get(mysearchURL)
    #On vire les 30 premiers caractères pour trouver le nom normalisé de l'auteur
    exactURL = mysummaryURL + http.url[30:]
    #Récupération du résumé
    fullresults= requests.get(exactURL).text
    summaryindex = fullresults.find("extract")
    summary =fullresults[summaryindex+10:-5]

    #CLEANUP
    summary = summary.lower()
    
    #On vire les requetes qui n'ont rien donné sur wikipedia
    if summary.find("may refer")!=-1:
        return ""
    if summary.find("normalized")!=-1:
        return ""
    
    #on vire les caractères spéciaux & les chiffres & les slashs
    summary.encode('ascii', 'replace')
    summary=re.sub("[0-9\\\\]+","",summary)

    
    #On vire le nom/prénom du directeur
    for director_part in director.lower().split(" "):
        summary = summary.replace(director_part,"")
    
    return summary

In [417]:
# Le plan
# 1. Récupérer pour les 250 réalisateurs les mieux notés en moyenne
# 2. Appliquer une pondération TF-IDF (Term Frequency / Inverse Document Frequency)
# 3. Lancer un algo de clustering

#Step 1
df = pd.read_csv("favorite_directors.csv", sep=";", low_memory=False, names = ["director_name"])
df.set_index("director_name",inplace=True)

In [418]:
#on vire les index dupliqués éventuellement
df = df[~df.index.duplicated(keep='first')]

In [419]:
#Step 2
for x in range(len(df)):
    if type(df.iloc[x].name)!=str:
        df.loc[df.iloc[x].name,"summary"]=""
    else:
        summary = GetWikipediaSummary(df.iloc[x].name)
        print("Retrieving ("+str(x)+ "/" + str(len(df)) +") summary for : "+df.iloc[x].name)
        df.loc[df.iloc[x].name,"summary"]=summary
#On vire les entrées sans résumé
df_backup = df.copy()

Retrieving (0/220) summary for : Werner Herzog
Retrieving (1/220) summary for : François Truffaut
Retrieving (2/220) summary for : Fritz Lang
Retrieving (3/220) summary for : Eric Rohmer
Retrieving (4/220) summary for : Akira Kurosawa
Retrieving (5/220) summary for : Alfred Hitchcock
Retrieving (6/220) summary for : Luis Buñuel
Retrieving (7/220) summary for : Howard Hawks
Retrieving (8/220) summary for : Ingmar Bergman
Retrieving (9/220) summary for : Charles Chaplin
Retrieving (10/220) summary for : Robert Bresson
Retrieving (11/220) summary for : Michael Powell & Emeric Pressburger
Retrieving (12/220) summary for : Alain Resnais
Retrieving (13/220) summary for : Joseph Losey
Retrieving (14/220) summary for : Kinji Fukasuku
Retrieving (15/220) summary for : Josef von Sternberg
Retrieving (16/220) summary for : Billy Wilder
Retrieving (17/220) summary for : Joseph L. Mankiewicz
Retrieving (18/220) summary for : Agnès Varda
Retrieving (19/220) summary for : Roberto Rossellini
Retrievin

Retrieving (167/220) summary for : Terrence Malick
Retrieving (168/220) summary for : Richard Donner
Retrieving (169/220) summary for : Tim Burton
Retrieving (170/220) summary for : Richard Thorpe
Retrieving (171/220) summary for : Kenji Misumi
Retrieving (172/220) summary for : Ridley Scott
Retrieving (173/220) summary for : W.S. Van Dyke
Retrieving (174/220) summary for : Robert Aldrich
Retrieving (175/220) summary for : Wolfgang Petersen
Retrieving (176/220) summary for : Woody Allen
Retrieving (177/220) summary for : Leos Carax
Retrieving (178/220) summary for : Jean Renoir
Retrieving (179/220) summary for : Ann Hui
Retrieving (180/220) summary for : John Frankenheimer
Retrieving (181/220) summary for : Céline Sciamma
Retrieving (182/220) summary for : Anthony Asquith
Retrieving (183/220) summary for : Charles Vidor
Retrieving (184/220) summary for : Jerzy Kawalerowicz
Retrieving (185/220) summary for : Chia-Liang Liu
Retrieving (186/220) summary for : Georges Franju
Retrieving (18

In [451]:
#on vire les rows avec un summary vide & on convertit en liste
df = df[df["summary"]!=""]
liste = list(df["summary"])

In [460]:
#seconde passe de nettoyage : on vire les mois qui polluent : les mois
polluting_words =["january","february","march","april","may","june","july","august","september","october","november","december"]
for x in range(len(liste)):
    for polluting_word in polluting_words:
        liste[x]=liste[x].replace(polluting_word,"")

In [474]:
#On clusterise
nb_cluster = 9
#On applique la transformation TFIDF
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words="english")
X = vectorizer.fit_transform(liste)
km = KMeans(n_clusters = nb_cluster ,init="k-means++",max_iter=100,n_init=1,verbose=True)
km.fit(X)
np.unique(km.labels_,return_counts=True)

Initialization complete
Iteration  0, inertia 327.721
Iteration  1, inertia 179.382
Iteration  2, inertia 179.078
Converged at iteration 2: center shift 0.000000e+00 within tolerance 5.850324e-08


(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([ 5, 30, 45, 42, 17, 14, 19, 27,  8], dtype=int64))

In [475]:
#On récupère les mots les plus courants dans chaque cluster
#Step 1 : récupération du texte total de tous les clusters
text={}
for i,cluster in enumerate(km.labels_):
    oneDoc = liste[i]
    if cluster not in text.keys():
        text[cluster]=oneDoc
    else:
        text[cluster]+=oneDoc
        
#Step 2 : décompte du nb d'occurence de tous les mots et on garde les 100 plus fréquents par cluster
_stopwords = set (stopwords.words("english")+list(punctuation))
keywords={}
counts={}
for cluster in range (nb_cluster):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[cluster]=nlargest(100,freq,key=freq.get)
    counts[cluster]=freq
    
#Step 3 : on ne garde que les mots d'un cluster qui ne sont pas présents dans les autres clusters
unique_keys={}
for cluster in range(nb_cluster):
    other_clusters=list(set(range(nb_cluster))-set([cluster]))
    keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    unique=set(keywords[cluster])-keys_other_clusters
    unique_keys[cluster]=nlargest(10,unique,key=counts[cluster].get)

unique_keys

{0: ['danish',
  'co-founded',
  'hard',
  'die',
  'hunt',
  'investigator',
  'last',
  'dogme',
  'ordet',
  'space'],
 1: ['drama',
  'picture',
  'nominations',
  'comedy',
  'science',
  'fiction',
  'oscar',
  'dutch',
  'style',
  'actors'],
 2: ['united',
  'released',
  'several',
  'considered',
  'often',
  'direct',
  'produced',
  'war',
  'began',
  'life'],
 3: ['french',
  'la',
  'france',
  'international',
  'le',
  'les',
  'du',
  'documentary',
  'godard',
  'critic'],
 4: ['cannes',
  'german',
  'prize',
  'palme',
  "d'or",
  'international',
  'language',
  'jury',
  'french',
  'kieublowski'],
 5: ['japanese',
  'animator',
  'worked',
  'member',
  'fantasy',
  'polish',
  'dark',
  'python',
  'japan',
  'artist'],
 6: ['time',
  'sound',
  'influential',
  'buufuel',
  'foreign',
  'life',
  'often',
  'international',
  'soviet',
  'sight'],
 7: ['screenplay',
  'original',
  'nomination',
  'whose',
  'writing',
  'dozen',
  'acting',
  'awarded',
  'ti

In [476]:
df["labels"]=km.labels_
#Ecriture dans un fichier
df.to_csv("temp.csv",encoding='utf-8', sep=";")