In [1]:
# IMPORTANT : si l'execution déclenche une erreur et demande nltk.download, 
# faire executer ces 2 lignes : 
# nltk.download('punkt')
# nltk.download("stopwords")

In [77]:
import requests
import nltk
import re
import numpy as np

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from string import punctuation
from nltk.probability import FreqDist
from collections import defaultdict
from heapq import nlargest
import nltk

#---------------------------------------------
# FONCTION
# input : director name (case insensitive)
# output : cleaned article retrieved
#---------------------------------------------
def GetWikipediaSummary(director):
    # Récupération de la page exacte wikipedia (après redirection) à partir du nom de l'auteur non normalisé
    director.replace(" ","+")
    mysearchURL = "https://en.wikipedia.org/w/index.php?search="+director
    mysummaryURL = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&exintro&explaintext&titles="
    http = requests.get(mysearchURL)
    #On vire les 30 premiers caractères pour trouver le nom normalisé de l'auteur
    exactURL = mysummaryURL + http.url[30:]
    #Récupération du résumé
    fullresults= requests.get(exactURL).text
    summaryindex = fullresults.find("extract")
    summary =fullresults[summaryindex+10:-5]

    #CLEANUP
    summary = summary.lower()
    
    #On vire les requetes qui n'ont rien donné sur wikipedia
    if summary.find("may refer")!=-1:
        return ""
    if summary.find("normalized")!=-1:
        return ""
    
    #on vire les caractères spéciaux & les chiffres & les slashs
    summary.encode('ascii', 'replace')
    summary=re.sub("[0-9\\\\]+","",summary)
    
    #On vire le nom/prénom du directeur
    for director_part in director.lower().split(" "):
        summary = summary.replace(director_part,"")
    
    return summary

In [379]:
# Le plan
# 1. Récupérer pour les 250 réalisateurs les mieux notés en moyenne
# 2. Appliquer une pondération TF-IDF (Term Frequency / Inverse Document Frequency)
# 3. Lancer un algo de clustering

#Step 1
df = pd.read_csv("favorite_directors.csv", sep=";", low_memory=False, names = ["director_name"])
df.set_index("director_name",inplace=True)

In [380]:
#on vire les index dupliqués éventuellement
df = df[~df.index.duplicated(keep='first')]

In [None]:
#Step 2
for x in range(len(df)):
    if type(df.iloc[x].name)!=str:
        df.loc[df.iloc[x].name,"summary"]=""
    else:
        summary = GetWikipediaSummary(df.iloc[x].name)
        print("Retrieving ("+str(x)+ "/" + str(len(df)) +") summary for : "+df.iloc[x].name)
        df.loc[df.iloc[x].name,"summary"]=summary
#On vire les entrées sans résumé
df_backup = df.copy()

Retrieving (0/220) summary for : Werner Herzog
Retrieving (1/220) summary for : François Truffaut
Retrieving (2/220) summary for : Fritz Lang
Retrieving (3/220) summary for : Eric Rohmer
Retrieving (4/220) summary for : Akira Kurosawa
Retrieving (5/220) summary for : Alfred Hitchcock
Retrieving (6/220) summary for : Luis Buñuel
Retrieving (7/220) summary for : Howard Hawks
Retrieving (8/220) summary for : Ingmar Bergman
Retrieving (9/220) summary for : Charles Chaplin
Retrieving (10/220) summary for : Robert Bresson
Retrieving (11/220) summary for : Michael Powell & Emeric Pressburger
Retrieving (12/220) summary for : Alain Resnais
Retrieving (13/220) summary for : Joseph Losey
Retrieving (14/220) summary for : Kinji Fukasuku
Retrieving (15/220) summary for : Josef von Sternberg
Retrieving (16/220) summary for : Billy Wilder
Retrieving (17/220) summary for : Joseph L. Mankiewicz
Retrieving (18/220) summary for : Agnès Varda
Retrieving (19/220) summary for : Roberto Rossellini
Retrievin

In [335]:
df=df_backup
#on vire toutes les entrées sans résumé
df_final = df[df["summary"]!=""].copy()
#On transforme en liste
liste = list(df["summary"])

In [372]:
#On clusterise
nb_cluster = 12
#On applique la transformation TFIDF
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,stop_words="english")
X = vectorizer.fit_transform(liste)
km = KMeans(n_clusters = nb_cluster ,init="k-means++",max_iter=100,n_init=1,verbose=True)
km.fit(X)
np.unique(km.labels_,return_counts=True)

Initialization complete
Iteration  0, inertia 1582.508
Iteration  1, inertia 781.204
Iteration  2, inertia 775.801
Iteration  3, inertia 773.370
Iteration  4, inertia 772.049
Iteration  5, inertia 770.249
Iteration  6, inertia 769.187
Iteration  7, inertia 768.788
Iteration  8, inertia 768.638
Iteration  9, inertia 768.628
Converged at iteration 9: center shift 0.000000e+00 within tolerance 2.039296e-08


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 array([143,  84,  33,  94,  41, 276,  83,  60,  26,  14,  98,  40], dtype=int64))

In [373]:
#On récupère les mots les plus courants dans chaque cluster
#Step 1 : récupération du texte total de tous les clusters
text={}
for i,cluster in enumerate(km.labels_):
    oneDoc = liste[i]
    if cluster not in text.keys():
        text[cluster]=oneDoc
    else:
        text[cluster]+=oneDoc

In [374]:
#Step 2 : décompte du nb d'occurence de tous les mots et on garde les 100 plus fréquents par cluster
_stopwords = set (stopwords.words("english")+list(punctuation)+["film"])
keywords={}
counts={}
for cluster in range (nb_cluster):
    word_sent = word_tokenize(text[cluster].lower())
    word_sent=[word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    keywords[cluster]=nlargest(100,freq,key=freq.get)
    counts[cluster]=freq


In [375]:
#Step 3 : on ne garde que les mots d'un cluster qui ne sont pas présents dans les autres clusters
unique_keys={}
for cluster in range(nb_cluster):
    other_clusters=list(set(range(nb_cluster))-set([cluster]))
    keys_other_clusters=set(keywords[other_clusters[0]]).union(set(keywords[other_clusters[1]]))
    unique=set(keywords[cluster])-keys_other_clusters
    unique_keys[cluster]=nlargest(10,unique,key=counts[cluster].get)

In [376]:
unique_keys

{0: ['series',
  'horror',
  'writing',
  'well',
  'sequel',
  'company',
  'thriller',
  'british',
  'dead',
  'superhero'],
 1: ['french',
  'wave',
  'les',
  'le',
  'la',
  'critics',
  'de',
  'paris',
  'january',
  'considered'],
 2: ['effects',
  'special',
  'use',
  'life',
  'movies',
  'united',
  'animated',
  'hollywood',
  'part',
  'disney'],
 3: ['awards',
  'became',
  'critical',
  'york',
  '.n',
  'years',
  'art',
  'success',
  'hollywood',
  'nominated'],
 4: ['polish',
  'international',
  'years',
  'poland',
  'awards',
  'became',
  'prize',
  'soviet',
  'entered',
  'life'],
 5: ['hong',
  'kong',
  'chinese',
  'martial',
  'german',
  'russian',
  'romanian',
  'arts',
  'thai',
  'written'],
 6: ['awards',
  'nominated',
  'screenplay',
  'golden',
  'nominations',
  'original',
  'globe',
  'bafta',
  'oscar',
  'winning'],
 7: ['cannes',
  'international',
  'prize',
  'jury',
  "d'or",
  'palme',
  'romanian',
  'th',
  'berlin',
  'golden'],
 8: 

In [377]:
df["labels"]=km.labels_

In [378]:
#Ecriture dans un fichier
df.to_csv("temp.csv",encoding='utf-8', sep=";")