In [1]:
# IMPORTANT : si l'execution déclenche une erreur et demande nltk.download, 
# faire executer ces 2 lignes : 
# nltk.download('punkt')
# nltk.download("stopwords")

In [7]:
import requests
import nltk
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from string import punctuation
#Initiatlisation du troncateur de mot
st=LancasterStemmer()
#fabrication de ma liste de stopwords
custo_stopwords = set(stopwords.words('english')+list(punctuation)+["film","director","born"])

#---------------------------------------------
# FONCTION
# input : director name (case insensitive)
# output : a dictionary of (word;frequency) based on wikipedia summary
#---------------------------------------------
def GetWikipediaTokenizedSummary(director):
    # Récupération de la page exacte wikipedia (après redirection) à partir du nom de l'auteur non normalisé
    director.replace(" ","+")
    mysearchURL = "https://en.wikipedia.org/w/index.php?search="+director
    mysummaryURL = "https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&exintro&explaintext&titles="
    http = requests.get(mysearchURL)
    #On vire les 30 premiers caractères pour trouver le nom normalisé de l'auteur
    exactURL = mysummaryURL + http.url[30:]
    #Récupération du résumé
    fullresults= requests.get(exactURL).text
    summaryindex = fullresults.find("extract")
    summary =fullresults[summaryindex+10:-5]

    #on vire les caractères spéciaux
    summary.encode('ascii', 'replace')

    #Tokenisation
    sents = sent_tokenize(summary)
    words=[word_tokenize(sent) for sent in sents]

    #Nettoyage de la liste des mots selon certains critères, et décompte
    final_words = dict()
    for sent in words:
        for word in sent:
            word = word.lower()

            # On ignore le nom de l'auteur
            found=False
            for director_part in director.split(" "):
                if word.find(director_part.lower())!=-1:
                    found=True
            if found:
                continue

            #Finalement je ne tronque pas les mots, ça gêne l'interpretation
            #word = st.stem(word)

            # On ignore les mots avec des digits ou antislash dans la liste des stopwords
            if word in custo_stopwords or re.search("[0-9\\\\]+", word):
                continue
            # On ignore les mots de 2 lettres
            if len(word)<3 :
                continue

            #Si on arrive là alors on peut compter le mot    
            if word in final_words:
                final_words[word] += 1
            else:
                final_words[word] = 1
    return final_words
#Si on veut afficher le résultat en triant par occurences décroissantes
#for key in sorted(final_words, key=final_words.get, reverse=True):
#    print(key+":"+str(final_words[key]))

In [16]:
# Le plan
# 1. Récupérer pour les 250 réalisateurs les mieux notés en moyenne
# 2. Créer une matrice géante, index = réalisateur, colonnes = tous les mots
# 3. Appliquer une pondération IDF (pondérer chaque mot par l'inverse de la fréquence)
# 4. Lancer un algo de clustering

#Step 1
df = pd.read_csv("top250_acclaimed_directors.csv", sep=";", low_memory=False, names = ["id", "director_name"])
df.set_index("director_name",inplace=True)
df.drop("id",axis=1,inplace=True)

In [44]:
x=0
#Step 2
for x in range(len(df)):
    if x>10:
        break
    dict_tokens = GetWikipediaTokenizedSummary(df.iloc[x].name)
    print("Retrieving ("+str(x)+ "/250) summary for : "+df.iloc[x].name+","+str(len(dict_tokens))+" distinct words")
    for word in dict_tokens:
        print(word+":"+str(dict_tokens[word]))
    x+=1


Retrieving (0/250) summary for : Todd Kessler,17 distinct words
award:1
winning:1
american:1
television:1
writer:1
producer:2
among:1
credits:1
showrunner:1
co-creator:1
nickelodeon:1
preschool:1
series:1
blue:1
clues:1
feature:1
keith:1
Retrieving (1/250) summary for : John Krokidas,12 distinct words
october:1
american:1
screenwriter:1
producer:1
best:1
known:1
directorial:1
debut:1
biographical:1
drama:1
kill:1
darlings:1
Retrieving (2/250) summary for : Josh Boone,7 distinct words
may:1
refer:1
basketball:2
american:3
footballer:1
soccer:1
player:1
Retrieving (3/250) summary for : David Foenkinos,22 distinct words
october:1
french:1
author:1
screenwriter:1
studied:1
literature:1
music:1
paris:1
novel:2
bestseller:1
france:1
based:1
book:1
released:1
december:1
audrey:1
tautou:1
main:1
character:1
awarded:1
prix:1
renaudot:1
Retrieving (4/250) summary for : Michael Grandage,11 distinct words
cbe:1
may:1
british:1
theatre:1
producer:1
currently:1
artistic:2
company:1
donmar:1
warehous

In [45]:
df.fillna(0, inplace=True)
df.tail()

Unnamed: 0_level_0,award,winning,american,television,writer,producer,among,credits,showrunner,co-creator,...,connection,aaron,burr,conspiracy,never,tried,national,political,career,recovered
director_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0
