In [1]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords  # Import des stopwords
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer  # Import du lemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load Dataset
PATH = 'src'
data = pd.read_csv(PATH + '/plot_summaries.txt', sep='\t', names=['ID', 'Plot'])
documents_list= data['Plot'].tolist()

In [2]:
data.head()

Unnamed: 0,ID,Plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [3]:
# Télécharger les ressources nécessaires
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialiser le lemmatizer et les mots vides
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # Utilisez 'french' si nécessaire

# Fonction de prétraitement qui inclut nettoyage, tokenisation, et lemmatisation
def custom_tokenizer(text):
    # Mise en minuscules
    text = text.lower()
    # Tokenisation
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    # Supprimer mots vides (stopwords) et lemmatisation
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

# Vectorisation TF-IDF avec le tokenizer personnalisé
tfidf = TfidfVectorizer(lowercase=False,  # Déjà mis en minuscules dans custom_tokenizer
                        tokenizer=custom_tokenizer,
                        token_pattern=None)  # None pour éviter l'avertissement

# Appliquer le fit_transform aux documents
train_data = tfidf.fit_transform(documents_list)

In [4]:
# Define the number of topics or components
num_components=6

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(train_data)

# Get Components 
lda_components=model.components_

In [5]:
# Print the topics with their terms
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['film', 'one', 'life', 'find', 'man', 'get', 'woman']
Topic 1:  ['kmdb', 'goku', 'eng', 'kr', 'md_basic', 'pudgy', 'title2009']
Topic 2:  ['cheech', 'fantaghirò', 'mcquade', 'anakin', 'shivan', 'ahab', 'fogg']
Topic 3:  ['love', 'family', 'father', 'son', 'story', 'life', 'get']
Topic 4:  ['uuno', 'vamsi', 'laure', 'sanju', 'quantrill', 'giovanna', 'aby']
Topic 5:  ['tora', 'bfi', 'ftvdb', 'sift', 'org', 'uk', '01']
