In [24]:
import pandas as pd

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import json

In [25]:
# Download the NLTK data packages this notebook relies on (tokenizer models,
# stop-word lists, WordNet). The value of the last call is what the cell displays.
# NOTE(review): recent NLTK releases reportedly look up 'punkt_tab' instead of
# 'punkt' for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\meria\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Load the three input tables (actors, movie details, directors) from the
# current working directory, in that order.
df_actors, df_movies, df_real = (
    pd.read_csv(csv_name)
    for csv_name in (
        "actors_top500_final.csv",
        "movies_details_top500_final.csv",
        "realisators_top500_final.csv",
    )
)

In [27]:
def preprocess_text(text):
    """Normalise a free-text description into a space-separated string of lemmas.

    Processing steps, in order: remove every character that is not an ASCII
    letter, digit or whitespace; lowercase; tokenize; keep only alphanumeric
    tokens longer than two characters that are not English stop words;
    lemmatize each remaining token with WordNet.

    Args:
        text: The raw description. A non-string value (for example a missing
            cell) makes ``re.sub`` raise, which is handled by the error path.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` if any step
        raised (the error is printed, not re-raised).
    """
    try:
        # Raw string: '\s' inside a plain literal is an invalid escape sequence.
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = text.lower()
        tokens = word_tokenize(text)
        # Load the stop-word list once per call and use a set, instead of
        # re-reading the corpus and scanning a list for every single token.
        stop_words = set(stopwords.words('english'))
        tokens = [
            word for word in tokens
            if len(word) > 2 and word.isalnum() and word not in stop_words
        ]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return ' '.join(tokens)
    except Exception as e:
        # Best-effort: report the problem and fall back to an empty document.
        print(f"Une erreur s'est produite : {e}")
        return ''


def extract_tags(descriptions, min_df=5):
    """Compute TF-IDF features over cleaned descriptions.

    Every description is first passed through ``preprocess_text``; the
    vectorizer then considers 1- to 3-grams and discards terms that occur in
    fewer than ``min_df`` documents.

    Args:
        descriptions: Iterable of raw description texts.
        min_df: Minimum document frequency forwarded to ``TfidfVectorizer``.

    Returns:
        A ``(tags, tfidf_matrix)`` pair: the vectorizer's feature names and
        the matrix returned by ``fit_transform`` for the cleaned descriptions.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=min_df)
    cleaned = list(map(preprocess_text, descriptions))
    matrix = vectorizer.fit_transform(cleaned)
    return vectorizer.get_feature_names_out(), matrix

# movie

In [28]:
# Columns removed from the movie table to build the slimmer working copy.
df_movies_drops = [
    "adult",
    "backdrop_path",
    "original_language",
    "original_title",
    "video",
    "overview",
]
df_movies_bis = df_movies.drop(df_movies_drops, axis="columns")

In [29]:
# Keep only the columns exported for the app: id, title and poster path.
col_app = ["id", "title", "poster_path"]
df_movies_app = df_movies_bis[col_app]

print(df_movies_app)

# Written without the positional index; the movie id is a regular column here.
df_movies_app.to_csv('../data/clean_datas/movies_app.csv', index=False)

         id                               title  \
0       278            The Shawshank Redemption   
1       238                       The Godfather   
2       240               The Godfather Part II   
3       424                    Schindler's List   
4       389                        12 Angry Men   
..      ...                                 ...   
495     631       Sunrise: A Song of Two Humans   
496      89  Indiana Jones and the Last Crusade   
497  579245                        The Specials   
498  546554                          Knives Out   
499    5924                            Papillon   

                          poster_path  
0    /q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg  
1    /3bhkrj58Vtu7enYsRolD1fZdja1.jpg  
2    /hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg  
3    /sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg  
4    /ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg  
..                                ...  
495  /oj8ZW8jKXBSs8F1e5iWsTUeXSJW.jpg  
496  /sizg1AU8f8JDZX4QIgE4pjUMBvx.jpg  
497  /zJziqrnSOzKiV0TrNVZ3A

# tags

In [30]:
# Run the TF-IDF tag extraction over the movie overviews (default min_df).
tags, tfidf_matrix = extract_tags(descriptions=df_movies["overview"])

# Report how many distinct tags were kept.
print(f"il y a {len(tags)} tags")

il y a 537 tags


In [31]:
# Map each movie id to the tags that have a non-zero TF-IDF weight in its
# overview. Row `row` of tfidf_matrix corresponds to row `row` of df_movies,
# because the matrix was fitted on df_movies["overview"] in that order.
# The key is taken from the "id" column: the "title" column holds free text,
# so converting it with int() raises ValueError.
dict_tag = {
    int(movie_id): [tags[j] for j in tfidf_matrix[row].indices]
    for row, movie_id in enumerate(df_movies["id"])
}

In [40]:
# Persist dict_tag as JSON (non-string keys are written as JSON strings).
with open('../data/clean_datas/movies_tags.json', 'w') as tags_file:
    tags_file.write(json.dumps(dict_tag))

# reals_actors

In [33]:
# Transpose the actor table so that rows are the former column labels (cast to
# int and sorted) and columns are the values of the "Actor" column.
actor_matrix = df_actors.set_index("Actor").T
actor_matrix.columns.name = None  # drop the leftover "Actor" axis label
actor_matrix.index = actor_matrix.index.astype(int)
actor_matrix = actor_matrix.sort_index()

In [34]:
# Plain-text preview of actor_matrix (pandas truncates large frames when printing).
print(actor_matrix)

         Tim Robbins  Morgan Freeman  Bob Gunton  William Sadler  \
11                 0               0           0               0   
13                 0               0           0               0   
14                 0               0           0               0   
15                 0               0           0               0   
16                 0               0           0               0   
...              ...             ...         ...             ...   
1000492            0               0           0               0   
1002185            0               0           0               0   
1010581            0               0           0               0   
1026227            0               0           0               0   
1139087            0               0           0               0   

         Clancy Brown  Gil Bellows  James Whitmore  Mark Rolston  \
11                  0            0               0             0   
13                  0            0             

In [35]:
# Transpose the director table the same way: rows are the former column labels
# (cast to int and sorted), columns are the values of the "Realisator" column.
real_matrix = df_real.set_index("Realisator").T
real_matrix.columns.name = None  # drop the leftover "Realisator" axis label
real_matrix.index = real_matrix.index.astype(int)
# Suffix every column so director columns stay distinct from actor columns
# that carry the same person name.
real_matrix = real_matrix.sort_index().add_suffix(' as real')

In [36]:
# Plain-text preview of real_matrix (pandas truncates large frames when printing).
print(real_matrix)

         Frank Darabont as real  Francis Ford Coppola as real  \
11                            0                             0   
13                            0                             0   
14                            0                             0   
15                            0                             0   
16                            0                             0   
...                         ...                           ...   
1000492                       0                             0   
1002185                       0                             0   
1010581                       0                             0   
1026227                       0                             0   
1139087                       0                             0   

         Steven Spielberg as real  Sidney Lumet as real  \
11                              0                     0   
13                              0                     0   
14                              0         

In [37]:
# Left-join the director columns onto the actor columns using the shared index.
df_real_actors = actor_matrix.join(real_matrix, how="left")

# NOTE(review): index=False leaves the integer index out of the CSV, so the
# written rows carry no identifier — confirm the consumer does not need it.
df_real_actors.to_csv('../data/clean_datas/real_actors.csv', index=False)

In [38]:
# Plain-text preview of the joined actor/director frame.
print(df_real_actors)

         Tim Robbins  Morgan Freeman  Bob Gunton  William Sadler  \
11                 0               0           0               0   
13                 0               0           0               0   
14                 0               0           0               0   
15                 0               0           0               0   
16                 0               0           0               0   
...              ...             ...         ...             ...   
1000492            0               0           0               0   
1002185            0               0           0               0   
1010581            0               0           0               0   
1026227            0               0           0               0   
1139087            0               0           0               0   

         Clancy Brown  Gil Bellows  James Whitmore  Mark Rolston  \
11                  0            0               0             0   
13                  0            0             

# merged

In [39]:
# Dense TF-IDF table: one row per movie (indexed by its id), one column per tag.
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), index=df_movies["id"], columns=tags)
df_tfidf.index = df_tfidf.index.astype(int)
df_tfidf = df_tfidf.sort_index()

# Binarise the weights (1 where the tag is present), append the actor and
# director columns by index, and replace values missing after the joins with 0.
df_merged = (
    df_tfidf.ne(0)
    .astype(int)
    .join(actor_matrix)
    .join(real_matrix)
    .fillna(0)
)