# Nettoyage movies_tags

In [126]:
import pandas as pd

## Datasets

In [200]:
df_tags = pd.read_csv('../data/raw/tags.csv')
df_tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [201]:
df_movies = pd.read_csv('../data/raw/movies.csv')
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [202]:
df_movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [203]:
df_genome_tags = pd.read_csv('../src/data/raw/genome-tags.csv')
df_genome_tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [204]:
df_genome_scores = pd.read_csv('../src/data/raw/genome-scores.csv')
df_genome_scores.head(100)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02500
1,1,2,0.02500
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675
...,...,...,...
95,1,96,0.04350
96,1,97,0.06250
97,1,98,0.05675
98,1,99,0.12725


## Features

In [205]:
df_tags_relevance = pd.merge(df_genome_tags, df_genome_scores, on='tagId', how='outer')
df_tags_relevance.head()

Unnamed: 0,tagId,tag,movieId,relevance
0,1,7,1,0.025
1,1,7,2,0.03975
2,1,7,3,0.0435
3,1,7,4,0.03725
4,1,7,5,0.042


In [206]:
df_tags_relevance.isnull().sum()

tagId        0
tag          0
movieId      0
relevance    0
dtype: int64

In [207]:
df_top_relevance = df_tags_relevance.groupby('movieId').apply(lambda x: x.nlargest(3, 'relevance')).reset_index(drop=True)
df_top_relevance.head()

  df_top_relevance = df_tags_relevance.groupby('movieId').apply(lambda x: x.nlargest(3, 'relevance')).reset_index(drop=True)


Unnamed: 0,tagId,tag,movieId,relevance
0,1036,toys,1,0.99925
1,244,computer animation,1,0.9985
2,786,pixar animation,1,0.996
3,29,adventure,2,0.981
4,584,jungle,2,0.967


In [208]:
df_top_relevance_grouped = df_top_relevance.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()
df_top_relevance_grouped.rename(columns={'tag': 'tags'}, inplace=True)
df_top_relevance_grouped

Unnamed: 0,movieId,tags
0,1,"toys, computer animation, pixar animation"
1,2,"adventure, jungle, children"
2,3,"good sequel, sequel, sequels"
3,4,"women, chick flick, girlie movie"
4,5,"good sequel, father daughter relationship, seq..."
...,...,...
10376,130578,"assassination, realistic action, action"
10377,130840,"romance, horror, love story"
10378,131013,"comedy, stupid as hell, prison"
10379,131168,"betrayal, criterion, obsession"


In [209]:
df_tags['tag'] = df_tags['tag'].astype(str)
df_tags_grouped = df_tags.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()
df_tags_grouped.rename(columns={'tag': 'tags'}, inplace=True)
df_tags_grouped

Unnamed: 0,movieId,tags
0,1,"Watched, computer animation, Disney animated f..."
1,2,"time travel, adapted from:book, board game, ch..."
2,3,"old people that is actually funny, sequel feve..."
3,4,"chick flick, revenge, characters, chick flick,..."
4,5,"Diane Keaton, family, sequel, Steve Martin, we..."
...,...,...
19540,131054,dinosaurs
19541,131082,"documentary, Yoshitomo Nara"
19542,131164,Vietnam War
19543,131170,alternate reality


In [210]:
df_total_tags = pd.merge(df_top_relevance_grouped, df_tags_grouped, on='movieId', how='outer')
df_total_tags['tags_x'] = df_total_tags['tags_x'].astype(str)
df_total_tags['tags_y'] = df_total_tags['tags_y'].astype(str)

df_total_tags['tags'] = df_total_tags['tags_x'] + ', ' + df_total_tags['tags_y']
df_total_tags.drop(['tags_x', 'tags_y'], axis=1, inplace=True)

df_total_tags

Unnamed: 0,movieId,tags
0,1,"toys, computer animation, pixar animation, Wat..."
1,2,"adventure, jungle, children, time travel, adap..."
2,3,"good sequel, sequel, sequels, old people that ..."
3,4,"women, chick flick, girlie movie, chick flick,..."
4,5,"good sequel, father daughter relationship, seq..."
...,...,...
20094,131082,"nan, documentary, Yoshitomo Nara"
20095,131164,"nan, Vietnam War"
20096,131168,"betrayal, criterion, obsession, nan"
20097,131170,"alternate reality, sci-fi, parallel universe, ..."


In [211]:
df_total_tags = pd.merge(df_movies, df_total_tags, on='movieId', how='outer')
df_total_tags

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"toys, computer animation, pixar animation, Wat..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"adventure, jungle, children, time travel, adap..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"good sequel, sequel, sequels, old people that ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"women, chick flick, girlie movie, chick flick,..."
4,5,Father of the Bride Part II (1995),Comedy,"good sequel, father daughter relationship, seq..."
...,...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),Comedy,
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,
27275,131258,The Pirates (2014),Adventure,"nan, bandits, Korea, mutiny, pirates, whale"
27276,131260,Rentun Ruusu (2001),(no genres listed),


In [212]:
df_total_tags['genres'] = df_total_tags['genres'].str.replace('|', ', ')
df_total_tags

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy","toys, computer animation, pixar animation, Wat..."
1,2,Jumanji (1995),"Adventure, Children, Fantasy","adventure, jungle, children, time travel, adap..."
2,3,Grumpier Old Men (1995),"Comedy, Romance","good sequel, sequel, sequels, old people that ..."
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance","women, chick flick, girlie movie, chick flick,..."
4,5,Father of the Bride Part II (1995),Comedy,"good sequel, father daughter relationship, seq..."
...,...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),Comedy,
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,
27275,131258,The Pirates (2014),Adventure,"nan, bandits, Korea, mutiny, pirates, whale"
27276,131260,Rentun Ruusu (2001),(no genres listed),


In [213]:
df_total_tags['genres'] = df_total_tags['genres'].astype(str)
df_total_tags['tags'] = df_total_tags['tags'].astype(str)

df_total_tags['all_tags'] = df_total_tags['genres'] + ', ' + df_total_tags['tags']
df_total_tags.drop(['genres', 'tags'], axis=1, inplace=True)
df_total_tags

Unnamed: 0,movieId,title,all_tags
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantas..."
1,2,Jumanji (1995),"Adventure, Children, Fantasy, adventure, jungl..."
2,3,Grumpier Old Men (1995),"Comedy, Romance, good sequel, sequel, sequels,..."
3,4,Waiting to Exhale (1995),"Comedy, Drama, Romance, women, chick flick, gi..."
4,5,Father of the Bride Part II (1995),"Comedy, good sequel, father daughter relations..."
...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),"Comedy, nan"
27274,131256,"Feuer, Eis & Dosenbier (2002)","Comedy, nan"
27275,131258,The Pirates (2014),"Adventure, nan, bandits, Korea, mutiny, pirate..."
27276,131260,Rentun Ruusu (2001),"(no genres listed), nan"


In [214]:
df_total_tags['all_tags'] = df_total_tags['all_tags'].apply(lambda x: x.split(','))
df_total_tags.head(1).values

array([[1, 'Toy Story (1995)',
        list(['Adventure', ' Animation', ' Children', ' Comedy', ' Fantasy', ' toys', ' computer animation', ' pixar animation', ' Watched', ' computer animation', ' Disney animated feature', ' Pixar animation', ' TÃ\x83Â©a Leoni does not star in this movie', ' Pixar', ' animation', ' family', ' Tom Hanks', ' Pixar', ' witty', ' Pixar', ' adventure', ' animated', ' animation', ' clever', ' comedy', ' computer animation', ' family', ' fantasy', ' Tom Hanks', ' bright', ' DARING RESCUES', ' fanciful', ' HEROIC MISSION', ' humorous', ' light', ' rousing', ' TOYS COME TO LIFE', ' UNLIKELY FRIENDSHIPS', ' warm', ' witty', ' animation', ' humorous', ' Pixar', ' time travel', ' Pixar', ' Pixar animation', ' animation', ' kids movie', ' Pixar', ' Pixar', ' Pixar', ' witty', ' Disney', ' Tim Allen', ' time travel', ' action figure', ' action figures', ' Buzz Lightyear', ' CG animation', ' toy', ' toys', ' Woody', ' animation', ' Pixar', ' animation', ' Disney', ' 

In [215]:
import re

def clean_tags(tags_list):
    """
    Nettoie les tags en ne conservant que ceux composés uniquement de lettres et de tirets.

    Parameters
    ----------
    tags_list : list
        Liste de tags.

    Returns
    -------
    list
        Liste nettoyée de tags.
    """
    clean_tags_list = []
    for tag in tags_list:
        if not isinstance(tag, str):
            continue
        cleaned_tag = tag.strip().lower()
        if cleaned_tag == 'nan':
            continue
        if re.match(r'^[a-zA-Z\-]+$', cleaned_tag):
            clean_tags_list.append(cleaned_tag)
    clean_tags_list = list(set(clean_tags_list))
    return clean_tags_list


df_total_tags['all_tags'] = df_total_tags['all_tags'].apply(clean_tags)

print(df_total_tags.head(1).values)


[[1 'Toy Story (1995)'
  list(['clever', 'rated-g', 'children', 'innovative', 'animated', 'adventure', 'watched', 'bright', 'want', 'woody', 'comedy', 'clv', 'engaging', 'classic', 'cartoon', 'soothing', 'usa', 'buy', 'fantasy', 'fun', 'toys', 'disney', 'cgi', 'friendship', 'bd-video', 'humorous', 'rousing', 'pixar', 'warm', 'animation', 'funny', 'avi', 'story', 'witty', 'toy', 'light', 'dolls', 'fanciful', 'cute', 'dvd-video', 'family'])]]


In [216]:
df_total_tags['all_tags'] = df_total_tags['all_tags'].apply(lambda x: ', '.join(x))

In [217]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

matrice_tfidf = tfidf.fit_transform(df_total_tags['all_tags'])

print(matrice_tfidf.shape)

(27278, 7877)


In [218]:
from sklearn.metrics.pairwise import cosine_similarity

sim_cosinus = cosine_similarity(matrice_tfidf, matrice_tfidf)

In [219]:
indices = pd.Series(range(0,len(df_total_tags)), index=df_total_tags.title)

In [220]:
from tabulate import tabulate
def recommandations(titre, cos_sim, num_recommendations = 3):
    idx = indices[titre]

    scores_similarite = list(enumerate(cos_sim[idx]))

    scores_similarite = sorted(scores_similarite, key=lambda x: x[1], reverse=True)

    top_similair = scores_similarite[1:num_recommendations+1]

    res = [(indices.index[idx], score) for idx, score in top_similair]

    return tabulate(res, headers=["Titre", "Score de similarité"], tablefmt="pretty")
   

In [196]:
print("Recommandations pour 'Toy Story (1995)' similarité euclidienne: \n",recommandations('Toy Story (1995)', sim_cosinus))

Recommandations pour 'Toy Story (1995)' similarité euclidienne: 
 +-----------------------+---------------------+
|         Titre         | Score de similarité |
+-----------------------+---------------------+
|  Toy Story 2 (1999)   | 0.5558230184318719  |
| Monsters, Inc. (2001) | 0.40623140536357955 |
|      Cars (2006)      | 0.3172251771679121  |
+-----------------------+---------------------+
