In [1]:
#Importo le librerie che mi serviranno
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
df = pd.read_csv("movies.csv")

In [3]:
#Stampo le prime dieci righe del dataset
df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
# Esploro il dataset
df.info()
df.describe()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


movieId    0
title      0
genres     0
dtype: int64

In [5]:
# Visualizzo le colonne del dataset
df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [6]:
# Per esempio, se abbiamo una colonna 'genres', possiamo usare il TF-IDF per trasformare i dati di testo in vettori
from sklearn.feature_extraction.text import TfidfVectorizer

# Eseguo TF-IDF sui generi
tfidf = TfidfVectorizer(stop_words='english')
df['genres'] = df['genres'].fillna('')  # Riempi i valori nulli con stringhe vuote
tfidf_matrix = tfidf.fit_transform(df['genres'])

# Visualizzo la forma della matrice TF-IDF
tfidf_matrix.shape


(62423, 23)

In [7]:
# Utilizzo NearestNeighbors per calcolare le similarità
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=11)
knn.fit(tfidf_matrix)

# Creo una serie indicizzata dai titoli dei film
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

# Funzione di raccomandazione con gestione degli errori
def get_recommendations(title, model=knn, data=tfidf_matrix, indices=indices):
    # Verifico se il titolo è presente nel dataset
    if title not in indices:
        # Ricerca parziale nel dataset
        partial_matches = df[df['title'].str.contains(title, case=False, na=False)]['title']
        if partial_matches.empty:
            return f"Il film '{title}' non è presente nel dataset."
        else:
            # Prendo il primo risultato della ricerca parziale
            title = partial_matches.iloc[0]
    
    # Ottengo l'indice del film dato il suo titolo
    idx = indices[title]
    
    # Trovo i 10 vicini più vicini (11 inclusi se stesso)
    distances, indices = model.kneighbors(data[idx], n_neighbors=11)
    
    # Ottengo gli indici dei film consigliati
    movie_indices = indices.flatten()[1:]  # Ignora il primo che è il film stesso
    
    # Restituisco i titoli dei film consigliati
    return df['title'].iloc[movie_indices]

In [8]:
# Controllo alcuni titoli di film nel dataset
df['title'].head(20)

0                          Toy Story (1995)
1                            Jumanji (1995)
2                   Grumpier Old Men (1995)
3                  Waiting to Exhale (1995)
4        Father of the Bride Part II (1995)
5                               Heat (1995)
6                            Sabrina (1995)
7                       Tom and Huck (1995)
8                       Sudden Death (1995)
9                          GoldenEye (1995)
10           American President, The (1995)
11       Dracula: Dead and Loving It (1995)
12                             Balto (1995)
13                             Nixon (1995)
14                  Cutthroat Island (1995)
15                            Casino (1995)
16             Sense and Sensibility (1995)
17                        Four Rooms (1995)
18    Ace Ventura: When Nature Calls (1995)
19                       Money Train (1995)
Name: title, dtype: object

In [9]:
# Testo il sistema di raccomandazione con vari titoli di film
print(get_recommendations('The Godfather'))
print(get_recommendations('Toy Story'))
print(get_recommendations('Un film non presente'))

21326                      Spider (2007)
30230                     Company (2002)
40965            The Black Angels (1970)
17777    Bodyguards and Assassins (2009)
30226           Shaolin Intruders (1983)
30211                       Shiva (1989)
30171                         Run (2013)
30107              Gabbar Is Back (2015)
30079                 Mercenaries (2011)
9804                  TNT Jackson (1974)
Name: title, dtype: object
30472           Scooby-Doo! Mask of the Blue Falcon (2012)
58039                          Here Comes the Grump (2018)
57586            Dragons: Dawn Of The Dragon Racers (2014)
17431    Asterix and the Vikings (Astérix et les Viking...
52826                    Tangled: Before Ever After (2017)
22353                                Boxtrolls, The (2014)
60800                                     UglyDolls (2019)
55898                               Penguin Highway (2018)
48614         Puss in Book: Trapped in an Epic Tale (2017)
43614                           

In [10]:
# Ricerca parziale per "Godfather"
godfather_titles = df[df['title'].str.contains("Godfather", case=False, na=False)]['title']
print(godfather_titles)

# Ricerca parziale per "Toy Story"
toy_story_titles = df[df['title'].str.contains("Toy Story", case=False, na=False)]['title']
print(toy_story_titles)

840                                  Godfather, The (1972)
1190                        Godfather: Part II, The (1974)
1934                       Godfather: Part III, The (1990)
7894                               Tokyo Godfathers (2003)
8461                                   3 Godfathers (1948)
19261                           Last Godfather, The (2010)
20640                               Disco Godfather (1979)
24459                            The New Godfathers (1979)
25505                           The Black Godfather (1974)
25914                              Three Godfathers (1936)
33426                                     GodFather (1991)
33483                      Battle of the Godfathers (1973)
39338                 Onimasa: A Japanese Godfather (1982)
46275     The Medici: Godfathers of the Renaissance (2004)
48382    Herschell Gordon Lewis: The Godfather of Gore ...
53794                          The Godfather Legacy (2012)
53803           The Godfather Family: A Look Inside (199

In [11]:
# Ricerca parziale per "The Social Network"
toy_story_titles = df[df['title'].str.contains("Social Network", case=False, na=False)]['title']
print(toy_story_titles)

15238    Social Network, The (2010)
Name: title, dtype: object


In [12]:
print(get_recommendations("Casino"))

38102                      Two Men in Town (1973)
38076                          The Colonel (2006)
42724                              Payroll (1961)
5884                                 Sonny (2002)
1424                         Donnie Brasco (1997)
27847                            The Gypsy (1975)
27821    The Case of the Stuttering Bishop (1937)
22674                           The Stoker (2010)
12897                       Blackbird, The (1926)
11077                          Assassin(s) (1997)
Name: title, dtype: object


In [13]:
print(get_recommendations("Social Network, The"))

22641         Cambridge Spies (2003)
49695                   Shoes (1916)
20523      Dallas Buyers Club (2013)
31081    Fifteen and Pregnant (1998)
49685        Secret Superstar (2017)
49683           You Are Alone (2005)
6314          This Boy's Life (1993)
49708                 Bambule (1969)
16068             What Is It? (2005)
10833            Edvard Munch (1974)
Name: title, dtype: object


In [14]:
# Ricerca parziale per "Forza Maggiore" in originale "Turist"
toy_story_titles = df[df['title'].str.contains("Turist", case=False, na=False)]['title']
print(toy_story_titles)

11237                  Turistas (2006)
22311    Force Majeure (Turist) (2014)
54975           The Miniaturist (2017)
Name: title, dtype: object


In [15]:
print(get_recommendations("Force Majeure"))

22641         Cambridge Spies (2003)
49695                   Shoes (1916)
20523      Dallas Buyers Club (2013)
31081    Fifteen and Pregnant (1998)
49685        Secret Superstar (2017)
49683           You Are Alone (2005)
6314          This Boy's Life (1993)
49708                 Bambule (1969)
16068             What Is It? (2005)
10833            Edvard Munch (1974)
Name: title, dtype: object


In [16]:
# Ricerca parziale per "Apocalypse Now"
toy_story_titles = df[df['title'].str.contains("Apocalypse now", case=False, na=False)]['title']
print(toy_story_titles)

1177    Apocalypse Now (1979)
Name: title, dtype: object


In [17]:
print(get_recommendations("Apocalypse Now"))

22695                 Beyond the Border (2011)
12421    Warlords, The (Tau ming chong) (2007)
20234           Cockleshell Heroes, The (1955)
22685                        Going Back (2001)
12169                      Seas Beneath (1931)
6334                  Sink the Bismark! (1960)
48309                   The Last Bullet (1995)
11964                 Steel Helmet, The (1951)
4852                 Behind Enemy Lines (2001)
11879                          Beaufort (2007)
Name: title, dtype: object


In [19]:
print(get_recommendations("Children of Men"))

17456                  The Hunger Games (2012)
12146                            Jumper (2008)
32632                    Humanity's End (2009)
53251    Jurassic World: Fallen Kingdom (2018)
7735            Day After Tomorrow, The (2004)
11162                   Children of Men (2006)
23249                    Jurassic World (2015)
9266                Sound of Thunder, A (2005)
24155                Terminator Genisys (2015)
10103                           Stealth (2005)
Name: title, dtype: object
