# Decision tree project: diabetes prediction

## Notebook set-up

In [69]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.impute import KNNImputer
from sklearn.tree import DecisionTreeClassifier
import matplotlib
import sqlite3
import json


%matplotlib inline

In [70]:



# URL of the CSV file
url1 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv'
url2 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv'

# Read the CSV file from the URL and save it as a DataFrame
df_movies = pd.read_csv(url1, delimiter=',')


df_credits = pd.read_csv(url2, delimiter=',')

# Save the DataFrame to a CSV file in your project directory
df_movies.to_csv('../data/movies', index=False)
df_credits.to_csv('../data/credits', index=False)

print("CSV files has been downloaded and saved as a DataFrame.")

CSV files has been downloaded and saved as a DataFrame.


In [71]:

print(df_movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [72]:
print(df_credits.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB
None


In [73]:
conn = sqlite3.connect("db_movies")  # Use "movies.db" to save persistently
cursor = conn.cursor()

In [74]:
df_credits.to_sql("credits", conn, index=False, if_exists="replace")
df_movies.to_sql("movies", conn, index=False, if_exists="replace")

4803

In [75]:
query = """
SELECT movies.*, credits.cast, credits.crew
FROM movies
JOIN credits ON movies.title = credits.title
"""
df_combined = pd.read_sql(query, conn)

In [76]:
print(df_combined.head())  # View first few rows

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [77]:
print(df_combined.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [78]:
conn.close()

In [79]:
df_clean = df_combined[["id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [80]:
df_clean.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [81]:
encoded_data_df = df_clean.copy()

In [82]:
df_clean["genres"]

0       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
1       [{"id": 12, "name": "Adventure"}, {"id": 14, "...
2       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
3       [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4       [{"id": 28, "name": "Action"}, {"id": 12, "nam...
                              ...                        
4804    [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4805    [{"id": 35, "name": "Comedy"}, {"id": 10749, "...
4806    [{"id": 35, "name": "Comedy"}, {"id": 18, "nam...
4807                                                   []
4808                  [{"id": 99, "name": "Documentary"}]
Name: genres, Length: 4809, dtype: object

In [83]:
encoded_data_df['cast']=df_clean['cast'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)


In [84]:
print(encoded_data_df['cast'])

0        [Sam Worthington, Zoe Saldana, Sigourney Weaver]
1           [Johnny Depp, Orlando Bloom, Keira Knightley]
2            [Daniel Craig, Christoph Waltz, Léa Seydoux]
3            [Christian Bale, Michael Caine, Gary Oldman]
4          [Taylor Kitsch, Lynn Collins, Samantha Morton]
                              ...                        
4804    [Carlos Gallardo, Jaime de Hoyos, Peter Marqua...
4805         [Edward Burns, Kerry Bishé, Marsha Dietlein]
4806           [Eric Mabius, Kristin Booth, Crystal Lowe]
4807            [Daniel Henney, Eliza Coupe, Bill Paxton]
4808    [Drew Barrymore, Brian Herzlinger, Corey Feldman]
Name: cast, Length: 4809, dtype: object


In [85]:
encoded_data_df['genres']=df_clean['genres'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)


In [86]:
encoded_data_df['crew']=df_clean['crew'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)


In [87]:
encoded_data_df['keywords']=df_clean['keywords'].apply(lambda x: [item['name'] for item in json.loads(x)][:3] if pd.notna(x) else None)


In [88]:
encoded_data_df

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy]","[culture clash, future, space war]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[Stephen E. Rivkin, Rick Carter, Christopher B..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island]","[Johnny Depp, Orlando Bloom, Keira Knightley]","[Dariusz Wolski, Gore Verbinski, Jerry Bruckhe..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent]","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[Thomas Newman, Sam Mendes, Anna Pinnock]"
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama]","[dc comics, crime fighter, terrorist]","[Christian Bale, Michael Caine, Gary Oldman]","[Hans Zimmer, Charles Roven, Christopher Nolan]"
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion]","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[Andrew Stanton, Andrew Stanton, John Lasseter]"
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms]","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua...","[Robert Rodriguez, Robert Rodriguez, Robert Ro..."
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein]","[Edward Burns, Edward Burns, Edward Burns]"
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","[Comedy, Drama, Romance]","[date, love at first sight, narration]","[Eric Mabius, Kristin Booth, Crystal Lowe]","[Carla Hetland, Harvey Kahn, Adam Sliwinski]"
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"[Daniel Henney, Eliza Coupe, Bill Paxton]","[Daniel Hsia, Daniel Hsia]"


In [89]:
clean_df = encoded_data_df 

In [90]:
clean_df.isna().sum()   


id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [91]:

clean_df[clean_df['overview'].isna() | (clean_df['overview'].str.strip() == '')]

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
2658,370980,Chiamatemi Francesco - Il Papa della gente,,[Drama],"[pope, biography]","[Rodrigo de la Serna, Sergio Hernández, Àlex B...","[Daniele Luchetti, Daniele Luchetti, Kevin Kaska]"
4145,459488,"To Be Frank, Sinatra at 100",,[Documentary],"[music, actors, legendary perfomer]",[Tony Oppedisano],[Simon Napier-Bell]
4407,43630,The Helix... Loaded,,"[Action, Comedy, Science Fiction]",[],[],[]
4437,292539,Food Chains,,[Documentary],[],[],[Sanjay Rawal]


In [92]:

mask = clean_df['keywords'].map(lambda x: (len(x)==0))   

clean_df[mask]

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
71,1735,The Mummy: Tomb of the Dragon Emperor,"Archaeologist Rick O'Connell travels to China,...","[Adventure, Action, Fantasy]",[],"[Brendan Fraser, Jet Li, John Hannah]","[Ronna Kress, Stephen Sommers, Sean Daniel]"
83,79698,The Lovers,The Lovers is an epic romance time travel adve...,"[Action, Adventure, Science Fiction]",[],"[Josh Hartnett, Simone Kessell, Tamsin Egerton]","[Terry Ryan, Uma Da Cunha, Richard Conway]"
323,37786,Sex and the City 2,"Carrie, Charlotte, and Miranda are all married...","[Comedy, Drama, Romance]",[],"[Sarah Jessica Parker, Kristin Davis, Cynthia ...","[Sarah Jessica Parker, Patricia Field, Michael..."
381,49852,The Nutcracker: The Untold Story,"Set in 1920's Vienna, this a tale of a little ...","[Fantasy, Action, Family]",[],"[Elle Fanning, Nathan Lane, John Turturro]","[Chris Solimine, Andrei Konchalovsky, Andrei K..."
436,109418,Grown Ups 2,The all-star comedy cast from Grown Ups return...,[Comedy],[],"[Adam Sandler, Kevin James, Chris Rock]","[Rupert Gregson-Williams, Barry Bernardi, Theo..."
...,...,...,...,...,...,...,...
4796,13898,The Circle,Various women struggle to function in the oppr...,"[Drama, Foreign]",[],"[Nargess Mamizadeh, Maryiam Palvin Almani, Moj...","[Kambuzia Partovi, Jafar Panahi]"
4800,286939,Sanctuary: Quite a Conundrum,"It should have been just a normal day of sex, ...","[Thriller, Horror, Comedy]",[],"[Sasha Ramos, Erin Cline, Emily Rogers]","[Thomas L. Phillips, Thomas L. Phillips, Thoma..."
4803,67238,Cavite,"Adam, a security guard, travels from Californi...","[Foreign, Thriller]",[],[],"[Neill Dela Llana, Ian Gamazon]"
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"[Comedy, Romance]",[],"[Edward Burns, Kerry Bishé, Marsha Dietlein]","[Edward Burns, Edward Burns, Edward Burns]"


In [93]:

mask = clean_df['cast'].map(lambda x: (len(x)==0))   

clean_df[mask]

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
2603,17644,Barney's Great Adventure,"Mom and dad dump son Cody, daughter Abby, her ...",[Family],[],[],[Steve Gomer]
3674,447027,Running Forever,After being estranged since her mother's death...,[Family],[],[],[]
3997,346081,Sardaarji,A ghost hunter uses bottles to capture trouble...,[],[],[],[Rohit Jugraj]
4014,126509,2016: Obama's America,2016: Obama's America takes audiences on a gri...,[Documentary],[],[],"[Gerald R. Molen, John Sullivan, John Sullivan]"
4073,371085,Sharkskin,The Post War II story of Manhattan born Mike E...,[],[],[],[]
4123,325140,Hum To Mohabbat Karega,"Raju, a waiter, is in love with the famous TV ...",[],[],[],[]
4252,361505,Me You and Five Bucks,"A womanizing yet lovable loser, Charlie, a wai...","[Romance, Comedy, Drama]",[],[],[]
4311,114065,Down & Out With The Dolls,"The raunchy, spunky tale of the rise and fall ...","[Comedy, Music]",[],[],[]
4320,137955,Crowsnest,"In late summer of 2011, five young friends on ...",[],[],[],[]
4328,102840,Sex With Strangers,"For some married couples, sex is an obsession ...",[Documentary],[],[],[]


In [94]:

mask = clean_df['crew'].map(lambda x: (len(x)==0))   

clean_df[mask]

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
3665,19615,Flying By,A real estate developer goes to his 25th high ...,[Drama],[],"[Billy Ray Cyrus, Heather Locklear, Ahnaise Ch...",[]
3674,447027,Running Forever,After being estranged since her mother's death...,[Family],[],[],[]
3982,55831,Boynton Beach Club,A handful of men and women of a certain age pi...,"[Comedy, Drama, Romance]",[independent film],"[Brenda Vaccaro, Dyan Cannon, Joseph Bologna]",[]
4073,371085,Sharkskin,The Post War II story of Manhattan born Mike E...,[],[],[],[]
4110,48382,"The Book of Mormon Movie, Volume 1: The Journey",The story of Lehi and his wife Sariah and thei...,[],[],"[Kirby Heyborne, Michael Flynn]",[]
4123,325140,Hum To Mohabbat Karega,"Raju, a waiter, is in love with the famous TV ...",[],[],[],[]
4128,20653,Roadside Romeo,This is the story of Romeo. A dude who was liv...,"[Animation, Family, Foreign]",[],"[Saif Ali Khan, Kareena Kapoor, Javed Jaffrey]",[]
4252,361505,Me You and Five Bucks,"A womanizing yet lovable loser, Charlie, a wai...","[Romance, Comedy, Drama]",[],[],[]
4311,114065,Down & Out With The Dolls,"The raunchy, spunky tale of the rise and fall ...","[Comedy, Music]",[],[],[]
4320,137955,Crowsnest,"In late summer of 2011, five young friends on ...",[],[],[],[]


In [95]:

mask = clean_df['title'].map(lambda x: (len(x)==0))   

clean_df[mask]

Unnamed: 0,id,title,overview,genres,keywords,cast,crew


In [96]:
clean_df['genres'] = clean_df['genres'].astype(str).str.replace(' ', '', regex=True)
clean_df['cast'] = clean_df['cast'].astype(str).str.replace(' ', '', regex=True)
clean_df['keywords'] = clean_df['keywords'].astype(str).str.replace(' ', '', regex=True)
clean_df['crew'] = clean_df['crew'].astype(str).str.replace(' ', '', regex=True)


In [97]:
clean_df 



Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","['Action','Adventure','Fantasy']","['cultureclash','future','spacewar']","['SamWorthington','ZoeSaldana','SigourneyWeaver']","['StephenE.Rivkin','RickCarter','ChristopherBo..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","['Adventure','Fantasy','Action']","['ocean','drugabuse','exoticisland']","['JohnnyDepp','OrlandoBloom','KeiraKnightley']","['DariuszWolski','GoreVerbinski','JerryBruckhe..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"['Action','Adventure','Crime']","['spy','basedonnovel','secretagent']","['DanielCraig','ChristophWaltz','LéaSeydoux']","['ThomasNewman','SamMendes','AnnaPinnock']"
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"['Action','Crime','Drama']","['dccomics','crimefighter','terrorist']","['ChristianBale','MichaelCaine','GaryOldman']","['HansZimmer','CharlesRoven','ChristopherNolan']"
4,49529,John Carter,"John Carter is a war-weary, former military ca...","['Action','Adventure','ScienceFiction']","['basedonnovel','mars','medallion']","['TaylorKitsch','LynnCollins','SamanthaMorton']","['AndrewStanton','AndrewStanton','JohnLasseter']"
...,...,...,...,...,...,...,...
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"['Action','Crime','Thriller']","['unitedstates–mexicobarrier','legs','arms']","['CarlosGallardo','JaimedeHoyos','PeterMarquar...","['RobertRodriguez','RobertRodriguez','RobertRo..."
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"['Comedy','Romance']",[],"['EdwardBurns','KerryBishé','MarshaDietlein']","['EdwardBurns','EdwardBurns','EdwardBurns']"
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","['Comedy','Drama','Romance']","['date','loveatfirstsight','narration']","['EricMabius','KristinBooth','CrystalLowe']","['CarlaHetland','HarveyKahn','AdamSliwinski']"
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"['DanielHenney','ElizaCoupe','BillPaxton']","['DanielHsia','DanielHsia']"


In [98]:
clean_df.drop(columns=['id'])

Unnamed: 0,title,overview,genres,keywords,cast,crew
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","['Action','Adventure','Fantasy']","['cultureclash','future','spacewar']","['SamWorthington','ZoeSaldana','SigourneyWeaver']","['StephenE.Rivkin','RickCarter','ChristopherBo..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","['Adventure','Fantasy','Action']","['ocean','drugabuse','exoticisland']","['JohnnyDepp','OrlandoBloom','KeiraKnightley']","['DariuszWolski','GoreVerbinski','JerryBruckhe..."
2,Spectre,A cryptic message from Bond’s past sends him o...,"['Action','Adventure','Crime']","['spy','basedonnovel','secretagent']","['DanielCraig','ChristophWaltz','LéaSeydoux']","['ThomasNewman','SamMendes','AnnaPinnock']"
3,The Dark Knight Rises,Following the death of District Attorney Harve...,"['Action','Crime','Drama']","['dccomics','crimefighter','terrorist']","['ChristianBale','MichaelCaine','GaryOldman']","['HansZimmer','CharlesRoven','ChristopherNolan']"
4,John Carter,"John Carter is a war-weary, former military ca...","['Action','Adventure','ScienceFiction']","['basedonnovel','mars','medallion']","['TaylorKitsch','LynnCollins','SamanthaMorton']","['AndrewStanton','AndrewStanton','JohnLasseter']"
...,...,...,...,...,...,...
4804,El Mariachi,El Mariachi just wants to play his guitar and ...,"['Action','Crime','Thriller']","['unitedstates–mexicobarrier','legs','arms']","['CarlosGallardo','JaimedeHoyos','PeterMarquar...","['RobertRodriguez','RobertRodriguez','RobertRo..."
4805,Newlyweds,A newlywed couple's honeymoon is upended by th...,"['Comedy','Romance']",[],"['EdwardBurns','KerryBishé','MarshaDietlein']","['EdwardBurns','EdwardBurns','EdwardBurns']"
4806,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","['Comedy','Drama','Romance']","['date','loveatfirstsight','narration']","['EricMabius','KristinBooth','CrystalLowe']","['CarlaHetland','HarveyKahn','AdamSliwinski']"
4807,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"['DanielHenney','ElizaCoupe','BillPaxton']","['DanielHsia','DanielHsia']"


In [101]:
!pip install ace_tools



In [102]:
# List of columns to merge
columns_to_merge = ["overview", "genres", "keywords", "cast", "crew"]

# Create the 'tags' column by merging the selected columns (assuming spaces between elements)
clean_df["tags"] = clean_df[columns_to_merge].astype(str).agg(' '.join, axis=1)

# Drop all other columns except 'title' and 'tags'
clean_df = clean_df[['title', 'tags']]

# Display the final DataFrame
import ace_tools as tools
tools.display_dataframe_to_user(name="Processed Data", dataframe=clean_df)

KeyError: "None of [Index(['overview', 'genres', 'keywords', 'cast', 'crew'], dtype='object')] are in the [columns]"

In [103]:
clean_df 

Unnamed: 0,title,tags
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."
...,...,...
4804,El Mariachi,El Mariachi just wants to play his guitar and ...
4805,Newlyweds,A newlywed couple's honeymoon is upended by th...
4806,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic..."
4807,Shanghai Calling,When ambitious New York attorney Sam is sent t...


In [107]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(clean_df["tags"])

model = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='cosine')
model.fit(tfidf_matrix)

def recommend_movies(movie_name):
    movie_index = clean_df[clean_df['title'] == movie_name].index[0]
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
    similar_movies = [(clean_df['title'][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]




In [112]:
input_movie = "How to Train Your Dragon"
recommend_movies(input_movie)

print("Film recommendations '{}'".format(input_movie))
for movie, distance in recommend_movies(input_movie):
    print(f"{movie} {distance:.2f}")

Film recommendations 'How to Train Your Dragon'
How to Train Your Dragon 2 0.74
Dragon Nest: Warriors' Dawn 0.80
Kung Fu Panda 2 0.84
Pete's Dragon 0.85


In [113]:
from pickle import dump

dump(model, open("KNN_Movies", "wb"))