In [1]:
!pip3 install neattext
import pandas as pd
import neattext.functions as nfx
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel
from difflib import SequenceMatcher

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

api_token = {"username":"<username>","key":"<api_key>"}

import json

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json
     

In [3]:
! kaggle datasets download -d grouplens/movielens-20m-dataset
! unzip movielens-20m-dataset.zip

Downloading movielens-20m-dataset.zip to /content
100% 195M/195M [00:01<00:00, 135MB/s]
100% 195M/195M [00:01<00:00, 144MB/s]
Archive:  movielens-20m-dataset.zip
  inflating: genome_scores.csv       
  inflating: genome_tags.csv         
  inflating: link.csv                
  inflating: movie.csv               
  inflating: rating.csv              
  inflating: tag.csv                 


In [4]:
movie_df = pd.read_csv('/content/movie.csv')
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


**Similarity Index By Genere**

In [78]:
def FindSimilarity(string1,string2):
  s = SequenceMatcher(None,string1,string2)
  return s.ratio()

def SortByGenereSimilarity(genere,dfx):
  similarity = []
  for index, row in dfx.iterrows():
    similarity.append(FindSimilarity(row['genres'],genere))
  df2 = dfx.assign(similar=similarity).sort_values('similar',ascending=False)
  return df2
df = movie_df[0:5]
genere = "Adventure|Animation|Children|Comedy|Fantasy"
SortByGenereSimilarity(genere,df)

Unnamed: 0,movieId,title,genres,similar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0.753623
2,3,Grumpier Old Men (1995),Comedy|Romance,0.315789
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0.285714
4,5,Father of the Bride Part II (1995),Comedy,0.244898


In [2]:
class RecommendMovies:
  def __init__(self):
    #create a dataframe 
    df = pd.read_csv('/content/movie.csv')

    #drop duplicates from the course_title column
    df = df.drop_duplicates(subset=['title'])
    # clean_title column to string from course_title
    df['clean_title'] = df['title'].astype(str)
    # clean_title remove stopwords
    df['clean_title'] = df['clean_title'].apply(nfx.remove_stopwords)
    # clean_title remove special characters
    df['clean_title'] = df['clean_title'].apply(nfx.remove_special_characters)

    # create CountVectorizer
    countvect = CountVectorizer()
    cv_mat = countvect.fit_transform(df['clean_title'])

    # create CV words
    df_cv_words = pd.DataFrame(cv_mat.todense(),columns=countvect.get_feature_names())
    self.df = df
    self.cosine_sim_mat = cosine_similarity(cv_mat)
    print("Data loaded Successfully")

  def FindSimilarity(self,string1,string2):
    s = SequenceMatcher(None,string1,string2)
    return s.ratio()

  def SortByGenereSimilarity(self,genere,dfx):
    similarity = []
    for index, row in dfx.iterrows():
      similarity.append(self.FindSimilarity(row['genres'],genere))
    df2 = dfx.assign(similar=similarity).sort_values('similar',ascending=False)
    return df2

  def autocomplete(self,query):
    result = self.df.loc[self.df['clean_title'].str.contains(query, case=False)]
    data = []
    for index, row in result[0:5].iterrows():
      data.append({
          "key":index,
          "value":row["clean_title"],
          "genres":row["genres"]
      })
    return data


  def recommend_movie(self,title,numrec = 10):
    movie_index = pd.Series( self.df.index, index=self.df['clean_title']).drop_duplicates()
    if title in movie_index:
      index = movie_index[title]
      genere = self.df.iloc[index]['genres']
      scores = list(enumerate(self.cosine_sim_mat [index]))
      sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
      selected_course_index = [i[0] for i in sorted_scores[1:]]
      selected_course_score = [i[1] for i in sorted_scores[1:]]
      rec_df = self.df.iloc[selected_course_index]
      rec_df['Similarity_Score'] = selected_course_score
      final_recommended_courses = rec_df[['title', 'genres', 'movieId']]
      final_recommended_courses = final_recommended_courses[0:numrec]
      return self.SortByGenereSimilarity(genere,final_recommended_courses)
      #return {"error":None,"data":final_recommended_courses.head(numrec).to_json(orient ='index')}
    else:
      return {"error": "course with title name "+title+" not found"}

recommendation = RecommendMovies()



Data loaded Successfully


In [4]:
recommendation.autocomplete("toy")

[{'key': 0,
  'value': 'Toy Story 1995',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy'},
 {'key': 1933,
  'value': 'Babes Toyland 1961',
  'genres': 'Children|Fantasy|Musical'},
 {'key': 2168, 'value': 'Toys 1992', 'genres': 'Comedy|Fantasy'},
 {'key': 2395, 'value': 'Dry Cleaning Nettoyage  sec 1997', 'genres': 'Drama'},
 {'key': 2999,
  'value': 'March Wooden Soldiers aka Babes Toyland 1934',
  'genres': 'Children|Comedy|Fantasy'}]



```
[   {
		'key': 0,
		'value': 'Toy Story 1995',
		'genres': 'Adventure|Animation|Children|Comedy|Fantasy'
	},
	{
		'key': 1933,
		'value': 'Babes Toyland 1961',
		'genres': 'Children|Fantasy|Musical'
	},
	{
		'key': 2168,
		'value': 'Toys 1992',
		'genres': 'Comedy|Fantasy'
	},
	{
		'key': 2395,
		'value': 'Dry Cleaning Nettoyage  sec 1997',
		'genres': 'Drama'
	},
	{
		'key': 2999,
		'value': 'March Wooden Soldiers aka Babes Toyland 1934',
		'genres': 'Children|Comedy|Fantasy'
	}
]
```



In [3]:
recommendation.recommend_movie('Toy Story 1995',20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['Similarity_Score'] = selected_course_score


Unnamed: 0,title,genres,movieId,similar
24460,Toy Story Toons: Small Fry (2011),Adventure|Animation|Children|Comedy|Fantasy,115879,1.0
24458,Toy Story Toons: Hawaiian Vacation (2011),Adventure|Animation|Children|Comedy|Fantasy,115875,1.0
3027,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3114,1.0
15401,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,78499,0.945055
12,Balto (1995),Adventure|Animation|Children,13,0.788732
1,Jumanji (1995),Adventure|Children|Fantasy,2,0.753623
21981,Toy Story of Terror (2013),Animation|Children|Comedy,106022,0.735294
25461,Toy Story Toons: Partysaurus Rex (2012),Animation|Children|Comedy,120468,0.735294
25463,Toy Story That Time Forgot (2014),Animation|Children,120474,0.590164
9,GoldenEye (1995),Action|Adventure|Thriller,10,0.411765


In [12]:
recommendation.autocomplete("space")

[{'key': 65,
  'value': 'Lawnmower Man 2 Cyberspace 1996',
  'genres': 'Action|Sci-Fi|Thriller'},
 {'key': 664,
  'value': 'Space Jam 1996',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy|Sci-Fi'},
 {'key': 907,
  'value': '2001 Space Odyssey 1968',
  'genres': 'Adventure|Drama|Sci-Fi'},
 {'key': 1750,
  'value': 'Lost Space 1998',
  'genres': 'Action|Adventure|Sci-Fi'},
 {'key': 1840, 'value': 'Plan 9 Outer Space 1959', 'genres': 'Horror|Sci-Fi'}]

In [14]:
recommendation.recommend_movie('2001 Space Odyssey 1968',20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_df['Similarity_Score'] = selected_course_score


Unnamed: 0,title,genres,movieId,similar
665,Barbarella (1968),Adventure|Comedy|Sci-Fi,674,0.8
14113,Space Odyssey: Voyage to the Planets (2004),Documentary|Drama|Sci-Fi,70828,0.73913
3958,Antitrust (2001),Crime|Drama|Thriller,4052,0.47619
5364,Me Without You (2001),Comedy|Drama,5461,0.411765
1867,Oliver! (1968),Drama|Musical,1951,0.4
3962,"Pledge, The (2001)",Crime|Drama|Mystery|Thriller,4056,0.4
2200,If.... (1968),Drama,2285,0.37037
4649,O (2001),Drama,4745,0.37037
11903,Go (2001),Drama,53406,0.37037
690,Faces (1968),Drama,702,0.37037
