In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


/Users/chase/Masters/csml/final_project


In [85]:
# Movie Preprocessing

# imports
import pandas as pd
import math
import shutil
# Dataset locations
movielens_path = "ml-32m/"
tmdb_path = "tmdb/"

# Grab both movies datasets
df_ml_movie = pd.read_csv(movielens_path+"movies.csv")
df_tmdb_movie = pd.read_csv(tmdb_path+"movies.csv")
df_posters = pd.read_csv("tmdb/posters.csv")
df_ratings = pd.read_csv(movielens_path+"ratings.csv", usecols=['userId', 'movieId', 'rating'])

# Process two dataframes to make titles in similar formats

def movie_with_year(row):
    year = row["date"]
    if math.isnan(year):
        year = "0000"
    else:
        year = str(int(year))
    return str(row['title']) + " (" + year + ")"

def name_to_title(row):
    return row['name']

def move_the_a(row):
    current_title = str(row['title'])
    year = current_title[-7:]
    index = current_title.find(", The")
    if index != -1:
        current_title = "The " + current_title[:index] + year
    index = current_title.find(", A")
    if index != -1:
        current_title = "A " + current_title[:index] + year
    index = current_title.find(", La")
    if index != -1:
        current_title = "La " + current_title[:index] + year
    index = current_title.find(", El")
    if index != -1:
        current_title = "El " + current_title[:index] + year
    index = current_title.find(", Les")
    if index != -1:
        current_title = "Les " + current_title[:index] + year
    return current_title

def seven(row):
    title = str(row["title"])
    index = title.find("Se7en) (1995)")
    if index != -1:
        title = "Se7en (1995)"
    return title

def remove_paren(row):
    title = str(row["title"])
    year = title[-6:]
    title_no_year = title[:-7]
    index = title_no_year.find("(")
    if index != -1:
        title = title_no_year[:index] + year
    return title

def tmdb_cleanup(row):
    # Star Wars Films
    title = str(row["title"])
    index = title.find("Star Wars (1977)")
    if index != -1:
        title = "Star Wars: Episode IV - A New Hope (1977)"
    index = title.find("The Empire Strikes Back")
    if index != -1:
        title = "Star Wars: Episode V - The Empire Strikes Back (1980)"
    index = title.find("Return of the Jedi")
    if index != -1:
        title = "Star Wars: Episode VI - Return of the Jedi (1983)"
    index = title.find("Empire of Dreams: The Story of the Star Wars Trilogy")
    if index != -1:
        title = "Empire of Dreams: The Story of the 'Star Wars' Trilogy (2004)"

    # Harry Potter
    index = title.find("Harry Potter and the Philosopher's Stone")
    if index != -1:
        title = "Harry Potter and the Sorcerer's Stone (2001)"

    # X-Men
    index = title.find("X2 (2003)")
    if index != -1:
        title = "X2: X-Men United (2003)"

    index = title.find("The 40 Year Old Virgin (2005)")
    if index != -1:
        title = "The 40-Year-Old Virgin (2005)"

    index = title.find("Wallace & Gromit: The Curse of the Were-Rabbit (2005)")
    if index != -1:
        title = "Wallace & Gromit in The Curse of the Were-Rabbit (2005)"

    index = title.find("The Hitchhiker's Guide to the Galaxy (1981)")
    if index != -1:
        title = "The Hitch Hikers Guide to the Galaxy (1981)"

    return title

def ml_cleanup(row):
    title = str(row["title"])
    index = title.find("Harry Potter and the Philosopher's Stone")
    if index != -1:
        title = "Harry Potter and the Sorcerer's Stone (2001)"
    return title

# Format TMDB data
df_tmdb_movie["title"] = df_tmdb_movie.apply(name_to_title, axis=1)
df_tmdb_movie["title"] = df_tmdb_movie.apply(movie_with_year, axis=1)
df_tmdb_movie["title"] = df_tmdb_movie.apply(tmdb_cleanup, axis=1)
df_tmdb_movie.drop("name", axis=1, inplace=True)

# Format MovieLens Data
df_ml_movie["title"] = df_ml_movie.apply(move_the_a, axis=1)
df_ml_movie["title"] = df_ml_movie.apply(seven, axis=1)
df_ml_movie["title"] = df_ml_movie.apply(ml_cleanup, axis=1)
df_ml_movie["title"] = df_ml_movie.apply(remove_paren, axis=1)

# Format Posters Data
def change_id_format(row):
    return row['id']

df_posters['tmdb_id'] = df_posters.apply(change_id_format, axis=1)
df_posters.drop("id", axis=1, inplace=True)

# Ratings Counts for popular movies later on 
counts = df_ratings['movieId'].value_counts().reset_index()
counts['ml_movieId'] = counts['movieId']
counts['ml_rating_count'] = counts['count']
counts.drop("movieId", axis=1, inplace=True)
counts.drop("count", axis=1, inplace=True)

# rename columns to differentiate the two datasets
df_ml_movie.rename(columns=
 {'movieId': 'ml_movieId',
  'genres' : 'ml_genres'
 }, inplace=True)
df_tmdb_movie.rename(columns=
 {'id': 'tmdb_id',
  'date' : 'tmdb_date',
  'tagline' : 'tmdb_tagline',
  'description' : 'tmdb_description',
  'minute': 'tmdb_minute',
  'rating': 'tmdb_rating',
  }, inplace=True)

df_movies = pd.merge(df_ml_movie, df_tmdb_movie, on='title', how='left')
df_movies = df_movies.drop_duplicates(subset='ml_movieId', keep='first')
df_movies = pd.merge(df_movies, df_posters, on='tmdb_id', how='left')
df_movies = pd.merge(df_movies, counts, on="ml_movieId", how='left')

counts = df_ratings['movieId'].value_counts()
# df_movies
df_movies.to_csv('movie_info.csv', index=False)

# Minimize data
counts = df_ratings['movieId'].value_counts().reset_index()
counts['ml_rating_count'] = counts['count']
counts.drop("count", axis=1, inplace=True)

df_ratings = pd.merge(df_ratings, counts, on="movieId", how='left')
df_ratings_min = df_ratings.loc[df_ratings['ml_rating_count'] >= 50]

df_movies_min = df_movies.loc[df_movies['ml_rating_count'] >= 50]

print("Movie Info Count Before Low Review Removal:",df_movies['ml_movieId'].nunique())
print("Movie Info Count After Low Review Removal:",df_movies_min['ml_movieId'].nunique())

print("Rating Movie Count Before Low Review Removal:",df_ratings['movieId'].nunique())
print("Rating Movie Count After Low Review Removal:",df_ratings_min['movieId'].nunique())

df_ratings_min.drop("ml_rating_count", axis=1, inplace=True)

df_ratings_min.to_csv("ml32_rating.csv", index=False)
df_movies_min.to_csv("ml32_movie_info.csv", index=False)

shutil.copyfile('ml32_movie_info.csv', 'django/mynextmovienet/web/ml32_movie_info.csv')
shutil.copyfile('ml32_rating.csv', 'django/mynextmovienet/web/ml32_rating.csv')

df_genres = df_movies[['ml_movieId','ml_genres']]
df_genres['movieId'] = df_movies['ml_movieId']
df_genres.drop("ml_movieId", axis=1, inplace=True)
df_ratings_min = pd.merge(df_ratings_min, df_genres, on="movieId", how="left")
df_ratings_min.to_csv("ml32_rating_genre.csv", index=False)

Movie Info Count Before Low Review Removal: 87585
Movie Info Count After Low Review Removal: 16034
Rating Movie Count Before Low Review Removal: 84432
Rating Movie Count After Low Review Removal: 16034


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ratings_min.drop("ml_rating_count", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres['movieId'] = df_movies['ml_movieId']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres.drop("ml_movieId", axis=1, inplace=True)


In [48]:
df_posters = pd.read_csv("tmdb/posters.csv")

def change_id_format(row):
    return row['id']

df_posters['tmdb_id'] = df_posters.apply(change_id_format, axis=1)
df_posters.drop("id", axis=1, inplace=True)
df_posters

Unnamed: 0,link,tmdb_id
0,https://a.ltrbxd.com/resized/film-poster/2/7/7...,1000001
1,https://a.ltrbxd.com/resized/film-poster/4/2/6...,1000002
2,https://a.ltrbxd.com/resized/film-poster/4/7/4...,1000003
3,https://a.ltrbxd.com/resized/film-poster/5/1/5...,1000004
4,https://a.ltrbxd.com/resized/film-poster/2/4/0...,1000005
...,...,...
941592,,1941593
941593,,1941594
941594,https://a.ltrbxd.com/resized/film-poster/1/1/8...,1941595
941595,https://a.ltrbxd.com/resized/film-poster/1/1/8...,1941596


In [51]:
df_movies = pd.read_csv("movie_info.csv")
df_movies

Unnamed: 0,ml_movieId,title,ml_genres,tmdb_id,tmdb_date,tmdb_tagline,tmdb_description,tmdb_minute,tmdb_rating,link
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1000134.0,1995.0,,"Led by Woody, Andy's toys live happily in his ...",81.0,4.12,https://a.ltrbxd.com/resized/film-poster/5/1/2...
1,2,Jumanji (1995),Adventure|Children|Fantasy,1000695.0,1995.0,Roll the dice and unleash the excitement!,When siblings Judy and Peter discover an encha...,104.0,3.61,https://a.ltrbxd.com/resized/sm/upload/7j/8o/r...
2,3,Grumpier Old Men (1995),Comedy|Romance,1014477.0,1995.0,Still Yelling. Still Fighting. Still Ready for...,A family wedding reignites the ancient feud be...,101.0,3.11,https://a.ltrbxd.com/resized/film-poster/4/2/2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1008798.0,1995.0,Friends are the people who let you be yourself...,"Cheated on, mistreated and stepped on, the wom...",127.0,3.35,https://a.ltrbxd.com/resized/film-poster/3/1/2...
4,5,Father of the Bride Part II (1995),Comedy,1007066.0,1995.0,Just when his world is back to normal... he's ...,Just when George Banks has recovered from his ...,106.0,3.10,https://a.ltrbxd.com/resized/film-poster/4/4/9...
...,...,...,...,...,...,...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,,,,,,,
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,1102539.0,2002.0,,Two friends who are doing civil service flee t...,83.0,2.54,https://a.ltrbxd.com/resized/film-poster/4/7/3...
27275,131258,The Pirates (2014),Adventure,1022607.0,2014.0,,"At the cusp of the founding of Joseon Dynasty,...",130.0,3.06,https://a.ltrbxd.com/resized/film-poster/2/1/0...
27276,131260,Rentun Ruusu (2001),(no genres listed),,,,,,,


In [58]:
df_movies = pd.read_csv("movie_info.csv")
df_movie_min = df_movies.loc[df_movies['tmdb_rating'] >= 3]
df_movie_min.dropna(inplace=True)
df_movie_min
movie_sample = df_movie_min.sample(n=5)
print(movie_sample)

       ml_movieId                          title             ml_genres  \
11261       47707               My Geisha (1962)                Comedy   
10724       42546     Passport to Pimlico (1949)                Comedy   
19094       94974  Superman vs. The Elite (2012)      Action|Animation   
17553       88339          The Four Times (2010)                 Drama   
2092         2176                    Rope (1948)  Crime|Drama|Thriller   

         tmdb_id  tmdb_date  \
11261  1063473.0     1962.0   
10724  1020009.0     1949.0   
19094  1012020.0     2012.0   
17553  1011941.0     2010.0   
2092   1001017.0     1948.0   

                                            tmdb_tagline  \
11261  It's the WACKIEST HOAX That Ever Turned the Sc...   
10724           French Goings-On in the Heart of London.   
19094                        World saved. Humanity lost.   
17553                 Human. Animal. Vegetable. Mineral.   
2092                       The guest who's dead on time.   

       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie_min.dropna(inplace=True)


In [7]:
df_train = pd.read_csv("model/train.csv")
df_test = pd.read_csv("model/test.csv")
df_train

Unnamed: 0,userId,movieId,rating
0,90232,2657,2.0
1,125622,327,1.5
2,36291,44191,3.5
3,111029,2763,0.5
4,113449,832,4.0
...,...,...,...
16000205,18706,4029,2.0
16000206,116198,106918,4.5
16000207,31995,2082,4.0
16000208,119046,1961,4.0


In [50]:
import numpy as np
import pandas as pd

df_ratings = pd.read_csv("min_rating.csv")
number_of_splits = 5
for i, df in enumerate(np.array_split(df_ratings, number_of_splits)):
    df.to_csv(f"min_rating_{i + 1}.csv", index=False)
    shutil.copyfile(f"min_rating_{i + 1}.csv", f'django/mynextmovienet/web/min_rating_{i + 1}.csv')

  return bound(*args, **kwds)


In [49]:
import pandas as pd
import glob
import os

df_list = []

csv_files = glob.glob('min_rating_[1-5].csv')
for file in sorted(csv_files):
    df = pd.read_csv(file)
    df_list.append(df)
df_ratings = pd.concat(df_list, ignore_index=True)
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
19847942,138493,68954,4.5
19847943,138493,69526,4.5
19847944,138493,69644,3.0
19847945,138493,70286,5.0


In [46]:
# Braveheart, Terminator 2, Judgement Day, Jurassic Park, Mission:Impossible
print("Action/Adventure")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --genre --prefs "110,589,2571,480,648"
# The Cable Guy, The Nutty Professor, Father of the Bride, Monty Python's Life of Brian, Weekend at Bernie's
print("Comedy")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --genre --prefs "784,788,934,1080,1091"
# Pride and Prejudice, The Miracle Worker, Ray, Great Expectations, Grand Prix
print("Drama")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --genre --prefs "7669,7619,8958,25923,26153"
# Toy Story, Winnie the Pooh and the Blustery Day, Lemonade Mouth, The Baby-Sitters Club, Cinderella
print("Children's")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --genre --prefs "1,1023,132012,343,258671"

Action/Adventure
---------------------------------
True
Number of users: 200947
Number of movies: 16034
Number of genres: 20

Preference: 
 Movie IDs: [110, 589, 2571, 480, 648]
Movie Names: ['Braveheart (1995)', 'Jurassic Park (1993)', 'Terminator 2: Judgment Day (1991)', 'Mission: Impossible (1996)', 'The Matrix (1999)']
Genres: ['Action|Drama|War', 'Action|Adventure|Sci-Fi|Thriller', 'Action|Sci-Fi', 'Action|Adventure|Mystery|Thriller', 'Action|Sci-Fi|Thriller']

Recommendation: 
Movie IDs: [128912, 27843, 172793, 1203, 151769]
Movie names: ['12 Angry Men (1957)', 'Machuca (2004)', "Mike Birbiglia: My Girlfriend's Boyfriend (2013)", 'Three from Prostokvashino (1978)', 'Vovka in the Kingdom of Far Far Away (1965)']
Genres: ['Drama', 'Drama', 'Comedy', 'Animation', 'Adventure|Animation|Children|Fantasy']
Comedy
---------------------------------
True
Number of users: 200947
Number of movies: 16034
Number of genres: 20

Preference: 
 Movie IDs: [784, 788, 934, 1080, 1091]
Movie Names: [

In [47]:
# Braveheart, Terminator 2, Judgement Day, Jurassic Park, Mission:Impossible
print("Action/Adventure")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --no-genre --prefs "110,589,2571,480,648"
# The Cable Guy, The Nutty Professor, Father of the Bride, Monty Python's Life of Brian, Weekend at Bernie's
print("Comedy")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --no-genre --prefs "784,788,934,1080,1091"
# Pride and Prejudice, The Miracle Worker, Ray, Great Expectations, Grand Prix
print("Drama")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --no-genre --prefs "7669,7619,8958,25923,26153"
# Toy Story, Winnie the Pooh and the Blustery Day, Lemonade Mouth, The Baby-Sitters Club, Cinderella
print("Children's")
print("---------------------------------")
!python3 django/mynextmovienet/web/model.py predict --no-genre --prefs "1,1023,132012,343,258671"

Action/Adventure
---------------------------------
False
Number of users: 200947
Number of movies: 16034

Preference: 
 Movie IDs: [110, 589, 2571, 480, 648]
Movie Names: ['Braveheart (1995)', 'Jurassic Park (1993)', 'Terminator 2: Judgment Day (1991)', 'Mission: Impossible (1996)', 'The Matrix (1999)']
Genres: ['Action|Drama|War', 'Action|Adventure|Sci-Fi|Thriller', 'Action|Sci-Fi', 'Action|Adventure|Mystery|Thriller', 'Action|Sci-Fi|Thriller']

Recommendation: 
Movie IDs: [106452, 48268, 1283, 180591, 5167]
Movie names: ['High Noon (1952)', 'My Favorite Brunette (1947)', 'Empire Falls (2005)', 'Ida (2013)', 'Jane (2017)']
Genres: ['Drama|Western', 'Comedy|Mystery', 'Drama|Romance', 'Drama', 'Documentary']
Comedy
---------------------------------
False
Number of users: 200947
Number of movies: 16034

Preference: 
 Movie IDs: [784, 788, 934, 1080, 1091]
Movie Names: ['The Cable Guy (1996)', 'The Nutty Professor (1996)', 'Father of the Bride (1950)', "Monty Python's Life of Brian (1979)

In [48]:
!python3 django/mynextmovienet/web/model.py train --no-genre

False
Omitting genre information in training
tensors processed: 0.01 minutes
split processed: 0.03 minutes
train processed: 0.03 minutes
test processed: 0.03 minutes
Number of users: 200947
Number of movies: 16034
Finished preprocessing, entering training loop: 2.34 seconds
Epoch [1/20], Loss: 1.2215, Train MAE: 0.8586, Val Loss: 0.9663, Val MAE: 0.7863, Train Precision: 0.6500, Val Precision: 0.9000, Time: 12.6362 minutes
Epoch [2/20], Loss: 0.8483, Train MAE: 0.7081, Val Loss: 0.8090, Val MAE: 0.6906, Train Precision: 1.0000, Val Precision: 1.0000, Time: 32.9600 minutes
Epoch [3/20], Loss: 0.7615, Train MAE: 0.6659, Val Loss: 0.7844, Val MAE: 0.6777, Train Precision: 1.0000, Val Precision: 1.0000, Time: 48.8398 minutes
Epoch [4/20], Loss: 0.7383, Train MAE: 0.6540, Val Loss: 0.7870, Val MAE: 0.6815, Train Precision: 1.0000, Val Precision: 1.0000, Time: 68.8130 minutes
Epoch [5/20], Loss: 0.7293, Train MAE: 0.6493, Val Loss: 0.7880, Val MAE: 0.6821, Train Precision: 1.0000, Val Precis

In [69]:
import pandas as pd
df_ratings = pd.read_csv("movielens/rating.csv", usecols=['userId', 'movieId', 'rating'])
df_ratings.to_csv("movielens/rating.csv", index=False)

In [46]:
# preferred_movies_ids = [1, 48, 158, 239, 3114]

!python3 model/predict.py "1,48,158,239,3114"

Recommended movie IDs: [27002, 1269, 364, 71348, 75823, 7403, 90809, 2580, 27239]


In [28]:
import pandas as pd

df_ratings = pd.read_csv("movielens/rating.csv", usecols=['userId', 'movieId', 'rating'])

print(df_ratings.iloc[176])
print(df_ratings['movieId'].iloc[176])
print(sorted(df_ratings.movieId.unique()))
# movie_ids = df_ratings['movieId'].unique().tolist()
# print(len(movie_ids))
# print(movie_ids)

# df_ratings['movieId'] = df_ratings['movieId'].astype('category').cat.codes
# num_users = df_ratings['userId'].nunique()
# num_movies = df_ratings['movieId'].nunique()

# movie_ids = df_ratings['movieId'].unique().tolist()
# print(num_movies)
# print(len(movie_ids))
# print(movie_ids)

userId      2.0
movieId    62.0
rating      5.0
Name: 176, dtype: float64
62
[np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int6

All the papers are on arxiv, so googling with that, and lots of the newer papers reference the older ones
Paperwithcode is also good 

In [99]:
# [27002, 1269, 364, 71348, 75823, 7403, 90809, 2580, 27239]
df_movie_info = pd.read_csv("movie_info.csv", index_col=False)
movie_info = df_movie_info.loc[df_movie_info['ml_movieId'] == 27002]

info_dict = {}

for i, info in enumerate(movie_info):
    info_dict[info] = movie_info.values[0][i]
# for info in movie_info.values[0]:
#     print(info)
print(info_dict)

{'ml_movieId': 27002, 'title': 'From the Earth to the Moon (1998)', 'ml_genres': 'Action|Documentary|Drama|Thriller', 'tmdb_id': 1025905.0, 'tmdb_date': 1998.0, 'tmdb_tagline': nan, 'tmdb_description': "The story of the United States' space program, from its beginnings in 1961 to the final moon mission in 1972.", 'tmdb_minute': 641.0, 'tmdb_rating': 4.06, 'link': 'https://a.ltrbxd.com/resized/film-poster/3/1/5/7/4/31574-from-the-earth-to-the-moon-0-230-0-345-crop.jpg?v=7785ab7894'}


In [106]:
df_movies = pd.read_csv("movie_info.csv")
popular_movies = df_movies.sort_values(by='tmdb_rating', ascending=False)
popular_movies.to_dict('records')

[{'ml_movieId': 7502,
  'title': 'Band of Brothers (2001)',
  'ml_genres': 'Action|Drama|War',
  'tmdb_id': 1001983.0,
  'tmdb_date': 2001.0,
  'tmdb_tagline': 'There was a time when the world asked ordinary men to do extraordinary things.',
  'tmdb_description': "Drawn from interviews with survivors of Easy Company, as well as their journals and letters, Band of Brothers chronicles the experiences of these men from paratrooper training in Georgia through the end of the war. As an elite rifle company parachuting into Normandy early on D-Day morning, participants in the Battle of the Bulge, and witness to the horrors of war, the men of Easy knew extraordinary bravery and extraordinary fear - and became the stuff of legend. Based on Stephen E. Ambrose's acclaimed book of the same name.",
  'tmdb_minute': 594.0,
  'tmdb_rating': 4.69,
  'link': 'https://a.ltrbxd.com/resized/film-poster/2/5/9/0/1/1/259011-band-of-brothers-0-230-0-345-crop.jpg?v=267d271582'},
 {'ml_movieId': 26082,
  'title

In [156]:
df_ratings = pd.read_csv("movielens/rating.csv", usecols=['userId', 'movieId', 'rating'])
counts = df_ratings['movieId'].value_counts().reset_index()
counts['ml_movieId'] = counts['movieId']
counts['ml_rating_count'] = counts['count']
counts.drop("movieId", axis=1, inplace=True)
counts.drop("count", axis=1, inplace=True)
counts.drop("ml_movieId", axis=1, inplace=True)
counts

Unnamed: 0,ml_rating_count
0,67310
1,66172
2,63366
3,63299
4,59715
...,...
26739,1
26740,1
26741,1
26742,1


In [None]:
import pandas as pd
import shutil
df_movies = pd.read_csv("movie_info.csv")
df_ratings = pd.read_csv("movielens/rating.csv", usecols=['userId', 'movieId', 'rating'])

counts = df_ratings['movieId'].value_counts().reset_index()
counts['ml_rating_count'] = counts['count']
counts.drop("count", axis=1, inplace=True)

df_ratings = pd.merge(df_ratings, counts, on="movieId", how='left')
df_ratings_min = df_ratings.loc[df_ratings['ml_rating_count'] >= 50]

df_movies_min = df_movies.loc[df_movies['ml_rating_count'] >= 50]

print("Movie Info Count Before Low Review Removal:",df_movies['ml_movieId'].nunique())
print("Movie Info Count After Low Review Removal:",df_movies_min['ml_movieId'].nunique())

print("Rating Movie Count Before Low Review Removal:",df_ratings['movieId'].nunique())
print("Rating Movie Count After Low Review Removal:",df_ratings_min['movieId'].nunique())

df_ratings_min.drop("ml_rating_count", axis=1, inplace=True)

df_ratings_min.to_csv("min_rating.csv", index=False)
df_movies_min.to_csv("min_movie_info.csv", index=False)

shutil.copyfile('min_movie_info.csv', 'django/mynextmovienet/web/min_movie_info.csv')
shutil.copyfile('min_rating.csv', 'django/mynextmovienet/web/min_rating.csv')

Movie Info Count Before Low Review Removal: 27278
Movie Info Count After Low Review Removal: 10524
Rating Movie Count Before Low Review Removal: 26744
Rating Movie Count After Low Review Removal: 10524


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ratings_min.drop("ml_rating_count", axis=1, inplace=True)


'django/mynextmovienet/web/min_movie_info.csv'

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
df_movie_info = pd.read_csv("min_movie_info.csv")
df_ratings = pd.read_csv("min_rating.csv")
genre_list={}
for genres in df_movie_info['ml_genres']:
    all_genres = genres.split("|")
    for genre in all_genres:
        if genre not in genre_list.keys():
            genre_list[genre] = 1
        else:
            genre_list[genre] += 1
        
print(genre_list)

df_genres = df_movie_info[['ml_movieId','ml_genres']]
df_genres['movieId'] = df_movie_info['ml_movieId']
df_genres.drop("ml_movieId", axis=1, inplace=True)
df_ratings = pd.merge(df_ratings, df_genres, on="movieId", how="left")
df_genres = df_ratings[['ml_genres']]
df_ratings.drop("ml_genres", axis=1, inplace=True)
df_genres['ml_genres'] = df_genres['ml_genres'].str.split('|')
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df_genres['ml_genres'])
genre_tensor = torch.tensor(genre_encoded, dtype=torch.float)
# df_ratings = pd.concat([df_ratings, df_genre], axis=1)

# df_ratings

genre_tensor

{'Adventure': 1190, 'Animation': 435, 'Children': 628, 'Comedy': 3773, 'Fantasy': 706, 'Romance': 1801, 'Drama': 5157, 'Action': 1722, 'Crime': 1277, 'Thriller': 1997, 'Horror': 1038, 'Mystery': 632, 'Sci-Fi': 901, 'IMAX': 163, 'Documentary': 487, 'War': 468, 'Musical': 416, 'Western': 228, 'Film-Noir': 118}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres['movieId'] = df_movie_info['ml_movieId']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres.drop("ml_movieId", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres['ml_genres'] = df_genres['ml_genres'].str.split('|')


tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [25]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.nn.utils.rnn import pad_sequence

def process_genre(genre_string, genre_to_id):
    genre_ids = [genre_to_id[genre] for genre in genre_string.split('|')]
    return torch.tensor(genre_ids, dtype=torch.long)

df_ratings = pd.read_csv("min_rating.csv")
df_movie_info = pd.read_csv("min_movie_info.csv")
# df_movie_info['ml_genres'] = df_movie_info['ml_genres'].str.split('|')
# print(df_movie_info.loc[df_movie_info['ml_movieId'] == 2571].index)
# mlb = MultiLabelBinarizer()
# movie_genre_encoded = mlb.fit_transform(df_movie_info['ml_genres'])
# movie_genre_tensor = torch.tensor(movie_genre_encoded, dtype=torch.float)

# df_genres = df_movie_info[['ml_movieId','ml_genres']]
# df_genres['movieId'] = df_movie_info['ml_movieId']
# df_genres.drop("ml_movieId", axis=1, inplace=True)
# df_ratings = pd.merge(df_ratings, df_genres, on="movieId", how="left")

unique_genres = sorted(set(g for genres in df_movie_info['ml_genres'] for g in genres.split('|')))
genre_to_id = {genre: idx for idx, genre in enumerate(unique_genres)}
# len(movie_genre_tensor)

# 2571 - The Matrix
# 2572
# movie_genre_tensor[2571]

genre_string = "Thriller|Drama"
genre_ids = [genre_to_id[genre] for genre in genre_string.split('|')]
print(genre_ids)
genre_tensor = [process_genre(genres, genre_to_id) for genres in df_movie_info['ml_genres']]
# genre_tensor = torch.tensor(genre_encoded, dtype=torch.long)
padded_genre_tensor = pad_sequence(genre_tensor, batch_first=True, padding_value=0)
padded_genre_tensor[1]


[16, 7]


tensor([1, 3, 8, 0, 0, 0, 0, 0, 0, 0])

In [287]:
def process_genre(genre_string, genre_to_id):
    genre_ids = [genre_to_id[genre] for genre in genre_string.split('|')]
    return torch.tensor(genre_ids, dtype=torch.long)

df_movie_info = pd.read_csv("min_movie_info.csv")
df_genres = df_movie_info[['ml_genres']]
df_genres['ml_genres'] = df_genres['ml_genres'].str.split('|')
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df_genres['ml_genres'])
len(genre_encoded[0])
genre_encoded = [process_genre(genres, genre_to_id) for genres in df_ratings['ml_genres']]
genre_tensor = torch.tensor(genre_encoded, dtype=torch.long)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_genres['ml_genres'] = df_genres['ml_genres'].str.split('|')


KeyError: 'ml_genres'

In [55]:
dr_ratings = pd.read_csv("min_rating.csv")
df_ratings.count()


userId     19847947
movieId    19847947
rating     19847947
dtype: int64