## Preprocessing of Movie Metadata

Data sourced from MovieLens (https://grouplens.org/datasets/movielens/1m/)

In [12]:
import pandas as pd

# Load the raw movie metadata table and list its columns so the reader can
# see which fields are available before any are dropped.
METADATA_CSV = './the-movies-dataset/movies_metadata.csv'
df = pd.read_csv(METADATA_CSV, low_memory=False)
print(df.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [19]:
# The movie embeddings (from a neural network trained on user ratings) are
# keyed by a different ID system than 'movies_metadata'. 'links.csv' relates
# the two, so we build a tmdbId -> movieId dictionary from it.
import pickle

# NOTE(review): pickle.load can execute arbitrary code, so only open files you
# trust. The path below is an absolute path on the author's machine and will
# need adjusting elsewhere.
with open('G:/My Drive/DublinAI/Mini Projects/chatbot/nn/embeddings_smaller', 'rb') as file:
    embed, movie_to_index = pickle.load(file)

# One row per tmdbId (first occurrence wins), indexed by tmdbId.
mappings = (
    pd.read_csv('./the-movies-dataset/links.csv', low_memory=False)
    .drop_duplicates(subset=['tmdbId'], keep='first')
    .set_index('tmdbId')
)
to_movieId = mappings['movieId'].to_dict()

print('tmdbId to movieId dictionary, first 2 pairs: ')
first2pairs = dict(list(to_movieId.items())[:2])
print(first2pairs)

tmdbId to movieId dictionary, first 2 pairs: 
{862.0: 1, 8844.0: 2}


In [7]:
import re
import ast   
def clean_text(text):
    """Return `text` with a fixed set of punctuation characters deleted.

    The characters removed are those listed in the character class below:
    caret, backslash, comma, @, ?, !, period, $, %, underscore, colon,
    hyphen and several curly quotation marks. Nothing is substituted in
    their place; letter case and whitespace are left untouched.
    """
    punctuation = re.compile(r'[\^\\,@\‘?!\.$%_:\-“’“”]', flags=re.I)
    return punctuation.sub('', text)

# Keep only non-adult titles. The flag is compared case-insensitively so the
# filter works whether the CSV spells it 'FALSE' or 'False', or pandas parsed
# the column as booleans.
df = df[df['adult'].astype(str).str.upper() == 'FALSE']

# drop fields that are not relevant for our application
df = df.drop(['adult','homepage','budget','runtime','release_date','original_language','production_countries','production_companies','spoken_languages','video','revenue','status','vote_count'],axis=1)

# Rows without an IMDb id or poster cannot produce the URL columns built below.
# .copy() gives an independent frame, so the column assignments that follow do
# not write into a slice of the filtered data.
df = df.dropna(subset=['imdb_id','poster_path']).copy()

# clean up text
print('Text clean up -')
print('Tagline before: ' + str(df.loc[1, 'tagline']))
for text_col in ['tagline', 'title', 'original_title', 'overview']:
    # Fill missing values BEFORE stringifying: str(NaN) is the literal text
    # 'nan', which a fillna('') applied afterwards can no longer detect.
    df[text_col] = df[text_col].fillna('').apply(lambda x: clean_text(str(x)).lower())
print('Tagline after: ' + str(df.loc[1, 'tagline']))

# 'genres' holds the text of a list of {'id': ..., 'name': ...} dicts; reduce
# it to a comma-separated string of the genre names. Anything that is not a
# string (e.g. a missing value) becomes '' instead of raising in literal_eval.
print('Genres before: ' + str(df.loc[1, 'genres']))
df['genres'] = df['genres'].apply(
    lambda x: ', '.join(d['name'] for d in ast.literal_eval(x)) if isinstance(x, str) else ''
)
print('Genres after: ' + str(df.loc[1, 'genres']))

# Derived link columns, built by string concatenation.
df['imdbURL'] = 'https://www.imdb.com/title/' + df['imdb_id'] + '/'
df['tmdbURL'] = 'https://www.themoviedb.org/movie/' + df['id']
df['ImageURL'] = 'https://image.tmdb.org/t/p/w92' + df['poster_path']

Text clean up -
Tagline before: Roll the dice and unleash the excitement!
Tagline after: roll the dice and unleash the excitement
Genres before: [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]
Genres after: Adventure, Fantasy, Family


In [None]:
# Add column of mapped IDs to dataframe
# 'id' is cast to int64 and translated to 'movieId' through the to_movieId
# dictionary, then to 'newId' through movie_to_index.
df = df.astype({'id':'int64'})
df['movieId'] = df['id'].map(to_movieId)
df['newId'] = df['movieId'].map(movie_to_index)

# Rows that did not map to a 'newId' are removed and the remaining rows are
# renumbered 0..n-1.
df = df.dropna(subset=['newId']).reset_index(drop=True)

# Replace every non-null 'belongs_to_collection' entry with its 'id' value.
# Only the text before ", 'poster_path'" is parsed (with the closing brace put
# back), so the remainder of the entry is never evaluated.
collections = df['belongs_to_collection'].dropna()
for row, raw in collections.items():
    head = str(raw).split(", 'poster_path'")[0]
    # .at writes directly into df. The chained form df[col][row] = value
    # assigns through an intermediate Series, which raises
    # SettingWithCopyWarning and leaves df unchanged under copy-on-write.
    df.at[row, 'belongs_to_collection'] = ast.literal_eval(head + '}').get('id')

df.to_csv('./the-movies-dataset/df_prep.csv')