Preprocessing of movie metadata

Data sourced from MovieLens (https://grouplens.org/datasets/movielens/1m/)

In [5]:
import pandas as pd

df = pd.read_csv('./the-movies-dataset/movies_metadata.csv',low_memory=False)
print(df.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [10]:
# We have movie embeddings from a neural network trained on user ratings but they use a different ID system
# 'links.csv' contains the mappings from the ID system used in 'movies_metadata' to that of the embeddings
# we create a dictionary to map between the two
import pickle
with open('G:/My Drive/DublinAI/Mini Projects/chatbot/nn/embeddings_smaller', 'rb') as file:
    embed, movie_to_index = pickle.load(file)
mappings = pd.read_csv('./the-movies-dataset/links.csv',low_memory=False)
mappings = mappings.drop_duplicates(subset=['tmdbId'],keep='first')
mappings.set_index('tmdbId',inplace=True)
to_movieId = mappings.to_dict()['movieId'] 
print('Dictionary: ')
print(to_movieId)

Dictionary: 
{862.0: 1, 8844.0: 2, 15602.0: 3, 31357.0: 4, 11862.0: 5, 949.0: 6, 11860.0: 7, 45325.0: 8, 9091.0: 9, 710.0: 10, 9087.0: 11, 12110.0: 12, 21032.0: 13, 10858.0: 14, 1408.0: 15, 524.0: 16, 4584.0: 17, 5.0: 18, 9273.0: 19, 11517.0: 20, 8012.0: 21, 1710.0: 22, 9691.0: 23, 12665.0: 24, 451.0: 25, 16420.0: 26, 9263.0: 27, 17015.0: 28, 902.0: 29, 37557.0: 30, 9909.0: 31, 63.0: 32, 78802.0: 33, 9598.0: 34, 47018.0: 35, 687.0: 36, 139405.0: 37, 33689.0: 38, 9603.0: 39, 34615.0: 40, 31174.0: 41, 11443.0: 42, 35196.0: 43, 9312.0: 44, 577.0: 45, 11861.0: 46, 807.0: 47, 10530.0: 48, 8391.0: 49, 629.0: 50, 117164.0: 51, 11448.0: 52, 49133.0: 53, 26441.0: 54, 97406.0: 55, 124057.0: 56, 9089.0: 57, 11010.0: 58, 99040.0: 59, 11359.0: 60, 17182.0: 61, 2054.0: 62, 10607.0: 63, 19760.0: 64, 9536.0: 65, 11525.0: 66, 40628.0: 67, 4482.0: 68, 10634.0: 69, 755.0: 70, 11859.0: 71, 28387.0: 72, 48750.0: 73, 20927.0: 74, 36929.0: 75, 9102.0: 76, 124626.0: 77, 27526.0: 78, 9623.0: 79, 46785.0: 80, 4

In [7]:
import re
import ast   
def clean_text(text):
    token = re.sub(r'[\^\\,@\‘?!\.$%_:\-“’“”]', '', text, flags=re.I)
    return token

# drop fields that are not relevant for our application 
df = df[df['adult'] == 'FALSE']
df = df.drop(['adult','homepage','budget','runtime','release_date','original_language','production_countries','production_companies','spoken_languages','video','revenue','status','vote_count'],axis=1)
df = df.dropna(subset=['imdb_id','poster_path'])

# clean up text
print('Text clean up -')
print('Tagline before: ' + str(df['tagline'][1]))
df['tagline'] = df['tagline'].apply(lambda x: clean_text(str(x)).lower())
print('Tagline after: ' + str(df['tagline'][1]))
df['title'] = df['title'].apply(lambda x: clean_text(str(x)).lower())
df['original_title'] = df['original_title'].apply(lambda x: clean_text(str(x)).lower())
df['overview'] = df['overview'].apply(lambda x: clean_text(str(x)).lower())
print('Genres before: ' + str(df['genres'][1]))
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x)) #make into dictionary
df['genres'] = df['genres'].apply(lambda x: ', '.join([d['name'] for d in x]))
print('Genres after: ' + str(df['genres'][1]))
df['imdbURL'] = 'https://www.imdb.com/title/' + df['imdb_id'] + '/'
df['tmdbURL'] = 'https://www.themoviedb.org/movie/' + df['id']
df['ImageURL'] = 'https://image.tmdb.org/t/p/w92' + df['poster_path']
#ratings = pd.read_csv('./the-movies-dataset/ratings.csv',low_memory=False)
df['overview'] = df['overview'].fillna('')
df['genres'] = df['genres'].fillna('') 

Text clean up -
Tagline before: Roll the dice and unleash the excitement!
Tagline after: roll the dice and unleash the excitement
Genres before: [{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]
Genres after: Adventure, Fantasy, Family


In [None]:
# Add column of mapped IDs to dataframe
df = df.astype({'id':'int64'})
df['movieId'] = df['id'].map(to_movieId)
df['newId'] = df['movieId'].map(movie_to_index)
df = df.dropna(subset=['newId'])
df = df.reset_index()
df = df.drop(['index'],axis=1)
a = df['belongs_to_collection'][df['belongs_to_collection'].notnull()]
indices = list(a.index)
for i in indices:
    b = str(a[i]).split(", 'poster_path'")
    df['belongs_to_collection'][i]=ast.literal_eval(b[0]+'}').get('id')

df.to_csv('./the-movies-dataset/df_prep.csv')