In [1]:
import pandas as pd
import numpy as np
import os
import re
import json
import glob as glob

In [2]:
# Directory holding the MovieLens CSV files. Set the MOVIELENS_DATA_PATH
# environment variable to point somewhere else; the previously hard-coded
# location is kept as the fallback so existing setups keep working.
DATA_PATH = os.environ.get('MOVIELENS_DATA_PATH', 'C:/Users/Warrior/Code/Data/data/movielens')

In [3]:
# Preview the CSV files found directly under DATA_PATH.
glob.glob(DATA_PATH + '/*.csv')

['C:/Users/Warrior/Code/Data/data/movielens\\links.csv',
 'C:/Users/Warrior/Code/Data/data/movielens\\movies.csv',
 'C:/Users/Warrior/Code/Data/data/movielens\\ratings.csv',
 'C:/Users/Warrior/Code/Data/data/movielens\\tags.csv']

In [4]:
# glob returns matches in whatever order the file system yields them, so sort
# the paths to make the load order stable from run to run.
FILES = sorted(glob.glob(DATA_PATH + '/*.csv'))

In [5]:
# Load every CSV in FILES into a DataFrame, keyed by a singular table name
# ('links.csv' -> 'link', 'movies.csv' -> 'movie', ...).
df = dict()
for file in FILES:
    # basename/splitext work with the separator of the current platform; the
    # previous code searched for a literal backslash, which raises ValueError
    # for any path that does not contain one (e.g. on Linux/macOS).
    stem = os.path.splitext(os.path.basename(file))[0]
    # Drop the plural 's' so the keys stay 'link', 'movie', 'rating', 'tag'.
    name = stem[:-1] if stem.endswith('s') else stem
    df[name] = pd.read_csv(file, encoding='utf-8')

In [6]:
# Display the dict of loaded DataFrames (one entry per CSV file).
df

{'link':       movieId   imdbId    tmdbId
 0           1   114709     862.0
 1           2   113497    8844.0
 2           3   113228   15602.0
 3           4   114885   31357.0
 4           5   113041   11862.0
 ...       ...      ...       ...
 9120   162672  3859980  402672.0
 9121   163056  4262980  315011.0
 9122   163949  2531318  391698.0
 9123   164977    27660  137608.0
 9124   164979  3447228  410803.0
 
 [9125 rows x 3 columns],
 'movie':       movieId                                              title  \
 0           1                                   Toy Story (1995)   
 1           2                                     Jumanji (1995)   
 2           3                            Grumpier Old Men (1995)   
 3           4                           Waiting to Exhale (1995)   
 4           5                 Father of the Bride Part II (1995)   
 ...       ...                                                ...   
 9120   162672                                Mohenjo Daro (2016

In [7]:
# Show the table names the CSV files were registered under.
df.keys()

dict_keys(['link', 'movie', 'rating', 'tag'])

In [8]:
# Combine the movie table with the link table, matching rows on movieId.
joined = df['movie'].merge(df['link'], on='movieId')

In [9]:
# Per-movie mean of every column in the rating table, indexed by movieId.
# Uses the built-in groupby mean: passing np.mean to .agg() is deprecated in
# pandas >= 2.1 and emits a FutureWarning, while the result is the same.
df_avg_ratings = df['rating'].groupby('movieId').mean()

In [10]:
# Inspect the per-movie averages computed above.
df_avg_ratings

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,338.558704,3.872470,1.103116e+09
2,318.906542,3.401869,1.069321e+09
3,374.423729,3.161017,9.662429e+08
4,355.538462,2.384615,9.277797e+08
5,320.785714,3.267857,9.967201e+08
...,...,...,...
161944,287.000000,5.000000,1.470168e+09
162376,73.000000,4.500000,1.474256e+09
162542,611.000000,5.000000,1.471521e+09
162672,611.000000,3.000000,1.471524e+09


In [11]:
# Add the per-movie averages to the joined table, matching on movieId.
joined = joined.merge(df_avg_ratings, on='movieId')

In [12]:
# Column names of the link table.
df['link'].columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [26]:
# Column names of the tag table.
df['tag'].columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [27]:
# Peek at ten random tag rows; the fixed seed makes the preview reproducible
# across re-runs.
df['tag'].sample(10, random_state=0)

Unnamed: 0,userId,movieId,tag,timestamp
311,346,7022,explosions,1159734326
522,364,115617,pixar,1444530397
390,364,1732,Jeff Bridges,1444535201
44,132,27255,No progress,1283581045
358,364,318,narrated,1444529829
616,431,1250,war,1140455073
667,431,6281,collin farrel,1140455383
819,531,35836,funny,1243454588
967,547,7792,holes70s,1342849103
59,138,4226,psychology,1440380115


In [13]:
# NOTE(review): the link table was already merged in when `joined` was first
# built, so this second merge duplicates imdbId/tmdbId as *_x / *_y columns.
# It is kept because the knowledge-base cell reads 'imdbId_x'.
joined = joined.merge(df['link'], on='movieId')

In [14]:
# Column names of the joined table after all merges.
joined.columns

Index(['movieId', 'title', 'genres', 'imdbId_x', 'tmdbId_x', 'userId',
       'rating', 'timestamp', 'imdbId_y', 'tmdbId_y'],
      dtype='object')

In [15]:
# Row index of the joined table (iterated when building the knowledge base).
joined.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            9056, 9057, 9058, 9059, 9060, 9061, 9062, 9063, 9064, 9065],
           dtype='int64', length=9066)

In [16]:
# Peek at two random joined rows; the fixed seed makes the preview
# reproducible across re-runs.
joined.sample(2, random_state=0)

Unnamed: 0,movieId,title,genres,imdbId_x,tmdbId_x,userId,rating,timestamp,imdbId_y,tmdbId_y
6224,37741,Capote (2005),Crime|Drama,379725,398.0,428.0,3.921053,1246055000.0,379725,398.0
5842,26776,Porco Rosso (Crimson Pig) (Kurenai no buta) (1...,Adventure|Animation|Comedy|Fantasy|Romance,104652,11621.0,350.5,3.916667,1393564000.0,104652,11621.0


In [17]:
# Compiled once; the capture group holds the four digits between the parentheses.
_YEAR_PATTERN = re.compile(r'\(([0-9]{4})\)')


def parse_name_year(string):
    """Split a title such as 'Hide and Seek (2005)' into (name, year).

    The first parenthesised four-digit number in ``string`` is taken as the
    year. Everything before it becomes the name; anything after it is dropped.

    Args:
        string: Raw title text.

    Returns:
        A ``(name, year)`` tuple. ``name`` has surrounding whitespace removed
        and ``year`` is an ``int``. When no ``(YYYY)`` group is present,
        ``name`` is the whole stripped string and ``year`` is ``None``.
    """
    found = _YEAR_PATTERN.search(string)
    if found is None:
        return string.strip(), None
    # Slice right up to the opening parenthesis and let strip() remove the
    # separating space. Cutting at start() - 1 lost the last character of a
    # name not followed by a space and, for a match at index 0, turned into
    # string[:-1], which returned most of the year text as the name.
    name = string[:found.start()].strip()
    return name, int(found.group(1))

In [18]:
# Sanity check: a title that carries a '(YYYY)' suffix.
parse_name_year('Hide and Seek (2005)')

('Hide and Seek', 2005)

In [19]:
# Sanity check: a title without a year yields None for the year.
parse_name_year('Hide and Seek')

('Hide and Seek', None)

In [20]:
# Normalise genre labels to lower case. The vectorised .str accessor replaces
# the per-row Python lambda and passes missing values through instead of
# raising on them.
joined['genres'] = joined['genres'].str.lower()

In [21]:
# Build one record per row of `joined`:
#   name / year  - parsed from the title (year is omitted when absent)
#   id           - movieId
#   rating       - value of the 'rating' column
#   imdb         - imdbId_x
#   genre        - the '|'-separated genres string split into a list
knowledge_base = {'movie': []}
row_fields = zip(
    joined['title'],
    joined['movieId'],
    joined['rating'],
    joined['imdbId_x'],
    joined['genres'],
)
for title, movie_id, rating, imdb_id, genres in row_fields:
    parsed_name, parsed_year = parse_name_year(title)
    record = {'name': parsed_name}
    if parsed_year:
        record['year'] = parsed_year
    record['id'] = int(movie_id)
    record['rating'] = float(rating)
    record['imdb'] = int(imdb_id)
    record['genre'] = genres.split('|')
    knowledge_base['movie'].append(record)

In [22]:
# Serialise the knowledge base as indented JSON straight into the file.
with open('knowledge_data.json', mode='w', encoding='utf-8') as out_file:
    json.dump(knowledge_base, out_file, indent=4)

In [23]:
# Write every movie name on its own line, in knowledge-base order.
movie_list = []
for entry in knowledge_base['movie']:
    movie_list.append(entry['name'])
with open('movies.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write('\n'.join(movie_list))

In [24]:
# Collect the distinct genre labels across all movies and write them one per
# line. Iteration order of a set of strings varies between interpreter runs,
# so the labels are sorted to keep genres.txt identical across re-runs.
genre_list = {g for m in knowledge_base['movie'] for g in m['genre']}
with open('genres.txt', 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(sorted(genre_list)))