# MovieLens Database

README notes go here.

In [84]:
import os
import pandas as pd
from tqdm import tqdm
import subprocess

# Dataset-size constants; neither is referenced elsewhere in the visible code.
# NOTE(review): presumably the user and movie-ID counts of the MovieLens 1M release — confirm.
NUMBER_USERS = 6040
NUMBER_MOVIES = 3952
# Working directory at start-up: the .dat inputs are read from it and the database file is written to it.
THIS_FOLDER = os.getcwd()

### 1) Import Dataframes

In [58]:
def read_movies_file(file_name):
    """Load a '::'-delimited, header-less movies file into a DataFrame.

    Args:
        file_name: Path (or file-like object) handed to pandas.read_csv.

    Returns:
        DataFrame with columns MovieID, MovieTitle and MovieGenres.
    """
    column_names = ['MovieID', 'MovieTitle', 'MovieGenres']
    # A multi-character separator requires pandas' python parser engine.
    return pd.read_csv(file_name, sep='::', names=column_names, engine='python')

def read_users_file(file_name):
    """Load a '::'-delimited, header-less users file into a DataFrame.

    Args:
        file_name: Path (or file-like object) handed to pandas.read_csv.

    Returns:
        DataFrame with columns UserID, Gender, Age, Occupation and ZipCode.
    """
    column_names = ['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode']
    # A multi-character separator requires pandas' python parser engine.
    return pd.read_csv(file_name, sep='::', names=column_names, engine='python')

def read_ratings_file(file_name):
    """Load a '::'-delimited, header-less ratings file into a DataFrame.

    Args:
        file_name: Path (or file-like object) handed to pandas.read_csv.

    Returns:
        DataFrame with columns UserID, MovieID, Rating and Timestamp.
    """
    column_names = ['UserID', 'MovieID', 'Rating', 'Timestamp']
    # A multi-character separator requires pandas' python parser engine.
    return pd.read_csv(file_name, sep='::', names=column_names, engine='python')

def import_data(directory):
    """Read movies.dat, users.dat and ratings.dat from one directory.

    Args:
        directory: Folder that contains the three .dat files.

    Returns:
        Tuple (movies, users, ratings) of DataFrames, read in that order.
    """
    # Each file name is paired with the reader that knows its column layout.
    sources = (
        ('movies.dat', read_movies_file),
        ('users.dat', read_users_file),
        ('ratings.dat', read_ratings_file),
    )
    return tuple(
        reader(os.path.join(directory, dat_name)) for dat_name, reader in sources
    )

def shuffle_data_frame(data_frame, seed):
    """Return the rows of data_frame in a reproducible random order.

    Args:
        data_frame: DataFrame to shuffle; it is not modified.
        seed: Value passed to DataFrame.sample as random_state. The same seed
            always yields the same order; None gives a different order each call.

    Returns:
        A new DataFrame holding every row of data_frame exactly once, with the
        original index labels kept on their rows.
    """
    # frac=1 samples all rows without replacement, i.e. a permutation.
    return data_frame.sample(frac=1, random_state=seed)

print('Importing files...')
# Load the three tables, then shuffle each one in turn:
# sample(frac=1) returns every row in a random order.
movie_data_frame, user_data_frame, ratings_data_frame = (
    loaded_frame.sample(frac=1) for loaded_frame in import_data(THIS_FOLDER)
)
print('Done!')

Importing files...
Done!
        UserID  MovieID  Rating  Timestamp
769588    4585     3418       4  964322037
126665     817     2396       5  975388066
894808    5404     1693       4  960344887
322251    1912     2686       4  974831897
141567     911     3301       4  975421812
...        ...      ...     ...        ...
143662     927      478       3  975195331
221241    1340      261       1  975128242
115651     750      912       4  975810973
971114    5852     1950       4  958101423
472216    2907     1266       4  984197700

[1000209 rows x 4 columns]


### 2) Settings

In [65]:
# Number of age buckets written per user; write_user_age_predicate_to_file raises ValueError for other values.
num_age_categories =  2                                                 #either 2 or 3
# Number of rating buckets; write_rating_predicate_to_file raises ValueError for other values.
num_rating_categories = 2                                               #either 2 or 3
restrict_genres = True                                                  #if True then only allow movies of restricted genre
# When restrict_genres is True, a movie with any genre outside this list is added to banned_movies.
restricted_genres = ['Romance', 'Horror', 'Animation', 'Documentary']   
alternative_predicate_types = True                                      #if True then Gender(user, gender) instead of Male(user), Female(user) etc
                                                                        #similarly for user age and for movie genre
# Upper limit on how many Rating atoms are written.
number_of_ratings = 1000                                                #integer less than 1000210
database_file_name = 'MovieLensMini.db'
# In-memory mirror of the database file: the writer functions add one row per ground atom they write.
databaseObj = pd.DataFrame(columns=['predicate', 'arguments'])

# Full path of the ground-atom file, placed in the start-up working directory.
database_file = os.path.join(THIS_FOLDER, database_file_name)

### 3) Populate Banned Movies (if restricting genres)

In [98]:
def write_movie_genre_predicate_to_file(genre, movieID, file, suppress_output):
    """Write one genre ground atom for a rated movie and mirror it in databaseObj.

    Nothing happens when suppress_output is true or when movieID is not in the
    module-level rated_movies list.

    Args:
        genre: Genre name taken from the movie's genre list.
        movieID: Movie identifier; written with an 'M' prefix.
        file: Writable text file receiving the atom (unused when suppressed).
        suppress_output: True for a dry run that writes nothing.
    """
    global databaseObj
    # suppress_output is tested first: the dry-run pass is executed before
    # rated_movies has been created.
    if suppress_output or movieID not in rated_movies:
        return

    if alternative_predicate_types:
        predicate, arguments = 'Genre', f"M{movieID},{genre}"
    else:
        predicate, arguments = genre, f"M{movieID}"

    file.write(f"{predicate}({arguments})\n")
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    new_row = pd.DataFrame([{'predicate': predicate, 'arguments': arguments}])
    databaseObj = pd.concat([databaseObj, new_row], ignore_index=True)

def write_movie_to_file(row, file, suppress_output):
    """Process every genre of one movie row.

    With restrict_genres enabled, genres found in restricted_genres are passed
    to write_movie_genre_predicate_to_file and any other genre adds the movie
    to banned_movies. With it disabled, every genre is passed on.

    Args:
        row: Movie record providing 'MovieID' and a '|'-separated 'MovieGenres'.
        file: Writable text file forwarded to the genre writer.
        suppress_output: Forwarded to the genre writer.

    Returns:
        1 if restrict_genres is enabled and at least one genre was in
        restricted_genres, otherwise 0.
    """
    movie_id = row['MovieID']
    matched_restricted_genre = False
    for genre_name in row['MovieGenres'].split('|'):
        if restrict_genres and genre_name not in restricted_genres:
            banned_movies.add(movie_id)
            continue
        write_movie_genre_predicate_to_file(genre_name, movie_id, file, suppress_output)
        if restrict_genres:
            matched_restricted_genre = True

    return 1 if matched_restricted_genre else 0

def write_movie_ground_atoms_to_file(data_frame, file, suppress_output=False):
    """Run write_movie_to_file over every row of the movie table.

    Args:
        data_frame: Movie table with 'MovieID' and 'MovieGenres' columns.
        file: Writable text file forwarded to write_movie_to_file.
        suppress_output: Forwarded unchanged; True makes the pass a dry run.
    """
    # The tally is accumulated but never returned.
    movie_total = 0
    for _, movie_row in data_frame.iterrows():
        movie_total += write_movie_to_file(movie_row, file, suppress_output)
                
def populate_banned_movies(data_frame):
    """Dry-run the movie pass so banned_movies gets filled when genres are restricted.

    No output is produced: the file argument is None and output is suppressed.

    Args:
        data_frame: Movie table with 'MovieID' and 'MovieGenres' columns.
    """
    write_movie_ground_atoms_to_file(data_frame, file=None, suppress_output=True)
    

print('Constructing banned movie set...')
# Must exist before the call below: write_movie_to_file adds movie IDs to this module-level set.
banned_movies = set()
populate_banned_movies(movie_data_frame)
print('Done!')

Constructing banned movie set...
Done!


### 4) Getting Ratings Ground Atoms

In [67]:
# MovieID / UserID of each rating written so far; one entry per rating, so values can repeat.
rated_movies = []
users_rating_movies = []
# Shared by the writer cells below. It must be closed (or flushed) before any
# external tool reads the database file, otherwise buffered atoms may be missing.
file = open(database_file, 'w')

def write_rating_predicate_to_file(rating, row, file):
    """Write one Rating ground atom unless the movie is banned.

    The numeric rating is bucketed according to num_rating_categories:
    3 buckets give Positive (>= 4), Indifferent (== 3) or Negative (<= 2);
    2 buckets give Positive (>= 4) or Negative (everything else).

    Args:
        rating: Numeric rating of this record.
        row: Rating record providing 'UserID' and 'MovieID'.
        file: Writable text file receiving the atom.

    Returns:
        1 if the atom was written, 0 if it was skipped because restrict_genres
        is enabled and the movie is in banned_movies.

    Raises:
        ValueError: If num_rating_categories is neither 2 nor 3.
    """
    global databaseObj
    if num_rating_categories == 3:
        if rating >= 4:
            rating = 'Positive'
        elif rating == 3:
            rating = 'Indifferent'
        elif rating <= 2:
            rating = 'Negative'
    elif num_rating_categories == 2:
        rating = 'Positive' if rating >= 4 else 'Negative'
    else:
        raise ValueError('number_of_rating_categories ({}) not supported'.format(num_rating_categories))

    movie_id = row['MovieID']
    user_id = row['UserID']
    if restrict_genres and movie_id in banned_movies:
        return 0

    # Record who rated what so the later movie and user passes can filter on it.
    rated_movies.append(movie_id)
    users_rating_movies.append(user_id)

    arguments = f"U{user_id},M{movie_id},{rating}"
    file.write(f"Rating({arguments})\n")
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    new_row = pd.DataFrame([{'predicate': 'Rating', 'arguments': arguments}])
    databaseObj = pd.concat([databaseObj, new_row], ignore_index=True)
    return 1

def write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(data_frame, file):
    """Write Rating atoms row by row until number_of_ratings have been written.

    Rows skipped by write_rating_predicate_to_file (it returns 0 for them) do
    not count towards the limit.

    Args:
        data_frame: Ratings table with 'UserID', 'MovieID' and 'Rating' columns.
        file: Writable text file receiving the atoms.
    """
    rating_number = 0
    for _, row in data_frame.iterrows():
        # Stop once the quota is met instead of walking the rest of the table;
        # the rows written are the same either way.
        if rating_number >= number_of_ratings:
            break
        rating_number += write_rating_predicate_to_file(row['Rating'], row, file)

print('Getting ratings ground atoms...')
# Writes the Rating atoms and fills rated_movies / users_rating_movies, which the movie and user passes filter on.
write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(ratings_data_frame, file)
print('Done!')

Getting ratings ground atoms...
Done!


### 5) Get Movie Ground Atoms

In [68]:
print('Getting movie ground atoms...')
# Second pass over the movie table, this time with output enabled (suppress_output defaults to False).
write_movie_ground_atoms_to_file(movie_data_frame, file)
print('Done!')

Getting movie ground atoms...
Done!


### 6) Get User Ground Atoms

In [69]:
def write_user_gender_predicate_to_file(gender, userID, file):
    """Write the gender ground atom for one user and mirror it in databaseObj.

    With alternative_predicate_types the atom is Gender(U<id>,Male|Female);
    otherwise it is Male(U<id>) or Female(U<id>). A gender code other than
    'M' or 'F' writes nothing.

    Args:
        gender: Gender code of the user.
        userID: User identifier; written with a 'U' prefix.
        file: Writable text file receiving the atom.
    """
    global databaseObj
    gender_name = {'M': 'Male', 'F': 'Female'}.get(gender)
    if gender_name is None:
        return

    if alternative_predicate_types:
        predicate, arguments = 'Gender', f"U{userID},{gender_name}"
    else:
        predicate, arguments = gender_name, f"U{userID}"

    file.write(f"{predicate}({arguments})\n")
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    new_row = pd.DataFrame([{'predicate': predicate, 'arguments': arguments}])
    databaseObj = pd.concat([databaseObj, new_row], ignore_index=True)
            
def write_user_age_predicate_to_file(age, userID, file):
    """Write the age ground atom for one user and mirror it in databaseObj.

    Age codes are bucketed according to num_age_categories:
    3 buckets: 1/18/25 -> Youthful, 35/45/50 -> MiddleAged, 56 -> Old;
    2 buckets: 1/18/25/35 -> Young, 45/50/56 -> Old.
    With alternative_predicate_types the atom is Age(U<id>,<bucket>);
    otherwise it is <bucket>(U<id>). An age code outside these lists writes
    nothing.

    Args:
        age: Age code of the user.
        userID: User identifier; written with a 'U' prefix.
        file: Writable text file receiving the atom.

    Raises:
        ValueError: If num_age_categories is neither 2 nor 3.
    """
    global databaseObj
    if num_age_categories == 3:
        buckets = (
            ((1, 18, 25), 'Youthful'),
            ((35, 45, 50), 'MiddleAged'),
            ((56,), 'Old'),
        )
    elif num_age_categories == 2:
        buckets = (
            ((1, 18, 25, 35), 'Young'),
            ((45, 50, 56), 'Old'),
        )
    else:
        raise ValueError('number_of_age_categories ({}) not supported'.format(num_age_categories))

    # The bucket choice is the same for both predicate styles, so resolve it once.
    age_label = next((label for codes, label in buckets if age in codes), None)
    if age_label is None:
        return

    if alternative_predicate_types:
        predicate, arguments = 'Age', f"U{userID},{age_label}"
    else:
        predicate, arguments = age_label, f"U{userID}"

    file.write(f"{predicate}({arguments})\n")
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    new_row = pd.DataFrame([{'predicate': predicate, 'arguments': arguments}])
    databaseObj = pd.concat([databaseObj, new_row], ignore_index=True)

def write_user_ground_atoms_to_file(data_frame, file):
    """Write gender and age atoms for every user who has a written rating.

    Args:
        data_frame: User table with 'UserID', 'Gender' and 'Age' columns.
        file: Writable text file receiving the atoms.
    """
    # users_rating_movies is a list with repeats; build the lookup set once
    # instead of scanning the list for every user row.
    rating_users = set(users_rating_movies)
    for _, row in data_frame.iterrows():
        gender = row['Gender']
        age = row['Age']
        userID = row['UserID']
        if userID not in rating_users:
            continue
        write_user_gender_predicate_to_file(gender, userID, file)
        write_user_age_predicate_to_file(age, userID, file)

print('Getting user ground atoms...')
# Only users present in users_rating_movies get gender and age atoms.
write_user_ground_atoms_to_file(user_data_frame, file)
print('Done!')

Getting user ground atoms...
Done!


### 7) Preview Database

In [70]:
# display() is not imported in the visible code; presumably it is supplied by the notebook runtime.
display(databaseObj)

Unnamed: 0,predicate,arguments
0,Rating,"U3529,M1997,Negative"
1,Rating,"U136,M3081,Positive"
2,Rating,"U1943,M1189,Positive"
3,Rating,"U3768,M2459,Negative"
4,Rating,"U208,M1327,Negative"
...,...,...
2599,Age,"U4651,Old"
2600,Gender,"U855,Female"
2601,Age,"U855,Young"
2602,Gender,"U5711,Male"


### 8) Structure Learning

In [97]:
# Base name for the learner's output files: the database file name without its '.db' suffix.
# str.rstrip('.db') would strip any trailing run of the characters '.', 'd' and 'b'
# (e.g. 'mydb.db' -> 'my'), so the literal suffix is removed explicitly instead.
save_file_name = database_file_name[:-len('.db')] if database_file_name.endswith('.db') else database_file_name
info_file = 'MovieLensMini.info'
type_file = 'MovieLensMini.type'
# Machine-specific absolute paths passed to learnMLN.sh; adjust them for the local checkout.
LSM_DIR = '/home/domphillips/MarkovLogic/lsmcode'
DATA_DIR = '/home/domphillips/MarkovLogic/MarkovLogicProject/MovieLens'
MLN_DIR = '/home/domphillips/MarkovLogic/MarkovLogicProject/MovieLens/MLNs'

def structure_learn():
    """Run ./learnMLN.sh through the shell with the module-level names and directories as arguments.

    Arguments are passed in this order: save_file_name, database_file_name,
    info_file, type_file, LSM_DIR, DATA_DIR, MLN_DIR.
    """
    script_arguments = (
        save_file_name,
        database_file_name,
        info_file,
        type_file,
        LSM_DIR,
        DATA_DIR,
        MLN_DIR,
    )
    command_line = ' '.join(
        ('./learnMLN.sh',) + tuple(f"{argument}" for argument in script_arguments)
    )
    # The exit status returned by subprocess.call is not inspected.
    subprocess.call(command_line, shell=True)

def print_MLN(log_file_name):
    """Append the current settings, a database sample and the learned rules to a log file.

    The log is opened in append mode in the current working directory. The
    rules are read from '<save_file_name>-rules-out.mln' in MLN_DIR; only lines
    whose first space-separated token parses as a float are copied.

    Args:
        log_file_name: Name of the log file, relative to the working directory.
    """
    log_path = os.path.join(os.getcwd(), log_file_name)
    rules_path = os.path.join(MLN_DIR, f"{save_file_name}-rules-out.mln")
    with open(log_path, "a") as log_file:
        log_file.write('================================\n')
        log_file.write('------------ CONFIG ------------\n')
        log_file.write(f'num_age_categories : {num_age_categories}\n')
        log_file.write(f'num_rating_categories : {num_rating_categories}\n')
        log_file.write(f'restrict_genres : {restrict_genres}\n')
        log_file.write(f'restricted_genres : {restricted_genres}\n')
        log_file.write(f'alternative_predicate_types : {alternative_predicate_types}\n')
        log_file.write(f'number_of_ratings : {number_of_ratings}\n')
        log_file.write('\n')
        log_file.write(f'database size: {len(databaseObj.index)}\n')
        log_file.write('\n')
        log_file.write('------- DATABASE SNIPPIT ------\n')
        # sample() raises ValueError when asked for more rows than exist,
        # so cap the snippet at the database size.
        snippet_size = min(20, len(databaseObj.index))
        databaseObj.sample(n=snippet_size).to_csv(path_or_buf=log_file)
        log_file.write('-------------- MLN -------------\n')
        with open(rules_path, "r") as rules_file:
            for line in rules_file:
                # Drop only a real trailing newline; a final line without one
                # must not lose its last character.
                if line.endswith('\n'):
                    line = line[:-1]
                first_token = line.split(' ')[0]
                try:
                    float(first_token)
                except ValueError:
                    # First token is not a number, so the line is not copied.
                    continue
                log_file.write(line + '\n')
            
print('Structure learning...')
# Close (and thereby flush) the database file first: the learner runs as a
# separate process and would otherwise miss atoms still sitting in the write buffer.
file.close()
structure_learn()
print('Done!')

Structure learning...
Done!


# Complete Pipeline

In [99]:
import itertools
import sys

# Log that print_MLN appends to after each hyperparameter setting.
log_file = 'MovieLens_StructureLearning.log'

# NOTE(review): unlike the Settings cell, this name has no '.db' suffix, and save_file_name
# is not recomputed here — confirm this is what learnMLN.sh expects.
database_file_name = 'MovieLensMini'
info_file = 'MovieLensMini.info'
type_file = 'MovieLensMini.type'
database_file = os.path.join(THIS_FOLDER, database_file_name)

# Candidate values for each setting; every combination is run.
num_age_categories_list =  [2,3]                                               
num_rating_categories_list = [2,3]                                               
restrict_genres_list = [True, False]                                                  
alternative_predicate_types_list = [True, False]                                                                       
number_of_ratings_list = [10,100,1000,3000,10000]  

# itertools.product returns a one-shot iterator without a length, so the
# progress-bar total is computed separately from the list sizes.
hyperparameters = itertools.product(num_age_categories_list, num_rating_categories_list, restrict_genres_list, alternative_predicate_types_list, number_of_ratings_list)
number_of_params = len(num_age_categories_list)*len(num_rating_categories_list)*len(restrict_genres_list)*len(alternative_predicate_types_list)*len(number_of_ratings_list)

# Run the full build-database -> structure-learn -> log cycle once per hyperparameter setting.
with tqdm(total=number_of_params, file=sys.stdout) as pbar:
    # The loop targets are the module-level settings read by the writer functions.
    for (num_age_categories, num_rating_categories, restrict_genres,
         alternative_predicate_types, number_of_ratings) in hyperparameters:

        # All per-run state is reset here, including the in-memory database:
        # otherwise the logged size and sample would mix rows from earlier settings.
        databaseObj = pd.DataFrame(columns=['predicate', 'arguments'])
        banned_movies = set()
        rated_movies = []
        users_rating_movies = []

        populate_banned_movies(movie_data_frame)

        # The with-block closes (and flushes) the database file before the
        # external learner reads it.
        with open(database_file, 'w') as file:
            write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(ratings_data_frame, file)
            write_movie_ground_atoms_to_file(movie_data_frame, file)
            write_user_ground_atoms_to_file(user_data_frame, file)

        structure_learn()
        print_MLN(log_file)

        pbar.update(1)

  4%|▍         | 3/80 [04:03<1:44:05, 81.11s/it]


KeyboardInterrupt: 