# MovieLens Database

ReadME notes here

In [2]:
import os
import pandas as pd
from tqdm import tqdm
import subprocess

NUMBER_USERS = 6040
NUMBER_MOVIES = 3952
THIS_FOLDER = os.getcwd()

### 1) Import Dataframes

In [3]:
def read_movies_file(file_name):
    DataFrame = pd.read_csv(file_name, delimiter='::', names=['MovieID', 'MovieTitle', 'MovieGenres'], engine='python')
    return DataFrame 

def read_users_file(file_name):
    DataFrame = pd.read_csv(file_name, delimiter='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode'], engine='python')
    return DataFrame

def read_ratings_file(file_name):
    DataFrame = pd.read_csv(file_name, delimiter='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
    return DataFrame 

def import_data(directory):
    movies_file = os.path.join(directory, 'movies.dat')
    users_file = os.path.join(directory, 'users.dat')
    ratings_file = os.path.join(directory, 'ratings.dat')

    movie_data_frame = read_movies_file(movies_file)
    user_data_frame = read_users_file(users_file)
    ratings_data_frame = read_ratings_file(ratings_file)

    return movie_data_frame, user_data_frame, ratings_data_frame

def shuffle_data_frame(data_frame, seed):
    pass

print('Importing files...')
movie_data_frame, user_data_frame, ratings_data_frame = import_data(THIS_FOLDER)

#Shuffle the dataframes
movie_data_frame = movie_data_frame.sample(frac=1)
user_data_frame = user_data_frame.sample(frac=1)
ratings_data_frame = ratings_data_frame.sample(frac=1)
print('Done!')

Importing files...
Done!


### 2) Settings

In [4]:
num_age_categories =  2                                                 #either 2 or 3
num_rating_categories = 2                                               #either 2 or 3
restrict_genres = True                                                  #if True then only allow movies of restricted genre
restricted_genres = ['Romance', 'Horror', 'Animation', 'Documentary']   
alternative_predicate_types = True                                      #if True then Gender(user, gender) instead of Male(user), Female(user) etc
                                                                        #similarly for user age and for movie genre
number_of_ratings = 1000                                                #integer less than 1000210
database_file_name = 'MovieLensMini.db'
databaseObj = pd.DataFrame(columns=['predicate', 'arguments'])

database_file = os.path.join(THIS_FOLDER, database_file_name)

### 3) Populate Banned Movies (if restricting genres)

In [5]:
def write_movie_genre_predicate_to_file(genre, movieID, file, suppress_output):
    global databaseObj
    if not suppress_output and movieID in rated_movies:
        file.write(f"{genre}(M{movieID})\n")
        databaseObj = databaseObj.append({'predicate':genre, 'arguments':f"M{movieID}"}, ignore_index=True)

def write_movie_to_file(row, file, suppress_output):
    movies_added = 0
    movieID = row['MovieID']
    genres = row['MovieGenres'].split('|')
    for genre in genres:
        if restrict_genres:
            if genre in restricted_genres:
                write_movie_genre_predicate_to_file(genre, movieID, file, suppress_output)
                movies_added = 1
            else:
                banned_movies.add(movieID)
        else:
            write_movie_genre_predicate_to_file(genre, movieID, file, suppress_output)
    
    return movies_added

def write_movie_ground_atoms_to_file(data_frame, file, suppress_output = False):
    number_of_movies = 0
    for index,row in data_frame.iterrows():
        movies_added = write_movie_to_file(row, file, suppress_output)
        number_of_movies += movies_added
                
def populate_banned_movies(data_frame):
    write_movie_ground_atoms_to_file(data_frame, None, suppress_output = True)
    

print('Constructing banned movie set...')
banned_movies = set()
populate_banned_movies(movie_data_frame)
print('Done!')

Constructing banned movie set...
Done!


### 4) Getting Ratings Ground Atoms

In [6]:
rated_movies = []
users_rating_movies = []
file = open(database_file, 'w')

def write_rating_predicate_to_file(rating, row, file):
    global databaseObj
    if num_rating_categories == 3:
        if rating >=4:
            rating = 'Likes'
        elif rating == 3:
            rating = 'Indifferent'
        elif rating <= 2:
            rating = 'Dislikes'
    elif num_rating_categories == 2:
        if rating >=4:
            rating = 'Likes'
        else:
            rating = 'Dislikes'
    else:
        raise ValueError('number_of_rating_categories ({}) not supported'.format(num_rating_categories))
    if restrict_genres:
        if row['MovieID'] not in banned_movies:
            rated_movies.append(row['MovieID'])
            users_rating_movies.append(row['UserID'])
            file.write(f"{rating}(U{row['UserID']},M{row['MovieID']})\n")
            databaseObj = databaseObj.append({'predicate':rating, 'arguments':f"U{row['UserID']},M{row['MovieID']}"}, ignore_index=True)
            return 1
        else:
            return 0
    else:
        rated_movies.append(row['MovieID'])
        users_rating_movies.append(row['UserID'])
        file.write(f"{rating}(U{row['UserID']},M{row['MovieID']})\n")
        databaseObj = databaseObj.append({'predicate':rating, 'arguments':f"U{row['UserID']},M{row['MovieID']}"}, ignore_index=True)
        return 1

def write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(data_frame, file):
    rating_number = 0
    for index,row in data_frame.iterrows():
        if rating_number < number_of_ratings:
            rating = row['Rating']
            rating_obtained = write_rating_predicate_to_file(rating, row, file)
            rating_number += rating_obtained
        else:
            break

print('Getting ratings ground atoms...')
write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(ratings_data_frame, file)
print('Done!')

Getting ratings ground atoms...
Done!


### 5) Get Movie Ground Atoms

In [7]:
print('Getting movie ground atoms...')
write_movie_ground_atoms_to_file(movie_data_frame, file)
print('Done!')

Getting movie ground atoms...
Done!


### 6) Get User Ground Atoms

In [8]:
def write_user_gender_predicate_to_file(gender, userID, file):
    global databaseObj
    male = "M"
    female = "F"
    if gender == male:
        file.write(f"Male(U{userID})\n")
        databaseObj = databaseObj.append({'predicate':'Male', 'arguments':f"U{userID}"}, ignore_index=True)
    elif gender == female:
        pass
        #file.write(f"Female(U{userID})\n")
        #databaseObj = databaseObj.append({'predicate':'Female', 'arguments':f"U{userID}"}, ignore_index=True)
            
def write_user_age_predicate_to_file(age, userID, file):
    global databaseObj
    if num_age_categories == 3:
        if age in [1, 18, 25]:
            file.write(f"Young(U{userID})\n")
            databaseObj = databaseObj.append({'predicate':'Youthful', 'arguments':f"U{userID}"}, ignore_index=True)  
        elif age in [35, 45, 50]:
            file.write(f"MiddleAged(U{userID})\n")
            databaseObj = databaseObj.append({'predicate':'MiddleAged', 'arguments':f"U{userID}"}, ignore_index=True)  
        elif age == 56:
            file.write(f"Old(U{userID})\n")
            databaseObj = databaseObj.append({'predicate':'Old', 'arguments':f"U{userID}"}, ignore_index=True)  
    elif num_age_categories == 2:
        if age in [1, 18, 25, 35]:
            file.write(f"Young(U{userID})\n")
            databaseObj = databaseObj.append({'predicate':'Young', 'arguments':f"U{userID}"}, ignore_index=True)  
        elif age in [45, 50, 56]:
            pass
            #file.write(f"Old(U{userID})\n")
            #databaseObj = databaseObj.append({'predicate':'Old', 'arguments':f"U{userID}"}, ignore_index=True)  
    else:
        raise ValueError('number_of_age_categories ({}) not supported'.format(num_age_categories))

def write_user_ground_atoms_to_file(data_frame, file):
    for index,row in data_frame.iterrows():
        gender = row['Gender']
        age = row['Age']
        userID = row['UserID']
        if userID in users_rating_movies:
            write_user_gender_predicate_to_file(gender, userID, file)
            write_user_age_predicate_to_file(age, userID, file)

print('Getting user ground atoms...')
write_user_ground_atoms_to_file(user_data_frame, file)
print('Done!')

Getting user ground atoms...
Done!


### 7) Preview Database

In [9]:
display(databaseObj)

Unnamed: 0,predicate,arguments
0,Dislikes,"U3280,M1191"
1,Dislikes,"U4169,M1666"
2,Likes,"U5551,M1345"
3,Likes,"U1611,M1975"
4,Dislikes,"U278,M1971"
...,...,...
2379,Male,U839
2380,Young,U839
2381,Male,U4067
2382,Male,U173


### 8) Structure Learning

In [10]:
save_file_name = database_file_name.rstrip('.db')
LSM_DIR = '/home/domphillips/MarkovLogic/lsmcode'
DATA_DIR = '/home/domphillips/MarkovLogic/MarkovLogicProject/MovieLens'
MLN_DIR = '/home/domphillips/MarkovLogic/MarkovLogicProject/MovieLens/MLNs'

def structure_learn(info_file, type_file):
    structure_learn_MLN_command = f"./learnMLN.sh {save_file_name} {database_file_name} {info_file} {type_file} {LSM_DIR} {DATA_DIR} {MLN_DIR}"
    subprocess.call(structure_learn_MLN_command,shell=True)

def print_MLN(log_file_name, info_file, type_file):
    with open(os.path.join(os.getcwd(), log_file_name), "a") as log_file:
        log_file.write(f'============================================================================================\n')
        log_file.write(f'------------ CONFIG ------------\n')
        log_file.write(f'num_age_categories : {num_age_categories}\n')
        log_file.write(f'num_rating_categories : {num_rating_categories}\n')
        log_file.write(f'restrict_genres : {restrict_genres}\n')
        if restrict_genres:
            log_file.write(f'restricted_genres : {restricted_genres}\n')
        log_file.write(f'remove_redundant_binary_predicate : True\n')
        log_file.write(f'number_of_ratings : {number_of_ratings}\n')
        log_file.write('\n')
        log_file.write(f'database size: {len(databaseObj.index)}\n')
        log_file.write(f'------------ TYPES -------------\n')
        with open(os.path.join(os.getcwd(),type_file), "r") as t_file:
            for line in t_file.readlines():
                log_file.write(line)
        log_file.write('\n')
        log_file.write(f'------------  INFO -------------\n')
        with open(os.path.join(os.getcwd(),info_file), "r") as i_file:
            for line in i_file.readlines():
                log_file.write(line)
        log_file.write('\n')
        log_file.write('\n')
        log_file.write(f'------- DATABASE SNIPPIT ------\n')
        databaseObj.sample(n=10).to_csv(path_or_buf = log_file)
        log_file.write('-------------- MLN -------------\n')
        with open(os.path.join(MLN_DIR, f"{save_file_name}-rules-out.mln"), "r") as f:
            for line in f.readlines():
                line = line[0:-1]
                line_frag = line.split(' ')
                try:
                    float(line_frag[0])
                    log_file.write(line+'\n')
                except:
                    pass
            
print('Structure learning...')
structure_learn(info_file, type_file)
print('Done!')

Structure learning...


NameError: name 'info_file' is not defined

# Complete Pipeline

In [13]:
import itertools
import sys

def get_info_and_type_files(restrict_genres, number_age_categories, number_rating_categories, remove_redundant_binary_predicate):
    type_file = 'MovieLens.type'
    if restrict_genres:
        if number_age_categories == 2:
            if number_rating_categories == 2:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_rg_age2_rating2_nonbi.info'
                else:
                    info_file = 'MovieLens_rg_age2_rating2.info'
            elif number_rating_categories == 3:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_rg_age2_rating3_nonbi.info'
                else:
                    info_file = 'MovieLens_rg_age2_rating3.info'
            else:
                raise ValueError('number of rating categories must be either 2 or 3')
        elif number_age_categories == 3:
            if number_rating_categories == 2:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_rg_age3_rating2_nonbi.info'
                else:
                    info_file = 'MovieLens_rg_age3_rating2.info'
            elif number_rating_categories == 3:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_rg_age3_rating3_nonbi.info'
                else:
                    info_file = 'MovieLens_rg_age3_rating3.info'
            else:
                raise ValueError('number of rating categories must be either 2 or 3')
        else:
            raise ValueError('number of age categories must be either 2 or 3')
    else:
        if number_age_categories == 2:
            if number_rating_categories == 2:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_age2_rating2_nonbi.info'
                else:
                    info_file = 'MovieLens_age2_rating2.info'
            elif number_rating_categories == 3:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_age2_rating3_nonbi.info'
                else:
                    info_file = 'MovieLens_age2_rating3.info'
            else:
                raise ValueError('number of rating categories must be either 2 or 3')
        elif number_age_categories == 3:
            if number_rating_categories == 2:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_age3_rating2_nonbi.info'
                else:
                    info_file = 'MovieLens_age3_rating2.info'
            elif number_rating_categories == 3:
                if remove_redundant_binary_predicate:
                    info_file = 'MovieLens_age3_rating3_nonbi.info'
                else:
                    info_file = 'MovieLens_age3_rating3.info'
            else:
                raise ValueError('number of rating categories must be either 2 or 3')
        else:
            raise ValueError('number of age categories must be either 2 or 3')
            file
    return info_file, type_file

log_file = 'MovieLens_StructureLearning.log'

database_file_name = 'MovieLens.db'
database_file = os.path.join(THIS_FOLDER, database_file_name)
info_file_directory = '/InfoFiles'

num_age_categories_list =  [2]                                               
num_rating_categories_list = [2]                                               
restrict_genres_list = [False]                                                                                                                        
number_of_ratings_list = [1000,3000,10000]  

hyperparameters = itertools.product(number_of_ratings_list, num_age_categories_list, num_rating_categories_list, restrict_genres_list)
number_of_params = len(num_age_categories_list)*len(num_rating_categories_list)*len(restrict_genres_list)*len(number_of_ratings_list)

with tqdm(total = number_of_params, file=sys.stdout) as pbar:
    for hyperparameter_setting in hyperparameters:
        with open(database_file, 'w') as file:
            databaseObj = pd.DataFrame(columns=['predicate', 'arguments'])

            banned_movies = set()
            rated_movies = []
            users_rating_movies = []

            number_of_ratings = hyperparameter_setting[0]  
            num_age_categories = hyperparameter_setting[1]
            num_rating_categories = hyperparameter_setting[2]    
            restrict_genres = hyperparameter_setting[3]    

            info_file, type_file = get_info_and_type_files(restrict_genres, num_age_categories, num_rating_categories, True)
            info_file = os.path.join(info_file_directory, info_file)
            type_file = os.path.join(info_file_directory, type_file)
            
            populate_banned_movies(movie_data_frame)
            pbar.update(0.2)
            write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(ratings_data_frame, file)
            pbar.update(0.2)
            write_movie_ground_atoms_to_file(movie_data_frame, file)
            pbar.update(0.2)
            write_user_ground_atoms_to_file(user_data_frame, file)
            pbar.update(0.2)
        structure_learn(info_file, type_file)
        print_MLN(log_file, info_file, type_file)
        pbar.update(0.2)
    

 27%|██▋       | 0.8/3 [4:04:58<11:13:41, 18373.51s/it]       


FileNotFoundError: [Errno 2] No such file or directory: '/InfoFiles/MovieLens.type'

In [135]:
databaseObj = pd.DataFrame(columns=['predicate', 'arguments'])

banned_movies = set()
rated_movies = []
users_rating_movies = []
with open(database_file, 'w') as file:
    num_age_categories = 3
    num_rating_categories = 3   
    restrict_genres = False  
    alternative_predicate_types = False  
    number_of_ratings = 250

    info_file, type_file = get_info_and_type_files(restrict_genres, num_age_categories, alternative_predicate_types)

    populate_banned_movies(movie_data_frame)
    write_ratings_ground_atoms_to_file_and_populate_rated_movies_and_users_rating_movies_arrays(ratings_data_frame, file)
    write_movie_ground_atoms_to_file(movie_data_frame, file)
    write_user_ground_atoms_to_file(user_data_frame, file)
    display(databaseObj)
    structure_learn(info_file, type_file)
    print_MLN(log_file, info_file, type_file)

Unnamed: 0,predicate,arguments
0,Rating,"U551,M1748,Positive"
1,Rating,"U4789,M110,Indifferent"
2,Rating,"U4258,M517,Indifferent"
3,Rating,"U4819,M2794,Indifferent"
4,Rating,"U3605,M2502,Positive"
...,...,...
1193,Youthful,U5530
1194,Male,U2225
1195,Youthful,U2225
1196,Male,U869


movie

user

rating

what


MovieLens.info
movie

user

rating



In [27]:
structure_learn(info_file, type_file)
print_MLN(log_file, info_file, type_file)

### Inference On The MLN

In [15]:
mln_file_name = 'MovieLens500.mln'
database_file = 'MovieLens500.db'
results_file = 'MovieLens500Inference'
INFER_DIR = '/home/domphillips/MarkovLogic/alchemy-2/bin'
query_predicates = 'Likes;Dislikes'

inference_command = f'{INFER_DIR}/infer -i {mln_file_name} -r {results_file} -e {database_file} -q {query_predicates}'
subprocess.call(inference_command,shell=True)

127