In [167]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [168]:
movies = pd.read_csv('datasets/ml-latest-small/movies.csv')
genres = movies['genres'].str.split('|', expand=True)

# One-hot encoding for genres
unique_genres = genres.stack().unique()

# create a column for each genre with true or false
genres = pd.get_dummies(genres, prefix='', prefix_sep='').groupby(level=0).sum()

# extract date from title (from the last bracket)
movies['year'] = movies['title'].str.extract('.*\\((.*)\\).*', expand=True)
movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

movies['year'] = movies['year'].apply(lambda x: 0 if np.isnan(x) else x)

# binarize date column
date = pd.DataFrame()
date['movieId'] = movies['movieId']
date = date.set_index('movieId')

date['NaN'] = movies['year'].astype(int).apply(lambda x: x == 0)
date['1900-1920'] = movies['year'].astype(int).apply(lambda x: x >= 1900 and x < 1920)
date['1920-1940'] = movies['year'].astype(int).apply(lambda x: x >= 1920 and x < 1940)
date['1940-1960'] = movies['year'].astype(int).apply(lambda x: x >= 1940 and x < 1960)
date['1960-1980'] = movies['year'].astype(int).apply(lambda x: x >= 1960 and x < 1980)
date['1980-2000'] = movies['year'].astype(int).apply(lambda x: x >= 1980 and x < 2000)
date['2020-2040'] = movies['year'].astype(int).apply(lambda x: x >= 2020)

movies = movies.drop('genres', axis=1).join(genres)
movies = movies.drop('title', axis=1)

In [169]:
ratings = pd.read_csv('datasets/ml-latest-small/ratings.csv')

# generate time of day from timestamp
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

ratings['hour'] = ratings['timestamp'].dt.hour
ratings.drop('timestamp', axis=1, inplace=True)

# one-hot from hour column
ratings = pd.get_dummies(ratings, prefix='', prefix_sep='').groupby(level=0).sum()

In [170]:
#combine movies with ratings on movieId
df = pd.merge(movies, ratings, on='movieId')
df = pd.merge(df, date, on='movieId')

# one hot encode the ratings
df['low'] = np.where(df['rating'] <= 2, True, False)
df['medium'] = np.where((df['rating'] > 2) & (df['rating'] < 4), True, False)
df['high'] = np.where(df['rating'] >= 4, True, False)

# one hot encode the hours
df['morning'] = np.where((df['hour'] >= 6) & (df['hour'] < 12), True, False)
df['afternoon'] = np.where((df['hour'] >= 12) & (df['hour'] < 18), True, False)
df['evening'] = np.where((df['hour'] >= 18) & (df['hour'] < 24), True, False)
df['night'] = np.where((df['hour'] >= 0) & (df['hour'] < 6), True, False)

#drop hour and userId columns
df.drop('userId', axis=1, inplace=True)
df.drop('hour', axis=1, inplace=True)

#drop rating column and movieId column
df.drop('rating', axis=1, inplace=True)
df.drop('movieId', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)

# Create frequent itemsets using the apriori algorithm
df.dropna(inplace=True)
df.fillna(0, inplace=True)
df = df.astype(bool)

#drop rows where df['NaN'] is true
df = df[df['NaN'] == False]
df.drop('NaN', axis=1, inplace=True)

frequent_itemsets = apriori(df, min_support=0.024, use_colnames=True)

# create the rules using support metric
rules = association_rules(frequent_itemsets)

# order the rules by antecedent support descending
rules = rules.sort_values('antecedent support', ascending=False)

# output csv with rules
rules.to_csv('rules.csv', sep='\t', index=False)
print(rules)

                       antecedents     consequents  antecedent support  \
0                      (Adventure)        (Action)            0.128911   
25          (Adventure, 1980-2000)        (Action)            0.071058   
27               (Adventure, high)        (Action)            0.060509   
16                         (Drama)       (Romance)            0.051423   
19                         (Drama)       (evening)            0.051423   
..                             ...             ...                 ...   
71              (Drama, 1980-2000)         (Crime)            0.027104   
93              (Drama, 1980-2000)  (high, Comedy)            0.027104   
39              (Drama, 1980-2000)        (Comedy)            0.027104   
24           (Adventure, Thriller)        (Action)            0.026886   
90  (Adventure, medium, 1980-2000)        (Action)            0.026270   

    consequent support   support  confidence       lift  leverage  conviction  \
0             0.285824  0.1289