In [21]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [22]:
movies = pd.read_csv('datasets/ml-latest-small/movies.csv')
genres = movies['genres'].str.split('|', expand=True)

# One-hot encoding for genres
unique_genres = genres.stack().unique()

# create a column for each genre with true or false
genres = pd.get_dummies(genres, prefix='', prefix_sep='').groupby(level=0).sum()

# extract date from title (from the last bracket)
movies['year'] = movies['title'].str.extract('.*\\((.*)\\).*', expand=True)
movies['year'] = pd.to_numeric(movies['year'], errors='coerce')

movies['year'] = movies['year'].apply(lambda x: 0 if np.isnan(x) else x)

# binarize date column
date = pd.DataFrame()
date['movieId'] = movies['movieId']
date = date.set_index('movieId')

date['NaN'] = movies['year'].astype(int).apply(lambda x: x == 0)
date['1900-1920'] = movies['year'].astype(int).apply(lambda x: x >= 1900 and x < 1920)
date['1920-1940'] = movies['year'].astype(int).apply(lambda x: x >= 1920 and x < 1940)
date['1940-1960'] = movies['year'].astype(int).apply(lambda x: x >= 1940 and x < 1960)
date['1960-1980'] = movies['year'].astype(int).apply(lambda x: x >= 1960 and x < 1980)
date['1980-2000'] = movies['year'].astype(int).apply(lambda x: x >= 1980 and x < 2000)
date['2020-2040'] = movies['year'].astype(int).apply(lambda x: x >= 2020)

movies = movies.drop('genres', axis=1).join(genres)
movies = movies.drop('title', axis=1)

In [23]:
ratings = pd.read_csv('datasets/ml-latest-small/ratings.csv')

# generate time of day from timestamp
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')

ratings['hour'] = ratings['timestamp'].dt.hour
ratings.drop('timestamp', axis=1, inplace=True)

# one-hot from hour column
ratings = pd.get_dummies(ratings, prefix='', prefix_sep='').groupby(level=0).sum()

In [24]:
#combine movies with ratings on movieId
df = pd.merge(movies, ratings, on='movieId')
df = pd.merge(df, date, on='movieId')

# one hot encode the ratings
df['Low Score'] = np.where(df['rating'] <= 2, True, False)
df['Medium Score'] = np.where((df['rating'] > 2) & (df['rating'] < 4), True, False)
df['High Score'] = np.where(df['rating'] >= 4, True, False)

# one hot encode the hours
df['Morning'] = np.where((df['hour'] >= 6) & (df['hour'] < 12), True, False)
df['Afternoon'] = np.where((df['hour'] >= 12) & (df['hour'] < 18), True, False)
df['Evening'] = np.where((df['hour'] >= 18) & (df['hour'] < 24), True, False)
df['Night'] = np.where((df['hour'] >= 0) & (df['hour'] < 6), True, False)

#drop useless columns
df.drop('userId', axis=1, inplace=True)
df.drop('hour', axis=1, inplace=True)
df.drop('rating', axis=1, inplace=True)
df.drop('movieId', axis=1, inplace=True)
df.drop('year', axis=1, inplace=True)

# Create frequent itemsets using the apriori algorithm
df.dropna(inplace=True)
df.fillna(0, inplace=True)
df = df.astype(bool)

#drop rows where df['NaN'] is true
df = df[df['NaN'] == False]
df.drop('NaN', axis=1, inplace=True)

frequent_itemsets = apriori(df, min_support=0.024, use_colnames=True)

# create the rules using support metric
rules = association_rules(frequent_itemsets)

# order the rules by antecedent support descending
rules = rules.sort_values('antecedent support', ascending=False)

# output csv with rules
rules.to_csv('rules.csv', sep='\t', index=False)
print(rules)

                             antecedents           consequents  \
0                            (Adventure)              (Action)   
25                (1980-2000, Adventure)              (Action)   
27               (High Score, Adventure)              (Action)   
16                               (Drama)             (Romance)   
42                               (Drama)   (1980-2000, Comedy)   
..                                   ...                   ...   
71                    (1980-2000, Drama)               (Crime)   
94                    (1980-2000, Drama)  (High Score, Comedy)   
40                    (1980-2000, Drama)              (Comedy)   
24                 (Adventure, Thriller)              (Action)   
90  (1980-2000, Adventure, Medium Score)              (Action)   

    antecedent support  consequent support   support  confidence       lift  \
0             0.128911            0.285824  0.128911    1.000000   3.498653   
25            0.071058            0.285824  0.071