In [106]:
import ast
import gc
import json

import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [107]:
# Let pandas render up to 100 columns when displaying a DataFrame.
pd.set_option("display.max_columns", 100)

## 读取数据

In [108]:
# Load the raw movie metadata.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning
# (NOTE(review): the warning residue left under this cell looks like that
# warning -- confirm on a fresh run).
movies_metadata = pd.read_csv("../data/movies_metadata.csv", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [109]:
# Keep only the title, id and raw genres columns.
# A list (not a set) is used as the column indexer: set indexers are
# deprecated in pandas 1.5 and raise TypeError in pandas 2.0, and a set gives
# no guaranteed column order.
# .copy() makes `movies` an independent frame, so later column changes do not
# target a slice of `movies_metadata`.
movies = movies_metadata[['title', 'id', 'genres']].copy()

# Drop the reference to the full metadata frame and run the garbage collector.
del movies_metadata
gc.collect()

movies

Unnamed: 0,title,id,genres
0,Toy Story,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,15602,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,31357,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,11862,"[{'id': 35, 'name': 'Comedy'}]"
...,...,...,...
45461,Subdue,439050,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n..."
45462,Century of Birthing,111109,"[{'id': 18, 'name': 'Drama'}]"
45463,Betrayal,67758,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam..."
45464,Satan Triumphant,227506,[]


## 制作数据集

In [110]:
# String-processing helper (originally written by GPT-4).
# Converts the raw genres field into a '|'-separated string of genre names.

def genres2genre(str):
    """Turn a stringified list of genre dicts into ``'Name1|Name2|...'``.

    The input looks like ``"[{'id': 16, 'name': 'Animation'}, ...]"``, i.e. a
    Python-literal representation of a list of dictionaries.

    Parameters
    ----------
    str : str
        Text holding a list of dicts, each with a ``'name'`` key. The
        parameter name shadows the built-in ``str``; it is kept unchanged so
        existing keyword callers keep working.

    Returns
    -------
    str
        The ``'name'`` values joined with ``'|'``; an empty string when the
        list is empty.

    Raises
    ------
    ValueError
        If the text is neither a valid Python literal nor valid JSON after
        quote replacement (``json.JSONDecodeError`` is a ``ValueError``).
    KeyError
        If an entry has no ``'name'`` key.
    """
    try:
        # Parse the text as a Python literal. Unlike blindly swapping quote
        # characters, this keeps values containing an apostrophe
        # (e.g. "Children's") intact.
        entries = ast.literal_eval(str)
    except (ValueError, SyntaxError):
        # Fallback to the original strategy for JSON-style input that is not
        # a Python literal (e.g. it contains null/true/false).
        entries = json.loads(str.replace("'", '"'))

    # Collect the 'name' of every entry and join them with '|'.
    return '|'.join(entry['name'] for entry in entries)

In [111]:
# Convert the raw genres strings into an easier-to-process '|'-separated form.
# A new frame is built with assign/drop instead of setting a column and
# dropping in place: `movies` was selected out of another frame, and writing
# into such a selection triggers pandas' SettingWithCopyWarning.
movies = (
    movies
    .assign(genre=movies['genres'].apply(genres2genre))
    .drop(columns='genres')
)
movies

Unnamed: 0,title,id,genre
0,Toy Story,862,Animation|Comedy|Family
1,Jumanji,8844,Adventure|Fantasy|Family
2,Grumpier Old Men,15602,Romance|Comedy
3,Waiting to Exhale,31357,Comedy|Drama|Romance
4,Father of the Bride Part II,11862,Comedy
...,...,...,...
45461,Subdue,439050,Drama|Family
45462,Century of Birthing,111109,Drama
45463,Betrayal,67758,Action|Drama|Thriller
45464,Satan Triumphant,227506,


In [112]:
# One-hot encode the movie genres: split the '|'-separated genre string into
# one 0/1 indicator column per distinct name, then remove the text column.
movies = (
    movies
    .join(movies['genre'].str.get_dummies(sep='|'))
    .drop(columns='genre')
)
movies

Unnamed: 0,title,id,Action,Adventure,Animation,Aniplex,BROSTA TV,Carousel Productions,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,GoHands,History,Horror,Mardock Scramble Production Committee,Music,Mystery,Odyssey Media,Pulser Productions,Rogue State,Romance,Science Fiction,Sentai Filmworks,TV Movie,Telescene Film Group Productions,The Cartel,Thriller,Vision View Entertainment,War,Western
0,Toy Story,862,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Jumanji,8844,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,15602,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Waiting to Exhale,31357,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Father of the Bride Part II,11862,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,Subdue,439050,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
45462,Century of Birthing,111109,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
45463,Betrayal,67758,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
45464,Satan Triumphant,227506,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 关联分析

In [113]:
# Mine the frequent itemsets (minimum support 0.01) from the one-hot columns.
# - The columns to drop are given as a list rather than a set, which is the
#   conventional, order-stable form for pandas labels.
# - The 0/1 indicators are cast to bool, the input type mlxtend recommends
#   for apriori (recent versions warn on non-bool frames).
frequent_itemsets_movies = apriori(
    movies.drop(columns=['title', 'id']).astype(bool),
    use_colnames=True,
    min_support=0.01,
)

In [114]:
# Display the frequent itemsets returned by apriori.
frequent_itemsets_movies

Unnamed: 0,support,itemsets
0,0.145075,(Action)
1,0.076893,(Adventure)
2,0.042559,(Animation)
3,0.289931,(Comedy)
4,0.094730,(Crime)
...,...,...
70,0.016870,"(Action, Crime, Thriller)"
71,0.019157,"(Action, Drama, Thriller)"
72,0.030836,"(Drama, Romance, Comedy)"
73,0.025821,"(Drama, Crime, Thriller)"


In [117]:
# Derive association rules from the frequent itemsets, using lift as the
# evaluation metric with a minimum threshold of 1.25.
rules_movies = association_rules(
    frequent_itemsets_movies,
    metric='lift',
    min_threshold=1.25,
)

In [118]:
# Display the association rules derived from the frequent itemsets.
rules_movies

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Action),(Adventure),0.145075,0.076893,0.038116,0.262735,3.416908,0.026961,1.252070,0.827369
1,(Adventure),(Action),0.076893,0.145075,0.038116,0.495709,3.416908,0.026961,1.695301,0.766257
2,(Action),(Crime),0.145075,0.094730,0.030088,0.207398,2.189361,0.016345,1.142150,0.635431
3,(Crime),(Action),0.094730,0.145075,0.030088,0.317622,2.189361,0.016345,1.252862,0.600093
4,(Action),(Fantasy),0.145075,0.050873,0.011019,0.075955,1.493029,0.003639,1.027144,0.386257
...,...,...,...,...,...,...,...,...,...,...
77,(Thriller),"(Drama, Crime)",0.167686,0.055536,0.025821,0.153987,2.772749,0.016509,1.116371,0.768156
78,"(Drama, Thriller)",(Mystery),0.075375,0.054260,0.015594,0.206886,3.812850,0.011504,1.192439,0.797868
79,"(Drama, Mystery)",(Thriller),0.025887,0.167686,0.015594,0.602379,3.592309,0.011253,2.093235,0.740805
80,(Thriller),"(Drama, Mystery)",0.167686,0.025887,0.015594,0.092996,3.592309,0.011253,1.073989,0.867013


In [125]:
# Keep only the rules whose lift is greater than 3, ordered from the highest
# lift to the lowest.
rules_movies_lift3 = (
    rules_movies
    .loc[rules_movies['lift'].gt(3)]
    .sort_values(by='lift', ascending=False)
)
rules_movies_lift3

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
19,(Family),(Animation),0.060925,0.042559,0.018849,0.309386,7.269538,0.016256,1.386362,0.918392
18,(Animation),(Family),0.042559,0.060925,0.018849,0.442894,7.269538,0.016256,1.685632,0.900776
38,(Fantasy),(Family),0.050873,0.060925,0.013483,0.265024,4.350026,0.010383,1.277695,0.811395
39,(Family),(Fantasy),0.060925,0.050873,0.013483,0.2213,4.350026,0.010383,1.21886,0.820079
15,(Fantasy),(Adventure),0.050873,0.076893,0.015,0.294855,3.834635,0.011088,1.309103,0.778841
14,(Adventure),(Fantasy),0.076893,0.050873,0.015,0.19508,3.834635,0.011088,1.179157,0.800794
81,(Mystery),"(Drama, Thriller)",0.05426,0.075375,0.015594,0.287394,3.81285,0.011504,1.297526,0.780055
78,"(Drama, Thriller)",(Mystery),0.075375,0.05426,0.015594,0.206886,3.81285,0.011504,1.192439,0.797868
12,(Adventure),(Family),0.076893,0.060925,0.017244,0.224256,3.68088,0.012559,1.210548,0.788994
13,(Family),(Adventure),0.060925,0.076893,0.017244,0.283032,3.68088,0.012559,1.287516,0.775578


## 保存数据

In [69]:
# Persist the frequent itemsets to CSV (row index omitted).
frequent_itemsets_movies.to_csv(
    path_or_buf='../data/frequent_itemsets_movies.csv',
    index=False,
)
# Persist the high-lift rules to CSV (row index omitted).
rules_movies_lift3.to_csv(
    path_or_buf='../data/rules_movies_lift3.csv',
    index=False,
)