# 豆瓣电影图谱
1. 从豆瓣爬取电影与书籍信息
2. 将爬取到的json数据转化为MySQL并存储
3. 将 Mysql 数据转化为 CSV 数据
4. 将 CSV 数据导入 Neo4j

In [1]:
from pathlib import Path
import pymysql
import json
import pandas as pd

## 爬取豆瓣电影与书籍详细信息

爬虫参考 github link
+ [https://github.com/weizhixiaoyi/DouBan-Spider](https://github.com/weizhixiaoyi/DouBan-Spider)
+ [https://github.com/Jack-Cherish/python-spider](https://github.com/Jack-Cherish/python-spider)
+ [https://github.com/facert/awesome-spider](https://github.com/facert/awesome-spider)
+ [https://github.com/binux/pyspider](https://github.com/binux/pyspider)
+ [https://github.com/wistbean/learn_python3_spider](https://github.com/wistbean/learn_python3_spider)

In [2]:
root_dir = Path('../data/movie_kg/txt')
for file in root_dir.glob('*'):
    print(f'path_name:{file} \nname:\t{file.name}\n')

path_name:../data/movie_kg/txt/small_movie_person_info.txt 
name:	small_movie_person_info.txt

path_name:../data/movie_kg/txt/.ipynb_checkpoints 
name:	.ipynb_checkpoints

path_name:../data/movie_kg/txt/small_movie_info.txt 
name:	small_movie_info.txt



## Explore Data Analysis

In [3]:
count = 0
while True:
    with open(root_dir / "small_movie_info.txt", "r") as f:
        line = f.readline()
        print(line)
        if count > 3:
            break
        count += 1

{"id": "26425063", "image_url": "https://img3.doubanio.com/view/photo/s_ratio_poster/public/p2535260806.webp", "name": "无双 無雙", "directors": [{"name": "庄文强", "id": "1014716"}], "writers": [{"name": "庄文强", "id": "1014716"}], "actors": [{"name": "周润发", "id": "1044899"}, {"name": "郭富城", "id": "1041390"}, {"name": "张静初", "id": "1016668"}, {"name": "冯文娟", "id": "1325753"}, {"name": "廖启智", "id": "1274268"}, {"name": "周家怡", "id": "1321491"}, {"name": "王耀庆", "id": "1316330"}, {"name": "方中信", "id": "1033102"}, {"name": "高捷", "id": "1276076"}, {"name": "邢佳栋", "id": "1274762"}, {"name": "张松枝", "id": "1339097"}, {"name": "张建声", "id": "1323927"}, {"name": "吴嘉龙", "id": "1214684"}, {"name": "孙佳君", "id": "1050664"}], "genres": ["剧情", "动作", "犯罪"], "countries": ["中国大陆", "香港"], "languages": ["汉语普通话", "粤语", "英语", "泰语", "波兰语"], "pubdates": ["2018-09-30(中国大陆)", "2018-10-04(香港)"], "episodes": "1", "durations": ["130分钟"], "other_names": ["ProjectGutenberg", "Moseung"], "summary": "身陷囹圄的李问（郭富城饰）被引渡回港，他原本隶属于一个的

- 电影信息(**small_movie_info.txt**)包括电影id、图片链接、名称、导演名称、编剧名称、主演名称、类型、制片国家、语言、上映日期、片长、季数、集数、其他名称、剧情简介、评分、评分人数，共67245条数据信息。虽说是电影信息，但其中也包括电视剧、综艺、动漫、纪录片、短片。

- 电影演员(**small_movie_person_info.txt**)信息包括演员id、姓名、图片链接、性别、星座、出生日期、出生地、职业、更多中文名、更多外文名、-家庭成员、简介，共89592条数据信息。这里所指的演员包括电影演员、编剧、导演。

## 提取电影类别, 并存入到mysql之中
transfrom json to sql

创建 database 

```sh
mysql -u root -p

CREATE DATABASE douban_kg;
```

In [4]:
# 连接mysql数据库

kg_conn = pymysql.connect(
    host='localhost',
    port=3306,
    user='root',
    password='fengdage',
    db='movie_kg',
    cursorclass=pymysql.cursors.DictCursor
)

In [5]:
# 读取文件
movie_file_path = '../data/movie_kg/txt/small_movie_info.txt'
movie_str_list = open(movie_file_path, 'r').readlines()
movie_json_list = [json.loads(movie) for movie in movie_str_list]

In [6]:
# 电影类型
movie_genres = set()
for movie in movie_json_list:
    for genre in movie['genres']:
        movie_genres.add(genre)
movie_genres = list(movie_genres)
movie_genres.sort(key=lambda i: len(i))
print(movie_genres)

['家庭', '爱情', '音乐', '灾难', '悬疑', '喜剧', '科幻', '短片', '奇幻', '古装', '歌舞', '惊悚', '剧情', '传记', '西部', '儿童', '历史', '同性', '情色', '运动', '武侠', '犯罪', '动画', '战争', '动作', '冒险', '恐怖', '纪录片']


## 存储信息到表`movie_genere`之中

#### 创建表

In [7]:
# sql = "DROP TABLE  movie_genre;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

sql = '''
create table movie_genre
(
  movie_genre_id   int         not null
    primary key,
  movie_genre_name varchar(20) not null
);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

#### 数据插入

In [8]:
try:
    genre_id = 1
    with kg_conn.cursor() as cursor:
        # Insert Data
        for genre in movie_genres:
            sql = "INSERT INTO `movie_genre` (`movie_genre_id`, `movie_genre_name`) VALUES (%s, %s)"
            cursor.execute(sql, (genre_id, genre))
            genre_id += 1
        kg_conn.commit()
except Exception as err:
    print('movie_genres数据插入错误' + str(err))

#### 测试插入结果

In [9]:
sql = 'SELECT * FROM movie_genre LIMIT 5;'
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchall()
result

[{'movie_genre_id': 1, 'movie_genre_name': '家庭'},
 {'movie_genre_id': 2, 'movie_genre_name': '爱情'},
 {'movie_genre_id': 3, 'movie_genre_name': '音乐'},
 {'movie_genre_id': 4, 'movie_genre_name': '灾难'},
 {'movie_genre_id': 5, 'movie_genre_name': '悬疑'}]

## 存储信息到表`movie_info`之中

In [10]:
# 删除表
# sql = "DROP TABLE  movie_info;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

# 创建表
sql = '''
create table movie_info
(
  movie_info_id           int          not null
    primary key,
  movie_info_name         text         null,
  movie_info_image_url    varchar(200) null,
  movie_info_country      varchar(200) null,
  movie_info_language     varchar(200) null,
  movie_info_pubdate      varchar(200) null,
  movie_info_duration     varchar(200) null,
  movie_info_other_name   text         null,
  movie_info_summary      mediumtext   null,
  movie_info_rating       varchar(10)  null,
  movie_info_review_count varchar(200) null
);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [11]:
# 存储信息到mysql数据库之中
try:
    with kg_conn.cursor() as cursor:
        for movie in movie_json_list:
            movie_info_id = movie['id']
            movie_info_image_url = movie['image_url']
            movie_info_name = movie['name']
            movie_info_country = ','.join(movie['countries'])
            movie_info_language = ','.join(movie['languages'])
            movie_info_pubdate = ','.join(movie['pubdates'])
            movie_info_duration = ','.join(movie['durations'])
            movie_info_other_name = ','.join(movie['other_names'])
            movie_info_summary = movie['summary']
            movie_info_rating = movie['rating']['average']
            movie_info_review_count = movie['rating']['reviews_count']

            sql = "INSERT INTO `movie_info` (`movie_info_id`, `movie_info_image_url`, `movie_info_name`, `movie_info_country`," \
                    "`movie_info_language`, `movie_info_pubdate`, `movie_info_duration`, `movie_info_other_name`," \
                    "`movie_info_summary`, `movie_info_rating`, `movie_info_review_count`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"

            cursor.execute(sql, (
                movie_info_id, movie_info_image_url, movie_info_name, movie_info_country, movie_info_language,
                movie_info_pubdate, movie_info_duration, movie_info_other_name,
                movie_info_summary,
                movie_info_rating, movie_info_review_count))
        kg_conn.commit()
except Exception as err:
    print('movie_info插入数据错误' + str(err))

In [12]:
# 查看插入的电影信息
sql = 'SELECT * FROM movie_info'
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchone()
result

{'movie_info_id': 1291544,
 'movie_info_name': '哈利·波特与阿兹卡班的囚徒 Harry Potter and the Prisoner of Azkaban',
 'movie_info_image_url': 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p1910812549.webp',
 'movie_info_country': '英国,美国',
 'movie_info_language': '英语',
 'movie_info_pubdate': '2004-09-10(中国大陆),2004-05-31(英国)',
 'movie_info_duration': '141分钟',
 'movie_info_other_name': '哈利波特3：阿兹卡班的逃犯(港,台),哈利·波特与阿兹卡班的逃犯,哈3',
 'movie_info_summary': '哈利（丹尼尔·雷德克里夫）即将在霍格沃兹渡过第三个年头，此时在阿兹塔班却传出恶棍小天狼星（加里·奥德曼）越狱的消息。据说小天狼星正是背叛哈利父母的好友，他的教父，而这次小天狼星越狱似乎正是为了找他。哈利的心里悄悄的滋生了为父母报仇的想法，期待着小天狼星的出现。新来的魔法老师卢平（大卫·休里斯）有着桀骜不驯的个性，与哈利关系很好，教了哈利许多实用的黑魔法防御知识。而此时在他的魔法地图上却出现了一个神秘人物，虫尾巴。偶然的机会下，在尖叫棚屋里集齐了哈利父亲当年的所有好友，卢平、虫尾巴、小天狼星，哈利的魔杖直指向小天狼星，但他发现那罪犯看他的眼神里充满了疼爱……',
 'movie_info_rating': '8.4',
 'movie_info_review_count': '233801'}

## 提取电影演员(演员、编剧、导演)信息到`movie_person`之中

In [14]:
# sql = "DROP TABLE  movie_person;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

sql = '''
create table movie_person
(
  movie_person_id            int          not null
    primary key,
  movie_person_name          text         null,
  movie_person_image_url     varchar(200) null,
  movie_person_gender        varchar(100) null,
  movie_person_constellation varchar(200) null,
  movie_person_birthday      varchar(200) null,
  movie_person_birthplace    text         null,
  movie_person_profession    varchar(200) null,
  movie_person_other_name    text         null,
  movie_person_introduction  mediumtext   null
);
'''

with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [15]:
movie_person_file_path = '../data/movie_kg/txt/small_movie_person_info.txt'
movie_person_str_list = open(movie_person_file_path, 'r').readlines()
movie_person_json_list = [json.loads(movie) for movie in movie_person_str_list]

try:
    with kg_conn.cursor() as cursor:
        for movie_person in movie_person_json_list:
            movie_person_id = movie_person['id']
            movie_person_name = movie_person['name']
            movie_person_image_url = movie_person['image_url']
            movie_person_gender = movie_person['gender']
            movie_person_constellation = movie_person['constellation']
            movie_person_birthday = movie_person['birthday']
            movie_person_birthplace = movie_person['birthplace']
            movie_person_profession = movie_person['profession']
            movie_person_other_name = movie_person['other_english_name'] + movie_person['other_chinese_name']
            movie_person_introduction = movie_person['introduction']

            sql = "INSERT INTO `movie_person` (`movie_person_id`, `movie_person_name`, `movie_person_image_url`," \
                    "`movie_person_gender`, `movie_person_constellation`, `movie_person_birthday`, `movie_person_birthplace`," \
                    "`movie_person_profession`, `movie_person_other_name`, " \
                    "`movie_person_introduction`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            cursor.execute(sql, (movie_person_id, movie_person_name, movie_person_image_url, movie_person_gender,movie_person_constellation,movie_person_birthday, movie_person_birthplace, movie_person_profession,movie_person_other_name, movie_person_introduction))
        kg_conn.commit()
except Exception as err:
    print('movie_person数据插入错误' + str(err))

In [16]:
sql = "SELECT * FROM movie_person;"
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchone()
result

{'movie_person_id': 1000045,
 'movie_person_name': '马修·布罗德里克 Matthew Broderick',
 'movie_person_image_url': 'https://img1.doubanio.com/view/celebrity/s_ratio_celebrity/public/p2777.webp',
 'movie_person_gender': '男',
 'movie_person_constellation': ':白羊座',
 'movie_person_birthday': '1962-03-21',
 'movie_person_birthplace': '美国,纽约',
 'movie_person_profession': '演员/配音/导演/制片人',
 'movie_person_other_name': '',
 'movie_person_introduction': "天生一副娃娃脸，成长于纽约人文气息浓厚的格林威治村，父亲是著名的舞台兼电影演员詹姆斯·布罗德里克，母亲是帕特里夏·布罗德里克。高中时马修对运动、戏剧都有兴趣，由于膝盖受伤，他只得把精力放在后者，17岁时并和父亲同台演出。毕业后他参加舞台剧《同性三分亲》(1988年搬上银幕，仍由马修主演)，由于颇受好评，让他获得拍片机会。《逃课天才》推出后，马修顿时声名大噪。限于外形，他多半饰演青少年角色，在《光荣战役》中，他尝试成熟角色，试图摆脱以往乳臭未干的感觉。后终以《哥斯拉》等片跃升卖座明星。多年来作品不断，类型多样。1997年5月，他与《欲望都市》的女星莎拉·杰茜卡·帕克结婚。之前，他曾经和海伦·亨特，詹妮弗·格瑞订婚，并且约会过莉莉·泰勒。以大男孩般迷人魅力著称的轻喜剧演员马修·布罗德里克，1962年3月出生在纽约市，后在WaldenSchool学习表演，17岁在演员父亲的帮助下首次登台表演。在舞台剧《布里顿海滩》（BrightonBeachMemoirs）和影片《贵客光临》（MaxDuganReturns）的推动下，他的事业开始加速前进。1983年他在轻喜剧《战争游戏》（WarGames）中首次取得银幕事业的一大成功，随后的《X计划》（ProjectX）、《不知不觉爱上你》（AddictedtoLove）和

## 电影和演员之间进行关联`actor_to_movie`

In [17]:
# sql = "DROP TABLE  actor_to_movie;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

sql = '''
create table actor_to_movie
(
  movie_info_id  int not null,
  movie_actor_id int not null,
  primary key (movie_info_id, movie_actor_id),
  constraint actor_to_movie_movie
  foreign key (movie_info_id) references movie_info (movie_info_id),
  constraint actor_to_movie_person
  foreign key (movie_actor_id) references movie_person (movie_person_id)
);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [18]:
try:
    with kg_conn.cursor() as cursor:
        for movie in movie_json_list:
            movie_info_id = movie['id']
            movie_actor = movie['actors']
            movie_actor_id = [actor['id'] for actor in movie_actor]
            movie_actor_id = list(set(movie_actor_id))
            if '' in movie_actor_id:
                movie_actor_id.remove('')
            for actor_id in movie_actor_id:
                # 判断movie_person中是否存在该ID
                is_existed_sql = 'select 1 from `movie_person` where `movie_person_id` = ' + '\'' + actor_id + '\'' + 'limit 1'
                cursor.execute(is_existed_sql)
                is_existed = cursor.fetchone()
                if is_existed is None:
                    continue
                actor_to_movie_sql = "INSERT INTO `actor_to_movie` (`movie_info_id`, `movie_actor_id`) VALUES (%s, %s)"
                cursor.execute(actor_to_movie_sql, (movie_info_id, actor_id))
        kg_conn.commit()
except Exception as err:
    print('actor_to_movie数据插入错误' + str(err))

In [19]:
sql = '''
create index actor_to_movie_person
  on actor_to_movie (movie_actor_id);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [20]:
sql = "SELECT * FROM actor_to_movie;"
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchmany(10)
result

[{'movie_info_id': 1301753, 'movie_actor_id': 1000045},
 {'movie_info_id': 25980443, 'movie_actor_id': 1000045},
 {'movie_info_id': 1292052, 'movie_actor_id': 1000095},
 {'movie_info_id': 6878446, 'movie_actor_id': 1000145},
 {'movie_info_id': 30377703, 'movie_actor_id': 1000145},
 {'movie_info_id': 26325320, 'movie_actor_id': 1000147},
 {'movie_info_id': 2161696, 'movie_actor_id': 1000182},
 {'movie_info_id': 3319755, 'movie_actor_id': 1000182},
 {'movie_info_id': 1295644, 'movie_actor_id': 1000208},
 {'movie_info_id': 2028677, 'movie_actor_id': 1000220}]

## 电影和编剧之间进行关联 `writer_to_movie`

In [21]:
# sql = "DROP TABLE  writer_to_movie;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

sql = '''
create table writer_to_movie
(
  movie_info_id   int not null,
  movie_writer_id int not null,
  primary key (movie_info_id, movie_writer_id),
  constraint writer_to_movie_movie
  foreign key (movie_info_id) references movie_info (movie_info_id),
  constraint writer_to_movie_person
  foreign key (movie_writer_id) references movie_person (movie_person_id)
);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [22]:
try:
    with kg_conn.cursor() as cursor:
        for movie in movie_json_list:
            movie_info_id = movie['id']
            movie_writer = movie['writers']
            movie_writer_id = [writer['id'] for writer in movie_writer]
            movie_writer_id = list(set(movie_writer_id))
            if '' in movie_writer_id:
                movie_writer_id.remove('')
            for writer_id in movie_writer_id:
                # 判断movie_person中是否存在该ID
                is_existed_sql = 'select 1 from `movie_person` where `movie_person_id` = ' + '\'' + writer_id + '\'' + 'limit 1'
                cursor.execute(is_existed_sql)
                is_existed = cursor.fetchone()
                if is_existed is None:
                    continue
                writer_to_movie_sql = "INSERT INTO `writer_to_movie` (`movie_info_id`, `movie_writer_id`) VALUES (%s, %s)"
                cursor.execute(writer_to_movie_sql, (movie_info_id, writer_id))
        kg_conn.commit()
except Exception as err:
    print('actor_to_movie数据插入错误' + str(err))

In [23]:
sql = '''
create index writer_to_movie_person
  on writer_to_movie (movie_writer_id);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [24]:
sql = "SELECT * FROM writer_to_movie;"
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchmany(10)
result

[{'movie_info_id': 5300054, 'movie_writer_id': 1000371},
 {'movie_info_id': 6803494, 'movie_writer_id': 1000371},
 {'movie_info_id': 1292720, 'movie_writer_id': 1000393},
 {'movie_info_id': 1485260, 'movie_writer_id': 1000393},
 {'movie_info_id': 2129039, 'movie_writer_id': 1004746},
 {'movie_info_id': 25954475, 'movie_writer_id': 1004746},
 {'movie_info_id': 1381964, 'movie_writer_id': 1005177},
 {'movie_info_id': 2043546, 'movie_writer_id': 1005177},
 {'movie_info_id': 26683290, 'movie_writer_id': 1005177},
 {'movie_info_id': 1292001, 'movie_writer_id': 1005822}]

## 电影和电影类别之间进行关联

In [25]:
# sql = "DROP TABLE  movie_to_genre;"
# with kg_conn.cursor() as cursor:
#     cursor.execute(sql)
#     kg_conn.commit()

sql = '''
create table movie_to_genre
(
  movie_info_id  int not null,
  movie_genre_id int not null,
  primary key (movie_info_id, movie_genre_id),
  constraint movie_to_genre_movie
  foreign key (movie_info_id) references movie_info (movie_info_id),
  constraint movie_to_genre_genre
  foreign key (movie_genre_id) references movie_genre (movie_genre_id)
);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [26]:
try:
    with kg_conn.cursor() as cursor:
        for movie in movie_json_list:
            movie_info_id = movie['id']
            movie_genres = movie['genres']
            for genre in movie_genres:
                movie_genre_id_sql = "SELECT `movie_genre_id` FROM `movie_genre` where `movie_genre_name`=" + '\'' + str(
                    genre) + '\''
                cursor.execute(movie_genre_id_sql)
                movie_genre_info = cursor.fetchone()
                movie_genre_id = movie_genre_info['movie_genre_id']

                # 插入到movie_to_genre之中
                # print(str(movie_info_id) + ':' + str(movie_genre_id))
                movie_to_genre_sql = 'INSERT INTO `movie_to_genre` (`movie_info_id`, `movie_genre_id`) VALUES (%s, %s)'
                cursor.execute(movie_to_genre_sql, (movie_info_id, movie_genre_id))
        kg_conn.commit()

except Exception as err:
    print('movie_to_genre数据插入错误' + str(err))

In [27]:
sql = '''
create index movie_to_genre_idx
  on movie_to_genre (movie_genre_id);
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    kg_conn.commit()

In [28]:
sql = "SELECT * FROM movie_to_genre;"
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchmany(10)
result

[{'movie_info_id': 1291545, 'movie_genre_id': 1},
 {'movie_info_id': 1291568, 'movie_genre_id': 1},
 {'movie_info_id': 1291818, 'movie_genre_id': 1},
 {'movie_info_id': 1292062, 'movie_genre_id': 1},
 {'movie_info_id': 1292365, 'movie_genre_id': 1},
 {'movie_info_id': 1292434, 'movie_genre_id': 1},
 {'movie_info_id': 1294766, 'movie_genre_id': 1},
 {'movie_info_id': 1294833, 'movie_genre_id': 1},
 {'movie_info_id': 1300064, 'movie_genre_id': 1},
 {'movie_info_id': 1301617, 'movie_genre_id': 1}]

## 显示所有 Table

In [29]:
sql = '''
SHOW TABLES;
'''
with kg_conn.cursor() as cursor:
    cursor.execute(sql)
    result = cursor.fetchall()
result

[{'Tables_in_movie_kg': 'actor_to_movie'},
 {'Tables_in_movie_kg': 'movie_genre'},
 {'Tables_in_movie_kg': 'movie_info'},
 {'Tables_in_movie_kg': 'movie_person'},
 {'Tables_in_movie_kg': 'movie_to_genre'},
 {'Tables_in_movie_kg': 'writer_to_movie'}]

RDB转换成RDF有两种方式，一是direct mapping，即直接映射。另一种为R2RDF(RDB to RDF Mapping Language)，链接为https://www.w3.org/TR/r2rml/。


下面我们使用D2RQ工具将RDB数据转换到RDF形式。
直接映射规则为:

- 数据库的表作为本体中的类（Class）。
- 表的列作为属性（Property）。
- 表的行作为实例/资源。
- 表的单元格值为字面量。
- 如果单元格所在的列是外键，那么其值为IRI，或者说实体/资源。

## 在命令行中执行以下命令生成CSV

```
bin/mysql -A movie_kg -uroot -ppassword -e 'select * from movie_person;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/movie_person.csv

bin/mysql -A movie_kg -uroot -ppassword -e 'select * from movie_info;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/movie_info.csv

bin/mysql -A movie_kg -uroot -ppassword -e 'select * from movie_genre;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/movie_genre.csv

bin/mysql -A movie_kg -uroot -ppassword -e 'select * from movie_to_genre;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/movie_to_genre.csv

bin/mysql -A movie_kg -uroot -ppassword -e 'select * from actor_to_movie;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/actor_to_movie.csv

bin/mysql -A movie_kg -uroot -ppassword -e 'select * from writer_to_movie;' > ~/code/git/fhaoguo/KnowledgeGraph/data/movie_kg/csv/writer_to_movie.csv
```

## 编辑属性

In [30]:
movie_person = pd.read_table("../data/movie_kg/csv/movie_person.csv")
columns = list(movie_person.columns)
for i, name in enumerate(columns):
    if i == 0:
        columns[i] = columns[i] + ":ID"
    else:
        columns[i] = columns[i] + ":string"
movie_person.columns = columns
movie_person[":LABEL"] = "person"
movie_person.to_csv("../data/movie_kg/csv/movie_person.csv", index = None)

In [31]:
movie_info = pd.read_table("../data/movie_kg/csv/movie_info.csv")
columns = list(movie_info.columns)
for i, name in enumerate(columns):
    if i == 0:
        columns[i] = columns[i] + ":ID"
    else:
        columns[i] = columns[i] + ":string"
movie_info.columns = columns
movie_info[":LABEL"] = "movie"
movie_info.to_csv("../data/movie_kg/csv/movie_info.csv", index = None)

In [32]:
movie_genre = pd.read_table("../data/movie_kg/csv/movie_genre.csv")
columns = list(movie_genre.columns)
for i, name in enumerate(columns):
    if i == 0:
        columns[i] = columns[i] + ":ID"
    else:
        columns[i] = columns[i] + ":string"
movie_genre.columns = columns
movie_genre[":LABEL"] = "genre"
movie_genre.to_csv("../data/movie_kg/csv/movie_genre.csv", index = None)

## Clean Relationships

In [33]:
movie_info_set = set(movie_info["movie_info_id:ID"].to_list())
movie_genre_set = set(movie_genre["movie_genre_id:ID"].to_list())
movie_person_set = set(movie_person["movie_person_id:ID"].to_list())

In [34]:
movie_to_genre = pd.read_table("../data/movie_kg/csv/movie_to_genre.csv")
movie_to_genre.columns = [":START_ID", ":END_ID"]
movie_to_genre = movie_to_genre[movie_to_genre[":START_ID"].isin(movie_info_set) & movie_to_genre[":END_ID"].isin(movie_genre_set)]
movie_to_genre[":TYPE"] = "类型"
movie_to_genre.to_csv("../data/movie_kg/csv/movie_to_genre2.csv", index = None)

In [35]:
actor_to_movie = pd.read_table("../data/movie_kg/csv/actor_to_movie.csv")
actor_to_movie.columns = [":START_ID", ":END_ID"]
actor_to_movie = actor_to_movie[actor_to_movie[":START_ID"].isin(movie_info_set) &  actor_to_movie[":END_ID"].isin(movie_person_set)]
actor_to_movie[":TYPE"] = "演员"
actor_to_movie.to_csv("../data/movie_kg/csv/actor_to_movie2.csv", index = None)

In [36]:
writer_to_movie = pd.read_table("../data/movie_kg/csv/writer_to_movie.csv")
writer_to_movie.columns = [":START_ID", ":END_ID"]
writer_to_movie = writer_to_movie[writer_to_movie[":START_ID"].isin(movie_info_set) &  writer_to_movie[":END_ID"].isin(movie_person_set)]
writer_to_movie[":TYPE"] = "编剧"
writer_to_movie.to_csv("../data/movie_kg/csv/writer_to_movie2.csv", index = None)

## CSV导入Neo4j（ONgDB）

```
bin/ongdb-admin import --database graph.db --nodes=import/movie_genre.csv --nodes=import/movie_info.csv --nodes=import/movie_person.csv --relationships=import/actor_to_movie2.csv --relationships=import/movie_to_genre2.csv --relationships=import/writer_to_movie2.csv
```

## 效果预览

执行以下Cypher语句
```
match p=(:person)<-[:`演员`|:`编剧`]-(:movie)-[:`编剧`]->(:person)<-[:`演员`]-(:movie)-[:`编剧`|:`演员`]->(:person) return p limit 50
```
结果如下：

![](./graph.png)