# RDBMS (관계형 데이터베이스)
- sql언어를 사용
- 규격이 엄격하고, 다양한 규격이 있으면 매번 수정해주어야 한다
- Mysql, Oracle, Postgresql, SQLlte ...

# NoSQL
- sql을 사용하지 않음
- 고정된 스키마가 없다
- 정해진 규격이 엄격하지 않다
- Mongodb, redis, Hbase, cassandra...

# Mongodb
- json구조로 data(document)를 관리
- sql : database > table > data(row, column)
- mongodb : database > collection > document


In [None]:
# mongod --d

# http://localhost:27017

In [1]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.0.1-cp39-cp39-win_amd64.whl (354 kB)
Installing collected packages: pymongo
Successfully installed pymongo-4.0.1


In [5]:
import pymongo

In [6]:
conn = pymongo.MongoClient()

In [16]:
tdb = conn['testdb']

In [17]:
col_it = tdb['it']

In [18]:
post = {'author':'Mike', 'text':'my first blog post', 'tags':['mongodb','python','pymongo']}
col_it.insert_one(post)

<pymongo.results.InsertOneResult at 0x226d5f65ac0>

In [26]:
results = col_it.find()
for r in results:
    print(r)

{'_id': ObjectId('6203598fa2b361620b295440'), 'author': 'Mike', 'text': 'my first blog post', 'tags': ['mongodb', 'python', 'pymongo']}


In [85]:
col_it.insert_on({'author':'Dave Lee', 'age':45})

TypeError: 'Collection' object is not callable. If you meant to call the 'insert_on' method on a 'Collection' object it is failing because no such method exists.

In [30]:
results = col_it.find()
for r in results:
    print(r)

{'_id': ObjectId('6203598fa2b361620b295440'), 'author': 'Mike', 'text': 'my first blog post', 'tags': ['mongodb', 'python', 'pymongo']}
{'_id': ObjectId('62035a83a2b361620b295441'), 'author': 'Dave Ahn', 'age': 25}
{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}


In [28]:
col_it.insert_many(
    [
        {'author':'Dave Ahn', 'age':25},
        {'author':'Dave', 'age':35}
        
    ]
)

<pymongo.results.InsertManyResult at 0x226c64abd40>

In [31]:
results = col_it.find()
for r in results:
    print(r)

{'_id': ObjectId('6203598fa2b361620b295440'), 'author': 'Mike', 'text': 'my first blog post', 'tags': ['mongodb', 'python', 'pymongo']}
{'_id': ObjectId('62035a83a2b361620b295441'), 'author': 'Dave Ahn', 'age': 25}
{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}


In [None]:
* document insert하면서, _id(primary key)를 확인하는 법

In [33]:
post = {'author':'Dave', 'text':'my first blog post'}

post_id = col_it.insert_one(post)
post_id

<pymongo.results.InsertOneResult at 0x226d6083c80>

In [34]:
post_id.inserted_id

ObjectId('62035abda2b361620b295443')

In [35]:
col_it.count_documents({})

4

In [36]:
# col_it.count()

In [37]:
* 입력 : {}, 리스트, 중첩 딕셔너리

SyntaxError: invalid syntax (Temp/ipykernel_14328/4054086869.py, line 1)

In [86]:
col_it.insert_one({'title':'암살', 'castings':['이정재','전지현','하정우']})

<pymongo.results.InsertOneResult at 0x226d5e79280>

In [87]:
col_it.insert_one(
    {
        'title':'실미도',
        'castings':['설경구','안성기'],
        'datetime':
        {
            'year': '2003',
            'month': 3,
            'val':
            {
                'a':
                {
                    'b':1
                }
            }
        }
    }
)

<pymongo.results.InsertOneResult at 0x226db84fe80>

In [90]:
data = list()
data.append({'name':'aaron','age':20})
data.append({'name':'bob','age':22})
data.append({'name':'cathy','age':25})
data.append({'name':'erick','age':30})
data.append({'name':'aaron','age':20})
data.append({'name':'hnm'})

col_it.insert_many(data)

<pymongo.results.InsertManyResult at 0x226dab58f80>

In [91]:
col_it.count_documents({})

14

## document 검색하기

* find_one()

In [92]:
col_it.find_one()

{'_id': ObjectId('6203598fa2b361620b295440'),
 'author': 'Mike',
 'text': 'my first blog post',
 'tags': ['mongodb', 'python', 'pymongo']}

In [94]:
results = col_it.find()
for r in results:
    print(r)

{'_id': ObjectId('6203598fa2b361620b295440'), 'author': 'Mike', 'text': 'my first blog post', 'tags': ['mongodb', 'python', 'pymongo']}
{'_id': ObjectId('62035a83a2b361620b295441'), 'author': 'Dave Ahn', 'age': 25}
{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'my first blog post'}
{'_id': ObjectId('62035ac9a2b361620b295444'), 'title': '암살', 'castings': ['이정재', '전지현', '하정우']}
{'_id': ObjectId('62035af3a2b361620b295445'), 'title': '실미도', 'castings': ['설경구', '안성기'], 'datetime': {'year': '2003', 'month': 3, 'val': {'a': {'b': 1}}}}
{'_id': ObjectId('62038fd4a2b361620b2954d3'), 'title': '암살', 'castings': ['이정재', '전지현', '하정우']}
{'_id': ObjectId('62038fd5a2b361620b2954d4'), 'title': '실미도', 'castings': ['설경구', '안성기'], 'datetime': {'year': '2003', 'month': 3, 'val': {'a': {'b': 1}}}}
{'_id': ObjectId('620390fea2b361620b2954d5'), 'name': 'aaron', 'age': 20}
{'_id': ObjectId('620390fea2b361620b29

In [98]:
results = col_it.find({'author':'Dave'})
for r in results:
    print(r)

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'my first blog post'}


In [99]:
col_it.find_one({'author':'Dave'})

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}

In [100]:
col_it.count_documents({'author':'Dave'})

2

In [102]:
for r in col_it.find().sort('age'):
    print(r)           ###????

{'_id': ObjectId('6203598fa2b361620b295440'), 'author': 'Mike', 'text': 'my first blog post', 'tags': ['mongodb', 'python', 'pymongo']}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'my first blog post'}
{'_id': ObjectId('62035ac9a2b361620b295444'), 'title': '암살', 'castings': ['이정재', '전지현', '하정우']}
{'_id': ObjectId('62035af3a2b361620b295445'), 'title': '실미도', 'castings': ['설경구', '안성기'], 'datetime': {'year': '2003', 'month': 3, 'val': {'a': {'b': 1}}}}
{'_id': ObjectId('62038fd4a2b361620b2954d3'), 'title': '암살', 'castings': ['이정재', '전지현', '하정우']}
{'_id': ObjectId('62038fd5a2b361620b2954d4'), 'title': '실미도', 'castings': ['설경구', '안성기'], 'datetime': {'year': '2003', 'month': 3, 'val': {'a': {'b': 1}}}}
{'_id': ObjectId('620390fea2b361620b2954da'), 'name': 'hnm'}
{'_id': ObjectId('620390fea2b361620b2954d5'), 'name': 'aaron', 'age': 20}
{'_id': ObjectId('620390fea2b361620b2954d9'), 'name': 'aaron', 'age': 20}
{'_id': ObjectId('620390fea2b361620b2954d6'), 'name': 'bo

### document update: update_one(),update_many()

In [103]:
col_it.find_one({'author':'Dave'})


{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35}

In [104]:
col_it.update_one({"author":'Dave'},
                 {"$set": {'text': 'Hi Dave'}})

<pymongo.results.UpdateResult at 0x226db7a5d40>

In [107]:
col_it

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'testdb'), 'it')

In [108]:
for d in col_it.find({'author':'Dave'}):
    print(d)

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35, 'text': 'Hi Dave'}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'my first blog post'}


In [109]:
col_it.update_many({'author':'Dave'},
                  {"$set": {'text':'hi dave'}})

<pymongo.results.UpdateResult at 0x226d5ebad00>

In [110]:
for d in col_it.find({'author':'Dave'}):
    print(d)

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35, 'text': 'hi dave'}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'hi dave'}


## document delete:delete_one(), delete_many()

In [111]:
for d in col_it.find({'author':'Dave'}):
    print(d)

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35, 'text': 'hi dave'}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'hi dave'}


In [112]:
col_it.delete_one({'author':'Dave Lee'})

<pymongo.results.DeleteResult at 0x226db8ef800>

In [115]:
for d in col_it.find({'author':'Dave Lee'}):
    print(d)

In [116]:
for d in col_it.find({'author':'Dave'}):
    print(d)

{'_id': ObjectId('62035a83a2b361620b295442'), 'author': 'Dave', 'age': 35, 'text': 'hi dave'}
{'_id': ObjectId('62035abda2b361620b295443'), 'author': 'Dave', 'text': 'hi dave'}


In [118]:
col_it.delete_many({'author':'Dave'})

<pymongo.results.DeleteResult at 0x226db8354c0>

In [119]:
for d in col_it.find({'author':'Dave'}):
    print(d)

In [120]:
boos = conn.books


In [121]:
it_book = boos.it_books

In [122]:
data = list()
for index in range(100):
    data.append({'author':'Dave Lee','publisher':'bit_company','number': index})

In [123]:
data

[{'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 0},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 1},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 2},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 3},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 4},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 5},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 6},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 7},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 8},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 9},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 10},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 11},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 12},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 13},
 {'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 14},
 {'au

In [124]:
it_book.insert_many(data)

<pymongo.results.InsertManyResult at 0x226d5d68900>

In [125]:
docs = it_book.find()
for doc in docs:
    print(doc)

{'_id': ObjectId('620356dca2b361620b2953dc'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 0}
{'_id': ObjectId('620356dca2b361620b2953dd'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 1}
{'_id': ObjectId('620356dca2b361620b2953de'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 2}
{'_id': ObjectId('620356dca2b361620b2953df'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 3}
{'_id': ObjectId('620356dca2b361620b2953e0'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 4}
{'_id': ObjectId('620356dca2b361620b2953e1'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 5}
{'_id': ObjectId('62039270a2b361620b2954db'), 'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 0}
{'_id': ObjectId('62039270a2b361620b2954dc'), 'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 1}
{'_id': ObjectId('62039270a2b361620b2954dd'), 'author': 'Dave Lee', 'publisher': 'bit_company', 'number': 2}
{'_id': Objec

In [126]:
it_book.update_many({}, {'$set': {'publisher':'bit_camp_pub'}})

<pymongo.results.UpdateResult at 0x226db42f140>

In [127]:
docs = it_book.find()
for doc in docs:
    print(doc)

{'_id': ObjectId('620356dca2b361620b2953dc'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 0}
{'_id': ObjectId('620356dca2b361620b2953dd'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 1}
{'_id': ObjectId('620356dca2b361620b2953de'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 2}
{'_id': ObjectId('620356dca2b361620b2953df'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 3}
{'_id': ObjectId('620356dca2b361620b2953e0'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 4}
{'_id': ObjectId('620356dca2b361620b2953e1'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 5}
{'_id': ObjectId('62039270a2b361620b2954db'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 0}
{'_id': ObjectId('62039270a2b361620b2954dc'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 1}
{'_id': ObjectId('62039270a2b361620b2954dd'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 2}
{'_id': Ob

In [128]:
it_book.delete_many({'number':{'$gte':6}})         # 6보다 크거나 같은거 삭제

<pymongo.results.DeleteResult at 0x226db7a5940>

In [129]:
docs = it_book.find()
for doc in docs:
    print(doc)

{'_id': ObjectId('620356dca2b361620b2953dc'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 0}
{'_id': ObjectId('620356dca2b361620b2953dd'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 1}
{'_id': ObjectId('620356dca2b361620b2953de'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 2}
{'_id': ObjectId('620356dca2b361620b2953df'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 3}
{'_id': ObjectId('620356dca2b361620b2953e0'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 4}
{'_id': ObjectId('620356dca2b361620b2953e1'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 5}
{'_id': ObjectId('62039270a2b361620b2954db'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 0}
{'_id': ObjectId('62039270a2b361620b2954dc'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 1}
{'_id': ObjectId('62039270a2b361620b2954dd'), 'author': 'Dave Lee', 'publisher': 'bit_camp_pub', 'number': 2}
{'_id': Ob

In [None]:
# crawling cine21

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore') 

import matplotlib as mpl
import matplotlib.font_manager as fm

mpl.rcParams['axes.unicode_minus'] = False

path = 'C:/Windows/Fonts/malgun.ttf'
font_name = fm.FontProperties(fname=path, size=50).get_name()
plt.rc('font', family=font_name)

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

import MySQLdb

In [130]:
import re

In [77]:
url = 'http://www.cine21.com/rank/person'

In [78]:
month = '2022-01'

data_a = {'section': 'actor',
    'period_start': month,
    'gender':'all',
    'page': 1}

In [79]:
res = requests.get(url, data = data)
soup = BeautifulSoup(res.text, 'html.parser')
soup

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="1641311652800771" property="fb:pages"/>
<meta content="vTM0gmeRzJwn1MIM1LMSp3cxP_SaBzch1ziRY255RHw" name="google-site-verification"/>
<meta content="5yOe6b_e_3rr7vNDwgXJw_8wLZQGx4lJ_V48KNPrqkA" name="google-site-verification"/>
<meta content="20defde86fc4464f2693891567a98905bd0a60d1" name="naver-site-verification"/>
<meta content="dmds9ks357rhqvdnk" name="dailymotion-domain-verification"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>씨네21</title>
<link href="/inc/www/css/default1.css" media="all" rel="stylesheet" type="text/css"/>
<link href="/inc/www/css/content1.css" media="all" rel="stylesheet" type="text/css"/>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.12.0/jquery.min.js"></script>
<meta conten

In [80]:
li_tags = soup.select('li.people_li')
len(li_tags)

0

In [81]:
main_url = 'http://www.cine21.com'

for t in tags:
    print(t.select('a')[0]['href'])
    print(re.sub("\(\w+\)", "", t.text)

SyntaxError: unexpected EOF while parsing (Temp/ipykernel_14328/2537906092.py, line 5)

In [71]:
actor_url = "https://image.cine21.com/resize/cine21/still/2017/1207/15_06_46__5a28da76c2e01[X145,145].jpg"

In [74]:
res = requests.get(actor_url)
soup = BeautifulSoup(res.text, 'html.parser')
actor_data = soup.select('ul.default_info')
actor_data

[]

In [75]:
actor_data[0].select('li')


IndexError: list index out of range

In [76]:
actor_info_dict = dict()

for li in actor_data[0].select('li'):
#     print(li)

    key = li.select_one('span.tit').text
    
    li = re.sub("<span.*?>.*?</span>", '', str(li))
    value = re.sub("<.+?>", "", li)
    
    actor_info_dict[key] = value.strip()
    
    print(key, value)
    
actor_info_dict

IndexError: list index out of range

In [None]:
### 흥행지수 뽑기

In [82]:
for s i in main_soup.select("li.people_li ul.num_info strong"):
    print(int(s.text.replace(',','')))

NameError: name 'main_soup' is not defined

In [83]:
movie_list = []
for s i in main_soup.select("li.people_li  ul.mov_list"):
    actor_movie = []
    for l in s.select('span'):
        actor_movie.append(l, text.strip())
    movie_list.append(actor_movie)

SyntaxError: invalid syntax (Temp/ipykernel_14328/2035497597.py, line 1)

In [None]:
movie_list

In [84]:
from bs4 import BeautifulSoup
import requests
import pymongo
import re

conn = pymongo.MongoClient()
actor_db = conn.cine21
actor_collection = actor_db.actor_collection

actors_info_list = list()

cine21_url = 'http://www.cine21.com/rank/person/content'
post_data = dict()
post_data['section'] = 'actor'
post_data['period_start'] = '2022-01'
post_data['gender'] = 'all'

for index in range(1, 21):
    post_data['page'] = index

    res = requests.post(cine21_url, data=post_data)
    soup = BeautifulSoup(res.content, 'html.parser')

    actors = soup.select('li.people_li div.name')
    hits = soup.select('ul.num_info > li > strong')
    movies = soup.select('ul.mov_list')
    rankings = soup.select('li.people_li > span.grade')
    
    for index, actor in enumerate(actors):
        actor_name = re.sub('\(\w*\)', '', actor.text)
        actor_hits = int(hits[index].text.replace(',', ''))
        movie_titles = movies[index].select('li a span')
        movie_title_list = list()
        for movie_title in movie_titles:
            movie_title_list.append(movie_title.text)
            
            
        actor_info_dict = dict()
        actor_info_dict['배우이름'] = actor_name
        actor_info_dict['흥행지수'] = actor_hits
        actor_info_dict['출연영화'] = movie_title_list
        actor_info_dict['랭킹'] = rankings[index].text

        actor_link = 'http://www.cine21.com' + actor.select_one('a').attrs['href']
        response_actor = requests.get(actor_link)
        soup_actor = BeautifulSoup(response_actor.content, 'html.parser')
        default_info = soup_actor.select_one('ul.default_info')
        actor_details = default_info.select('li')

        for actor_item in actor_details:
            actor_item_field = actor_item.select_one('span.tit').text
            actor_item_value = re.sub('<span.*?>.*?</span>', '', str(actor_item))
            actor_item_value = re.sub('<.*?>', '', actor_item_value)
            actor_info_dict[actor_item_field] = actor_item_value
        actors_info_list.append(actor_info_dict)
        
actor_collection.insert_many(actors_info_list)

<pymongo.results.InsertManyResult at 0x226db1b3580>

In [None]:
actor_collection.insert_many(actors_info_list)

In [None]:
results = actor_