In [None]:
import math
import random
import re
import sqlite3
import time
import urllib
import urllib.parse

import pandas as pd
import requests

In [None]:
# HTTP headers attached to every request sent to music.163.com; the
# User-Agent value is that of a desktop Chrome browser on Windows.
my_headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Host": "music.163.com",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/64.0.3282.140 Safari/537.36"
    ),
}

In [None]:
def getJSON(url, headers, timeout=10):
    """
    Send an HTTP GET request and return the response body parsed as JSON.

    @ param url: address to request
    @ param headers: dict of HTTP headers sent with the request
    @ param timeout: seconds to wait for the server before giving up;
        without a timeout a stalled connection would block the crawl
        indefinitely
    @ return json: the decoded JSON payload
    @ raise requests.HTTPError: when the server answers with a 4xx/5xx status
    @ raise requests.Timeout: when the server does not answer within ``timeout``
    """
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()
    # Decode the body as UTF-8 whatever charset the response declares.
    res.encoding = 'utf-8'
    return res.json()

In [None]:
def countPages(total, limit):
    """
    Work out how many pages are needed to hold ``total`` items.

    @ param total: overall number of items
    @ param limit: number of items on one page
    @ return page: ``total / limit`` rounded up to a whole number
    """
    return math.ceil(total / limit)

In [None]:
def getSongInfo(song_list):
    """
    Flatten raw song dicts into plain rows.

    Every row is ``[id, name, artists, album name, album id, duration]``.
    ``artists`` is built by appending a comma to each artist name and
    gluing the pieces together, so a non-empty value ends with a comma.

    @ param song_list: iterable of song dicts carrying the keys ``id``,
        ``name``, ``artists``, ``album`` and ``duration``
    @ return song_info_list: list with one row per song, in input order
    """
    return [
        [
            track['id'],
            track['name'],
            ''.join(performer['name'] + ',' for performer in track['artists']),
            track['album']['name'],
            track['album']['id'],
            track['duration'],
        ]
        for track in song_list
    ]

In [None]:
def getSongList(key, limit=30):
    """
    Search music.163.com for ``key`` and collect every page of song results.

    @ param key: search keyword (URL-quoted before it is sent)
    @ param limit: number of songs requested per page
    @ return result: DataFrame with the columns song_id, song_name,
        artists, album_name, album_id and duration; it has no rows when
        the search reports no songs
    """
    columns = ['song_id', 'song_name', 'artists', 'album_name', 'album_id', 'duration']
    key = urllib.parse.quote(key)

    def _page_url(offset):
        # Build the search URL for the page starting at record ``offset``.
        return ('http://music.163.com/api/search/get/web?csrf_token=&hlpretag=&hlposttag=&s=' + key
                + '&type=1&offset=' + str(offset) + '&total=true&limit=' + str(limit))

    # The first page doubles as the probe for the total hit count, so it is
    # requested once with the real page size and reused below.
    first_page = getJSON(_page_url(0), my_headers)
    song_count = first_page['result'].get('songCount') or 0
    page_num = countPages(song_count, limit)

    total_list = []
    for n in range(page_num):
        if n == 0:
            page = first_page
        else:
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(2, 4))
            # `offset` is a record offset (per the community API docs), so
            # page n starts at n * limit; sending the bare page index made
            # consecutive pages overlap by all but one record.
            page = getJSON(_page_url(n * limit), my_headers)
        # NOTE(review): the payload appears to omit 'songs' when a page has
        # no hits, so a missing key is treated as an empty page - confirm.
        songs = page['result'].get('songs') or []
        total_list += getSongInfo(songs)

    return pd.DataFrame(data=total_list, columns=columns)

In [None]:
def getComment(comments):
    """
    Flatten raw comment dicts into plain rows.

    Every row is ``[comment id, user id, user nickname, user avatar URL,
    content, liked count]``.

    @ param comments: iterable of comment dicts carrying the keys
        ``commentId``, ``user`` (with ``userId``, ``nickname`` and
        ``avatarUrl``), ``content`` and ``likedCount``
    @ return comments_list: list with one row per comment, in input order
    """
    rows = []
    for entry in comments:
        rows.append([
            entry['commentId'],
            entry['user']['userId'],
            entry['user']['nickname'],
            entry['user']['avatarUrl'],
            entry['content'],
            entry['likedCount'],
        ])
    return rows

In [None]:
def getSongComment(id, limit=20):
    """
    Download every page of comments for one song.

    @ param id: song id, appended to the ``R_SO_4_`` resource name in the URL
    @ param limit: number of comments requested per page
    @ return result: DataFrame with the columns comment_id, user_id,
        user_nickname, user_avatar, content and likeCount
    """
    columns = ['comment_id', 'user_id', 'user_nickname', 'user_avatar', 'content', 'likeCount']

    def _page_url(offset):
        # Build the comment URL for the page starting at record ``offset``.
        return ('http://music.163.com/api/v1/resource/comments/R_SO_4_' + str(id)
                + '?limit=' + str(limit) + '&offset=' + str(offset))

    # The first page also carries the total comment count, so it is
    # requested once with the real page size and reused below.
    first_page = getJSON(_page_url(0), my_headers)
    page_num = countPages(first_page['total'], limit)

    total_comment = []
    for n in range(page_num):
        if n == 0:
            page = first_page
        else:
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(2, 4))
            # `offset` is a record offset (per the community API docs), so
            # page n starts at n * limit; sending the bare page index made
            # consecutive pages overlap by all but one record.
            page = getJSON(_page_url(n * limit), my_headers)
        total_comment += getComment(page['comments'])

        print('第 {0}/{1} 页爬取完成'.format(n+1, page_num))

    return pd.DataFrame(data=total_comment, columns=columns)

In [None]:
# Connection to the SQLite database file (created on first use, relative to
# the working directory) that the crawled DataFrames are written into.
conn = sqlite3.connect('netease_cloud_music.db')

In [None]:
# Artist name used both as the search keyword and as the filter applied to
# the `artists` column of the search results.
artist='窦唯'

In [None]:
# Search for the artist (100 songs per page), keep only the songs whose
# artist string really contains the name, and store them in the `song` table.
song_df = getSongList(artist, 100)
# regex=False matches the name literally; by default str.contains treats the
# pattern as a regular expression, which mis-matches or raises for names
# that contain characters such as '(', '+' or '.'.
song_df = song_df[song_df['artists'].str.contains(artist, regex=False)]
# NOTE: if_exists='append' adds the rows again every time this cell is run.
song_df.to_sql(name='song', con=conn, if_exists='append', index=False)

In [None]:
# Crawl the comments of every song in song_df and store them in the
# `comment` table.
comment_frames = []
for song_id in song_df['song_id']:
    print('开始爬取 {}'.format(song_id))
    comment_frames.append(getSongComment(song_id, limit=100))
# Concatenate once instead of growing a DataFrame inside the loop (which
# re-copies all earlier rows on every iteration). pd.concat rejects an empty
# list, so fall back to an empty frame when there were no songs.
comment_df = pd.concat(comment_frames) if comment_frames else pd.DataFrame()
# NOTE: if_exists='append' adds the rows again every time this cell is run.
comment_df.to_sql(name='comment', con=conn, if_exists='append', index=False)