In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
def get_page_source_code(url):
    """
    :param url: 需要爬取的网页的url
    :return: 返回网页的源代码
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup

In [3]:
source_code = get_page_source_code('http://www.chinadaily.com.cn/')

In [4]:
source_code.a

<a class="a-center" href="//www.chinadaily.com.cn" shape="rect" target="_top"><img alt="chinadaily" src="//www.chinadaily.com.cn/image_e/2017/logo.png"/>
</a>

In [5]:
source_code.a.string

In [13]:
for item in source_code.find_all('a'):
    print(item)
    print(item.find_all('img'))

<a class="a-center" href="//www.chinadaily.com.cn" shape="rect" target="_top"><img alt="chinadaily" src="//www.chinadaily.com.cn/image_e/2017/logo.png"/>
</a>
[<img alt="chinadaily" src="//www.chinadaily.com.cn/image_e/2017/logo.png"/>]
<a href="//cn.chinadaily.com.cn" shape="rect" target="_top"><img src="//www.chinadaily.com.cn/image_e/2017/cnbut.png"/></a>
[<img src="//www.chinadaily.com.cn/image_e/2017/cnbut.png"/>]
<a href="//newssearch.chinadaily.com.cn/en/search" shape="rect" target="_top"><span>Search</span></a>
[]
<a atremote="1" href="//www.chinadaily.com.cn" shape="rect" target="_top">HOME</a>
[]
<a href="//www.chinadaily.com.cn/china" shape="rect" target="_top">CHINA</a>
[]
<a href="//www.chinadaily.com.cn/world" shape="rect" target="_top">WORLD</a>
[]
<a href="//www.chinadaily.com.cn/business" shape="rect" target="_top">BUSINESS</a>
[]
<a href="//www.chinadaily.com.cn/life" shape="rect" target="_top">LIFESTYLE</a>
[]
<a href="//www.chinadaily.com.cn/culture" shape="rect" ta

In [30]:
# 长度如果很小句子中的单词数量就会很少，故选取长度大于等于30的句子
sentence_set = []
for item in source_code.find_all('a'):
    tmp = item.string
    if tmp and len(tmp) >= 30:
        sentence_set.append(tmp)

In [31]:
sentence_set

['Xi: Nation can win battle against novel coronavirus',
 "Impressions of battle's front line",
 'Disinfection robots put to work fighting COVID-19',
 'First 28 patients discharged from makeshift hospital',
 'Xi: Nation can win battle against novel coronavirus',
 "Impressions of battle's front line",
 'Disinfection robots put to work fighting COVID-19',
 'First 28 patients discharged from makeshift hospital',
 'Xi says nation can win total victory in NCP fight',
 'Fewer new infections, more cured',
 'Law change aims to further protect wildlife',
 'Epidemic impact on China to be limited: Global firms',
 'Ministry advises online classes not to overburden students',
 "Fujian aids Yichang's epidemic controls in Hubei",
 'First 28 patients discharged from makeshift hospital',
 'Medical teams around China support Hubei',
 'Amazing China: Star mountain on the Pamir Plateau',
 'Indonesian Ambassador: BRI is open and inclusive',
 "Official: Azerbaijan's businessmen eager to set up firms in China

In [44]:
sentence_set = list(set(sentence_set)) # 有的句子会存在重复，在这里去重

In [45]:
sentence_set

['2020 Spring Festival travel rush',
 'Smartphone makers endeavor to keep factory lines running',
 'Using AI to offer a new beginning',
 "Outbreak won't affect nation's goals for developing",
 'Cloud tools becoming popular with more employees',
 "Hunan's culture & tourism authority welcomes foreigners",
 'State Administration of Foreign Experts Affairs | ',
 'How to improve body and mind at home',
 'Full steam ahead for Kenyan railway',
 'Courts make use of online platforms amid epidemic',
 'Xi says nation can win total victory in NCP fight',
 'Intelligent robots join the virus battle',
 'Chinese communities in New Zealand rally to help motherland fight novel coronavirus',
 'Racist reports infect the truth with prejudice',
 'Andrew Yang to suspend bid for Democratic presidential nomination',
 'Infographics: NHC guides on how to choose masks',
 'Annual Bali Spirit Festival to return in late March',
 'Online loan sharks losing their bite',
 'Virus-hit city shuts down local transportation

In [38]:
def generate_wordlist_from_sentence(sentence):
    """
    :param sentence: sentence应为str类型，将字符串sentence根据分隔符分割为多个单词，且将除英文字母和数字之外的字符都看成分隔符
    :return: 返回一个包含了多个单词的list
    """
    ans = []
    lst = 0
    for i, ch in enumerate(sentence):
        if not ch.isalnum():
            if lst < i:
                ans.append(sentence[lst:i].lower())
            lst = i+1
    if ch and ch.isalnum():
        ans.append(sentence[lst:i+1].lower())
    return ans

In [39]:
generate_wordlist_from_sentence(sentence_set[0])

['xi', 'nation', 'can', 'win', 'battle', 'against', 'novel', 'coronavirus']

In [48]:
#对不同的单词的数量做预统计
word_set = set()
for sentence in sentence_set:
    word_list = generate_wordlist_from_sentence(sentence)
    print(word_list)
    for word in word_list:
        word_set.add(word)

['2020', 'spring', 'festival', 'travel', 'rush']
['smartphone', 'makers', 'endeavor', 'to', 'keep', 'factory', 'lines', 'running']
['using', 'ai', 'to', 'offer', 'a', 'new', 'beginning']
['outbreak', 'won', 't', 'affect', 'nation', 's', 'goals', 'for', 'developing']
['cloud', 'tools', 'becoming', 'popular', 'with', 'more', 'employees']
['hunan', 's', 'culture', 'tourism', 'authority', 'welcomes', 'foreigners']
['state', 'administration', 'of', 'foreign', 'experts', 'affairs']
['how', 'to', 'improve', 'body', 'and', 'mind', 'at', 'home']
['full', 'steam', 'ahead', 'for', 'kenyan', 'railway']
['courts', 'make', 'use', 'of', 'online', 'platforms', 'amid', 'epidemic']
['xi', 'says', 'nation', 'can', 'win', 'total', 'victory', 'in', 'ncp', 'fight']
['intelligent', 'robots', 'join', 'the', 'virus', 'battle']
['chinese', 'communities', 'in', 'new', 'zealand', 'rally', 'to', 'help', 'motherland', 'fight', 'novel', 'coronavirus']
['racist', 'reports', 'infect', 'the', 'truth', 'with', 'prejudic

In [47]:
word_set

{'000',
 '19',
 '2020',
 '24',
 '27',
 '28',
 '5',
 '5g',
 'a',
 'abbas',
 'administration',
 'admission',
 'advantages',
 'advises',
 'affairs',
 'affect',
 'against',
 'age',
 'ahead',
 'ai',
 'aids',
 'aims',
 'amazing',
 'ambassador',
 'ambassadors',
 'amid',
 'ancient',
 'and',
 'andrew',
 'animal',
 'annual',
 'anti',
 'areas',
 'around',
 'as',
 'asia',
 'at',
 'authority',
 'auto',
 'azerbaijan',
 'bali',
 'balkans',
 'ban',
 'barazite',
 'battered',
 'battle',
 'be',
 'beat',
 'becoming',
 'before',
 'begin',
 'beginning',
 'belgian',
 'bid',
 'bite',
 'bloom',
 'blue',
 'body',
 'book',
 'boom',
 'boredom',
 'breakout',
 'bri',
 'bucolic',
 'builds',
 'business',
 'businessmen',
 'calendar',
 'cambodia',
 'can',
 'candy',
 'cash',
 'celtics',
 'champions',
 'change',
 'cheer',
 'chicago',
 'children',
 'china',
 'chinese',
 'choice',
 'choir',
 'choose',
 'city',
 'clarifies',
 'classes',
 'cloud',
 'color',
 'communities',
 'composes',
 'confidence',
 'consumption',
 'contro

In [100]:
len(word_set) # 单词数太多，所以只取前20个出现次数最多的单词

20

In [74]:
# 统计每个单词出现的次数
word_count = {}
for sentence in sentence_set:
    word_list = generate_wordlist_from_sentence(sentence)
    for word in word_list:
        word_count[word] = word_count.get(word, 0)+1

In [75]:
word_count

{'2020': 1,
 'spring': 1,
 'festival': 2,
 'travel': 1,
 'rush': 1,
 'smartphone': 1,
 'makers': 1,
 'endeavor': 1,
 'to': 24,
 'keep': 1,
 'factory': 1,
 'lines': 1,
 'running': 1,
 'using': 1,
 'ai': 1,
 'offer': 2,
 'a': 4,
 'new': 4,
 'beginning': 1,
 'outbreak': 2,
 'won': 1,
 't': 1,
 'affect': 1,
 'nation': 3,
 's': 12,
 'goals': 1,
 'for': 9,
 'developing': 1,
 'cloud': 1,
 'tools': 1,
 'becoming': 1,
 'popular': 1,
 'with': 5,
 'more': 2,
 'employees': 1,
 'hunan': 1,
 'culture': 2,
 'tourism': 1,
 'authority': 1,
 'welcomes': 1,
 'foreigners': 1,
 'state': 1,
 'administration': 1,
 'of': 10,
 'foreign': 1,
 'experts': 1,
 'affairs': 1,
 'how': 3,
 'improve': 1,
 'body': 1,
 'and': 4,
 'mind': 1,
 'at': 3,
 'home': 3,
 'full': 1,
 'steam': 1,
 'ahead': 1,
 'kenyan': 1,
 'railway': 1,
 'courts': 1,
 'make': 1,
 'use': 1,
 'online': 5,
 'platforms': 1,
 'amid': 3,
 'epidemic': 6,
 'xi': 2,
 'says': 1,
 'can': 2,
 'win': 3,
 'total': 1,
 'victory': 1,
 'in': 14,
 'ncp': 1,
 'figh

In [76]:
sorted(word_count.items(), key = lambda item: item[1], reverse=True)

[('to', 24),
 ('in', 14),
 ('s', 12),
 ('china', 11),
 ('of', 10),
 ('for', 9),
 ('the', 7),
 ('epidemic', 6),
 ('with', 5),
 ('online', 5),
 ('fight', 5),
 ('virus', 5),
 ('coronavirus', 5),
 ('on', 5),
 ('as', 5),
 ('a', 4),
 ('new', 4),
 ('and', 4),
 ('up', 4),
 ('from', 4),
 ('nation', 3),
 ('how', 3),
 ('at', 3),
 ('home', 3),
 ('amid', 3),
 ('win', 3),
 ('battle', 3),
 ('help', 3),
 ('their', 3),
 ('local', 3),
 ('students', 3),
 ('against', 3),
 ('time', 3),
 ('support', 3),
 ('children', 3),
 ('festival', 2),
 ('offer', 2),
 ('outbreak', 2),
 ('more', 2),
 ('culture', 2),
 ('xi', 2),
 ('can', 2),
 ('robots', 2),
 ('chinese', 2),
 ('novel', 2),
 ('masks', 2),
 ('during', 2),
 ('exhibitions', 2),
 ('beat', 2),
 ('boredom', 2),
 ('stress', 2),
 ('part', 2),
 ('be', 2),
 ('global', 2),
 ('firms', 2),
 ('change', 2),
 ('first', 2),
 ('hospital', 2),
 ('is', 2),
 ('hubei', 2),
 ('giants', 2),
 ('ramp', 2),
 ('us', 2),
 ('song', 2),
 ('it', 2),
 ('times', 2),
 ('expats', 2),
 ('animal

In [82]:
# 取出现次数最多的前20个单词作为vocabulary，如果单词不足20个，则取全部
word_set = set()
for i, item in enumerate(sorted(word_count.items(), key = lambda item: item[1], reverse=True)):
    if i >= 20:
        break
    word_set.add(item[0])

In [83]:
word_set

{'a',
 'and',
 'as',
 'china',
 'coronavirus',
 'epidemic',
 'fight',
 'for',
 'from',
 'in',
 'new',
 'of',
 'on',
 'online',
 's',
 'the',
 'to',
 'up',
 'virus',
 'with'}

In [92]:
# 统计每个句子中每个单词出现的次数，并过滤掉不在vocabulary中的单词，将剩下的单词的出现次数组合为这个句子的向量表示
representations = []
for i, sentence in enumerate(sentence_set):
    representations.append([0]*len(word_set))
    word_list = generate_wordlist_from_sentence(sentence)
    word_count_tmp = {}
    
    for word in word_list:
        word_count_tmp[word] = word_count_tmp.get(word, 0)+1

    j = 0
    for item in word_count_tmp.keys():
        if item in word_set:
            representations[i][j] = word_count_tmp[item]
            j += 1

In [93]:
import pandas as pd

In [97]:
sentence_set

['2020 Spring Festival travel rush',
 'Smartphone makers endeavor to keep factory lines running',
 'Using AI to offer a new beginning',
 "Outbreak won't affect nation's goals for developing",
 'Cloud tools becoming popular with more employees',
 "Hunan's culture & tourism authority welcomes foreigners",
 'State Administration of Foreign Experts Affairs | ',
 'How to improve body and mind at home',
 'Full steam ahead for Kenyan railway',
 'Courts make use of online platforms amid epidemic',
 'Xi says nation can win total victory in NCP fight',
 'Intelligent robots join the virus battle',
 'Chinese communities in New Zealand rally to help motherland fight novel coronavirus',
 'Racist reports infect the truth with prejudice',
 'Andrew Yang to suspend bid for Democratic presidential nomination',
 'Infographics: NHC guides on how to choose masks',
 'Annual Bali Spirit Festival to return in late March',
 'Online loan sharks losing their bite',
 'Virus-hit city shuts down local transportation

In [99]:
pd.DataFrame(representations, index = sentence_set)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
2020 Spring Festival travel rush,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Smartphone makers endeavor to keep factory lines running,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Using AI to offer a new beginning,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Outbreak won't affect nation's goals for developing,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cloud tools becoming popular with more employees,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Hunan's culture & tourism authority welcomes foreigners,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
State Administration of Foreign Experts Affairs |,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
How to improve body and mind at home,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Full steam ahead for Kenyan railway,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Courts make use of online platforms amid epidemic,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
