# This notebook scrapes hashtags from the main pages of brands on Weibo

In [1]:
# Necessary installations.
# %pip (rather than !pip) installs into the environment of the running kernel,
# and the version is pinned to the release this notebook was last executed with.
%pip install weibo-scraper==1.0.6

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting weibo-scraper
  Downloading weibo_scraper-1.0.6-py2.py3-none-any.whl (8.5 kB)
Installing collected packages: weibo-scraper
Successfully installed weibo-scraper-1.0.6


In [2]:
from weibo_scraper import get_formatted_weibo_tweets_by_name
from weibo_scraper import  get_weibo_tweets
import re
import pandas as pd
from tqdm import tqdm

In [10]:
# Brand lookup tables: English brand label -> Chinese Weibo account name,
# and the reverse mapping derived from it so the two can never drift apart.
data_dict_en = {"BMW": "宝马中国"}
data_dict_cn = {cn_name: en_name for en_name, cn_name in data_dict_en.items()}

In [24]:
def post_scraper(brand_name: str, max_posts: int = 100) -> list:
    """
    post_scraper collects the text of posts from the main page of the brand in Weibo.

    It iterates over the results of ``get_formatted_weibo_tweets_by_name``
    (called with ``pages=None``) and stops as soon as ``max_posts`` posts have
    been collected or the iterator is exhausted.

    :param brand_name: exact name of the brand in Weibo.
    :param max_posts: maximum number of posts to collect; must be at least 1.
    :return: list of posts in html text format, at most ``max_posts`` long
    :raises ValueError: if ``max_posts`` is smaller than 1
    """
    if max_posts < 1:
        raise ValueError("max_posts must be at least 1")

    result_iterator = get_formatted_weibo_tweets_by_name(name=brand_name, pages=None)
    posts = []
    for user_meta in tqdm(result_iterator):
        if user_meta is None:
            continue
        for tweet_meta in user_meta.cards_node:
            posts.append(tweet_meta.mblog.text)
            # Check right after appending so exactly max_posts posts are kept
            # (the previous `cnt > 100` test let a 101st post through).
            if len(posts) >= max_posts:
                break
        if len(posts) >= max_posts:
            # The cap was reached inside the inner loop; stop reading further results.
            break

    # Fall back to the Weibo name itself when the brand has no English label,
    # so a missing dictionary entry cannot discard an already finished scrape.
    label = data_dict_cn.get(brand_name, brand_name)
    print(f'{label} has {len(posts)} number of posts')
    return posts
    

In [25]:
def hashtag_collector(posts: list) -> dict:
    """
    hashtag_collector extracts the ``#...#`` delimited hashtags of every post.

    A post contributes only when it contains a non-zero, even number of ``#``
    characters; the characters are then paired up in order of appearance and
    the text between each pair is taken as one hashtag. Posts with no ``#`` or
    an odd number of them are skipped entirely.

    :param posts: posts from the main page of the brand
    :return: dictionary mapping a running index (0, 1, ... over the posts that
             were kept, not over all posts) to that post's list of hashtags
    """
    tags_per_post = []
    for text in posts:
        marks = [hit.start() for hit in re.finditer('#', text)]
        # Skip posts without hashtags or with an unmatched '#'.
        if not marks or len(marks) % 2 != 0:
            continue
        openers, closers = marks[0::2], marks[1::2]
        tags_per_post.append([text[lo + 1:hi] for lo, hi in zip(openers, closers)])
    return {position: tags for position, tags in enumerate(tags_per_post)}

In [26]:
def container_id_extractor(posts: list) -> list:
    """
    container_id_extractor collects container_id of posts from the metadata text of posts.

    Every ``containerid=<value>`` occurrence in a post yields one entry, in
    order of appearance. The value ends at the next ``&`` or, when
    ``containerid`` is the last query parameter, at the quote, whitespace or
    angle bracket that closes the surrounding link, so no trailing HTML ends
    up inside the id.

    :param posts: posts extracted from Weibo (html text); non-string entries are skipped
    :return: list of container_ids of all posts (duplicates are kept)
    """
    container_list = []
    for post in posts:
        if not isinstance(post, str):
            # A post without text cannot contain an id; skip it instead of crashing.
            continue
        container_list.extend(re.findall(r'containerid=([^&"\'\s<>]*)', post))
    return container_list

In [27]:
def container_metadata_extractor(containers: list, brand: str) -> pd.DataFrame:
    """
    container_metadata_extractor collects metadata of the tweets found under each container_id.

    For every container id the tweets yielded by ``get_weibo_tweets`` (with
    ``pages=1``) are inspected. A tweet is kept only when its ``'mblog'`` entry
    contains all of ``created_at``, ``attitudes_count``, ``comments_count``,
    ``reposts_count`` and ``text``. The resulting table is printed and returned.

    :param containers: container_ids to query
    :param brand: label written into the ``Brand`` column of every row
    :return: DataFrame with the columns Brand, Created_at, Likes, Comments,
             Reposts, Content and Container_ID (one row per kept tweet)
    """
    columns = ['Brand', 'Created_at', 'Likes', 'Comments', 'Reposts', 'Content', 'Container_ID']
    required_fields = ('created_at', 'attitudes_count', 'comments_count', 'reposts_count', 'text')

    records = []
    for c_id in tqdm(containers):
        for tweet in get_weibo_tweets(tweet_container_id=c_id, pages=1):
            try:
                mblog = tweet['mblog']
                complete = all(field in mblog for field in required_fields)
            except (KeyError, TypeError):
                # Only the "no usable 'mblog' entry" case is tolerated; anything else
                # (including KeyboardInterrupt) is no longer silently swallowed.
                print("tweet does not have attributes")
                continue

            if not complete:
                continue

            records.append({
                'Brand': brand,
                'Created_at': mblog['created_at'],
                'Likes': mblog['attitudes_count'],
                'Comments': mblog['comments_count'],
                'Reposts': mblog['reposts_count'],
                'Content': mblog['text'],
                'Container_ID': c_id,
            })

            if len(records) % 100 == 0:
                print(f'processed {len(records)} tweets')
                # NOTE(review): this break only abandons the remaining tweets of the
                # current container; later containers are still processed. Behavior
                # kept from the original - confirm whether a global cap was intended.
                break

    # Passing the column list keeps the header (and its order) even when no tweet was kept.
    df = pd.DataFrame(records, columns=columns)
    print(df)

    return df

In [28]:
# Brands to process; each entry must be a key of data_dict_en.
accs = ["BMW"]
for acc in accs:
    # Scrape the brand's main page using its Chinese Weibo account name.
    posts = post_scraper(data_dict_en[acc])
    # Pull the container ids out of the post html, then fetch tweet metadata for them.
    containers = container_id_extractor(posts)
    df = container_metadata_extractor(containers, acc)
    # One spreadsheet per brand, written next to the notebook.
    df.to_excel(f"{acc}.xlsx", index=False)
    print(f"Completed processing {acc} data")

8it [00:16,  2.11s/it]


BMW has 80 number of posts


  1%|▏         | 1/77 [00:01<01:19,  1.05s/it]

tweet does not have attributes
tweet does not have attributes


  3%|▎         | 2/77 [00:02<01:21,  1.08s/it]

tweet does not have attributes
tweet does not have attributes


  5%|▌         | 4/77 [00:04<01:25,  1.17s/it]

tweet does not have attributes
tweet does not have attributes


 10%|█         | 8/77 [00:10<01:33,  1.35s/it]

tweet does not have attributes
tweet does not have attributes


 13%|█▎        | 10/77 [00:13<01:29,  1.33s/it]

tweet does not have attributes
tweet does not have attributes


 14%|█▍        | 11/77 [00:14<01:27,  1.33s/it]

tweet does not have attributes
tweet does not have attributes


 18%|█▊        | 14/77 [00:18<01:26,  1.38s/it]

tweet does not have attributes
tweet does not have attributes


 22%|██▏       | 17/77 [00:23<01:21,  1.36s/it]

tweet does not have attributes
tweet does not have attributes


 23%|██▎       | 18/77 [00:24<01:14,  1.26s/it]

tweet does not have attributes
tweet does not have attributes


 26%|██▌       | 20/77 [00:26<01:12,  1.28s/it]

tweet does not have attributes
tweet does not have attributes


 27%|██▋       | 21/77 [00:27<01:07,  1.20s/it]

tweet does not have attributes
tweet does not have attributes


 29%|██▊       | 22/77 [00:28<01:03,  1.16s/it]

tweet does not have attributes
tweet does not have attributes


 30%|██▉       | 23/77 [00:29<01:00,  1.12s/it]

tweet does not have attributes
tweet does not have attributes


 35%|███▌      | 27/77 [00:36<01:09,  1.38s/it]

tweet does not have attributes
tweet does not have attributes


 64%|██████▎   | 49/77 [01:09<00:44,  1.60s/it]

tweet does not have attributes
tweet does not have attributes


 66%|██████▌   | 51/77 [01:13<00:43,  1.67s/it]

tweet does not have attributes
tweet does not have attributes


 70%|███████   | 54/77 [01:17<00:34,  1.49s/it]

tweet does not have attributes
tweet does not have attributes


 77%|███████▋  | 59/77 [01:24<00:23,  1.31s/it]

tweet does not have attributes
tweet does not have attributes


 78%|███████▊  | 60/77 [01:25<00:21,  1.24s/it]

tweet does not have attributes
tweet does not have attributes


 82%|████████▏ | 63/77 [01:29<00:19,  1.42s/it]

processed 100 tweets


100%|██████████| 77/77 [01:50<00:00,  1.43s/it]


    Brand                      Created_at  Likes  Comments  Reposts  \
0     BMW  Fri Mar 03 10:00:02 +0800 2023     87        18       19   
1     BMW  Wed Mar 02 15:33:05 +0800 2022    111        14       36   
2     BMW  Wed Jan 11 02:06:25 +0800 2023    163        55       74   
3     BMW  Sat Jan 14 14:40:35 +0800 2023     23         6       15   
4     BMW  Mon Feb 27 10:04:43 +0800 2023   1275        64      157   
..    ...                             ...    ...       ...      ...   
118   BMW  Sat Dec 31 21:38:31 +0800 2022  11148       475       28   
119   BMW  Mon Dec 19 10:00:03 +0800 2022   7516       248      472   
120   BMW  Mon Dec 19 10:10:02 +0800 2022     47         9       15   
121   BMW  Mon Feb 20 11:20:03 +0800 2023   1031       139      604   
122   BMW  Sun Feb 19 10:00:04 +0800 2023     69         7      190   

                                               Content  \
0    <a  href="https://m.weibo.cn/search?containeri...   
1    <a  href="https://m.weibo.