# 載入相關套件 

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import csv
import datetime
import torch
import monpa
from monpa import utils
from collections import Counter
import re

+---------------------------------------------------------------------+
  Welcome to MONPA: Multi-Objective NER POS Annotator for Chinese
+---------------------------------------------------------------------+
已找到 model檔。Found model file.


* monpa切詞，可以使toarch gpu


In [3]:
torch.cuda.is_available()

True

In [4]:
def print_gpu_memory():
    total_memory = torch.cuda.get_device_properties(0).total_memory/1024/1024/1024
    reversed_memory = torch.cuda.memory_reserved(0)/1024/1024
    allocated_memory = torch.cuda.memory_allocated(0)/1024/1024
    free_memory = reversed_memory - allocated_memory/1024/1024
    print(f"有沒有 GPU 可以使用：{torch.cuda.is_available()}\nGPU 所有記憶體: {total_memory} MB\n預先保留作為執行程序使用的記憶體: {reversed_memory} MB\n程式執行使用的記憶體: {allocated_memory} MB\n預先保留的記憶體剩下空間: {free_memory} MB")
    
print_gpu_memory()

有沒有 GPU 可以使用：True
GPU 所有記憶體: 7.99951171875 MB
預先保留作為執行程序使用的記憶體: 0.0 MB
程式執行使用的記憶體: 0.0 MB
預先保留的記憶體剩下空間: 0.0 MB


In [5]:
#定義相關要使用的文件、欄位
news_list = [('bda2023_mid_news_2022-2023.csv' ,'bda2023_mid_news_2021.csv','bda2023_mid_news_2020.csv'),
             ('bda2023_mid_bbs_2022-2023.csv'  ,'bda2023_mid_bbs_2019-2021.csv'),
             ('bda2023_mid_forum_2020.csv','bda2023_mid_forum_2021.csv','bda2023_mid_forum_2022-2023.csv')]
news_name = ['mid_news','mid_bbs','mid_forum']

usecols = ['post_time', 'title', 'content']

In [6]:
#讀取stopword_zh
with open('./input/stopwords_zh.txt', 'r', encoding='utf-8') as file:
    stopwords = [line.strip() for line in file]

In [10]:
#消除停用辭
def clearSentence(sentence, stopwords):
    for word in stopwords:
        sentence = sentence.replace(word, '')
    return sentence

In [11]:
#由word2vec所產生與陽明最相近的
#7個關鍵詞，做為文章的篩選

keywords = ['貨船','船隻','船舶','雜貨輪','貨櫃船','陽明','海運']
pattern = '|'.join(keywords)

In [12]:
def word_tokenization(news_data):
    print("開始時間:",datetime.datetime.now())
    
    #切斷句,跟所有文字
    #先做斷句存成二維陣列
    news_data["chunk"] = news_data["content"].map(utils.short_sentence)
    
    #clearstence
    news_data["chunk"] = news_data["chunk"].apply(lambda x: [clearSentence(y, stopwords) for y in x])
    
    #新增chunk_list
    chunk_list = []
    
    news_data["chunk"].map(chunk_list.extend)
    news_data["chunk_len"] = news_data["chunk"].map(len)
    
    #批次處理500行
    cut_batch_list = []
    process_num = 500
    for item_idx in range(0, len(chunk_list), process_num):
        if item_idx < len(chunk_list):
            print(item_idx+process_num)
            cut_batch_list.extend(monpa.cut_batch(chunk_list[item_idx:item_idx+process_num]))
            
        else:
            cut_batch_list.extend(monpa.cut_batch(chunk_list[item_idx:]))
    
    #把切好的辭放回news_data的dataframe
    cut_batch = []
    for i in news_data["chunk_len"].tolist():
        cut_batch.append(cut_batch_list[:i])
        cut_batch_list = cut_batch_list[i:]
    news_data["cut_batch_c"] = cut_batch 
    
    #把每個辭轉換為一個str由" "坐間隔
    new_list = []
    for lst in news_data["cut_batch_c"]:
        new_str = ""
        for sublst in lst:
            for term in sublst:
                term=term.strip() #去除前後多餘空白
                if(len(term)>1): 
                    new_str = new_str+' '+term
        new_list.append(new_str)
            
       
    news_data["content"] = new_list
    news_data = news_data.drop(columns=['title','chunk','chunk_len','cut_batch_c'])
    print("結束時間:",datetime.datetime.now())
    
    return news_data

# 切辭

In [13]:
%%time
## monpa.cut_batch (推薦) 批次切

#使用gpu 來切辭
monpa.use_gpu(True)
# 讀取並連接所有文件
for new_list,name in zip(news_list,news_name):
    
    # 初始化一個空的 DataFrame 作為連接news
    news_data = pd.DataFrame()
    for file_name in new_list:
        print('file',file_name)
        temp_df = pd.read_csv('./input/'+file_name,encoding='utf-8-sig',usecols = usecols)
        news_data = pd.concat([news_data, temp_df], ignore_index=True)   
    
    #只留年月日
    news_data['post_time'] = pd.to_datetime(news_data['post_time'])
    news_data.sort_values(by='post_time', inplace = True)
    news_data['post_time'] = pd.to_datetime(news_data['post_time']).dt.date
    
    #空值補 ""空字串 
    news_data['title'] = news_data['title'].fillna('')
    news_data['content'] = news_data['content'].fillna('')
    
    #合併title跟content唯一欄
    news_data['content'] = news_data['title'] +" "+ news_data['content']
    news_data = news_data[news_data['content'].str.contains(pattern)].reset_index(drop = True)
    print(news_data.info())
    news_data = word_tokenization(news_data)
    

    # 將 DataFrame 保存為 CSV 檔案
    news_data.to_csv(f'./output/{name}_OK.csv', index=False, encoding='utf-8-sig')

file bda2023_mid_news_2022-2023.csv
file bda2023_mid_news_2021.csv
file bda2023_mid_news_2020.csv
<class 'pandas.core.frame.DataFrame'>
Index: 631837 entries, 474792 to 293953
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   post_time  631837 non-null  object
 1   title      631837 non-null  object
 2   content    631837 non-null  object
dtypes: object(3)
memory usage: 19.3+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15424 entries, 0 to 15423
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   post_time  15424 non-null  object
 1   title      15424 non-null  object
 2   content    15424 non-null  object
dtypes: object(3)
memory usage: 361.6+ KB
None
開始時間: 2023-04-20 01:11:04.460574
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
155