In [2]:
# 載入檔案

import pandas as pd
import numpy as np

# Read the news CSV, discard the 'Unnamed: 0' column, drop every row that
# has a missing value, and renumber the rows from 0.
df = (
    pd.read_csv('news/2330.csv')
    .drop(columns=['Unnamed: 0'])
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)
df

Unnamed: 0,time,context
0,2023-11-24 13:36,美股休市，台股今（24日）開高走低，終場小跌7點來到17287.42點，跌幅0.04％，成交...
1,2023-11-23 13:43,美國股市周三（22日）收漲，但台股繼昨日大跌後，今（23日）開低走低，終場小跌16點，173...
2,2023-11-23 09:02,美股周三（22日）收漲，台股今（23）日以下跌18.93點、17291.33點開出，跌勢收斂...
3,2023-11-23 02:58,很多人都說在「護國神山」台積電上班很操、很累，不過一名在台積電工作2年的員工發文透露，他這2...
4,2023-11-22 13:37,由於美國股市表現不佳，台股也受到拖累，今（22日）台股開低走低，終場大跌106點來到1731...
5,2023-11-22 09:03,美股周二（21日）收黑，台股今（22）日以下跌91.10點、17325.60點開出，指數拉回...
6,2023-11-21 17:32,今股匯雙漲，外資回頭搶買台股不手軟！經統計今天外資再度大動作買超台股421億元，排名史上第1...
7,2023-11-21 14:29,美系外資搶在NVIDIA於美國時間21日盤後公布財報前出具有關AI的最新報告，預期對中國的出...
8,2023-11-21 13:38,受到美國股市上揚表現激勵，台股也有不錯表現，今（21日）台股開高走高，終場大漲206點來到1...
9,2023-11-21 09:02,美股收漲，台股今（21）日以上漲29.47點、17239.94點開出，漲勢擴大，指數上漲逾百...


In [3]:
# 句子分割

import jieba.analyse

# Concatenate every line of the stop-word file (trailing whitespace removed)
# into a single string; jiebaL uses it for `in` membership checks.
# NOTE(review): no explicit encoding is passed, so decoding depends on the
# platform default, whereas the sentiment word lists are opened with
# encoding='utf-8' — confirm the file's encoding and make it explicit.
with open('stopwords.txt') as stopword_file:
    stopwords = "".join(line.rstrip() for line in stopword_file)

def jiebaL(text:str, stopwords=stopwords):
    """Mask stop-word characters in ``text`` and segment it with jieba.

    Args:
        text: The string to segment.
        stopwords: Container tested with ``in`` for every character of
            ``text``; defaults to the module-level ``stopwords`` string.

    Returns:
        The list of tokens returned by ``jieba.lcut`` for the masked text.

    NOTE(review): the test is applied per character, and the default
    ``stopwords`` is one concatenated string, so any character that occurs
    anywhere in a stop word is blanked — confirm this is intended rather
    than filtering whole tokens after segmentation.
    """
    # Every character found in `stopwords` becomes a single space; all other
    # characters are kept unchanged.
    masked = ''.join(' ' if ch in stopwords else ch for ch in text)

    # Segment the masked text with jieba.
    return jieba.lcut(masked)

In [5]:
# 情緒分析

# 來源:https://github.com/sweslo17/chinese_sentiment/blob/master/dict/ntusd-positive.txt
# Load the positive and negative word lists, one whitespace-stripped entry
# per line. The `with` blocks close each file handle as soon as it has been
# read instead of leaving it open for the garbage collector.
# The spelling `postive` is kept because sentiment() binds that name as a
# default argument.
with open('ntusd-positive.txt', 'r', encoding='utf-8') as positive_file:
    postive = [line.strip() for line in positive_file]
with open('ntusd-negative.txt', 'r', encoding='utf-8') as negative_file:
    negative = [line.strip() for line in negative_file]

def sentiment(words:list, postive=postive, negative=negative):
    """Count how many tokens occur in the positive and negative word lists.

    Args:
        words: Tokens to score. Repeated tokens are counted once per
            occurrence.
        postive: Collection of positive words (hashable entries); defaults
            to the module-level ``postive`` list.
        negative: Collection of negative words (hashable entries); defaults
            to the module-level ``negative`` list.

    Returns:
        A tuple ``(pos, neg, wordLen)``: number of tokens found in
        ``postive``, number found in ``negative``, and ``len(words)``.
        A token present in both collections is counted in both totals.
    """
    wordLen = len(words)

    # Build hash sets once per call so each token lookup is O(1) instead of
    # a linear scan over the whole word list.
    positive_lookup = frozenset(postive)
    negative_lookup = frozenset(negative)

    pos = sum(1 for word in words if word in positive_lookup)
    neg = sum(1 for word in words if word in negative_lookup)

    return (pos, neg, wordLen)

In [6]:
def Run(df):
    """Score every article in ``df`` and return one row of counts per article.

    Args:
        df: DataFrame with a 'context' column (text to segment) and a
            'time' column (used as the index of the result).

    Returns:
        A DataFrame with columns 'positive', 'negative' and 'totalCount',
        indexed by ``df['time']``, in the same row order as ``df``.
    """
    sentiList = []
    # Iterate over the column values directly: this does not depend on the
    # frame having a 0..n-1 index (label lookup via df['context'][i] does)
    # and avoids reusing one loop variable for both rows and tokens.
    for text in df['context']:
        # jiebaL replaces stop-word characters with spaces; drop the tokens
        # that are exactly one space before counting.
        tokens = [token for token in jiebaL(text) if token != ' ']
        sentiList.append(sentiment(tokens))

    return pd.DataFrame(
        sentiList,
        columns=['positive', 'negative', 'totalCount'],
        index=df['time'],
    )

In [7]:
# Score every article in df; Run() returns the positive / negative /
# totalCount counts indexed by the article's 'time' value.
senti_df = Run(df)
print(senti_df)

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\user01\AppData\Local\Temp\jieba.cache
Loading model cost 0.772 seconds.
Prefix dict has been built successfully.


                  positive  negative  totalCount
time                                            
2023-11-24 13:36         0         1          67
2023-11-23 13:43         0         0          78
2023-11-23 09:02         0         4          83
2023-11-23 02:58        11         3         282
2023-11-22 13:37         1         3          92
2023-11-22 09:03         0         7          93
2023-11-21 17:32         0         0         126
2023-11-21 14:29        13         2         238
2023-11-21 13:38         2         0          77
2023-11-21 09:02         2         0          96
2023-11-20 13:34         1         0          72
2023-11-20 09:02        10         4         278
2023-11-18 11:14         2         4         239
2023-11-17 13:45         0         0          76
2023-11-17 09:04         0         6          81
2023-11-16 13:38         2         1          98
2023-11-16 09:48        12         6         410
2023-11-16 09:02         0         1          82
2023-11-15 16:32    

In [27]:
import time

# Convert the sentiment frame's own index (the 'time' values set by Run())
# to timestamps. In the cells shown, `df` is the news frame with a 0..n-1
# integer index, which pd.to_datetime would read as nanoseconds since
# 1970-01-01, so using df.index only works with leftover kernel state.
senti_df.index = pd.to_datetime(senti_df.index)

# Time-of-day boundaries of each daily window. Only the time part of these
# timestamps is used (pd.to_datetime('13:30') attaches today's date).
start_time = pd.to_datetime('13:30')
end_time = pd.to_datetime('13:30')
start_offset = start_time - start_time.normalize()
end_offset = end_time - end_time.normalize()

score_columns = ['positive', 'negative', 'totalCount']

# One output row per calendar day between the earliest and latest article.
all_days = pd.date_range(start=senti_df.index.min().date(), end=senti_df.index.max().date())

# For day D, sum every article with D + start <= time < (D + 1 day) + end.
# Totals are collected in a list and turned into a DataFrame once, instead
# of growing a DataFrame with pd.concat on every iteration (which is
# quadratic and leaves the columns with object dtype).
# NOTE(review): articles earlier than `start_time` on the first day fall
# into no window and are not counted — confirm that is intended.
daily_totals = []
for day in all_days:
    window_start = day + start_offset
    window_end = day + pd.Timedelta(days=1) + end_offset
    in_window = (senti_df.index >= window_start) & (senti_df.index < window_end)
    daily_totals.append(senti_df.loc[in_window, score_columns].sum().to_dict())

# Rows are labelled with the day on which their window starts.
result_df = pd.DataFrame(daily_totals, columns=score_columns, index=all_days)

print(result_df)

           positive negative totalCount
2023-11-09       71       40       1923
2023-11-10        8        4        395
2023-11-11        0        0          0
2023-11-12       41       18       1486
2023-11-13        1        0         77
2023-11-14        6        7        380
2023-11-15       33       18       1197
2023-11-16        2        7        179
2023-11-17        2        4        315
2023-11-18        0        0          0
2023-11-19       10        4        278
2023-11-20        3        0        168
2023-11-21       15        9        534
2023-11-22       12       10        457
2023-11-23        0        0         78
2023-11-24        0        1         67
