In [3]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# === Configuration ===
# Folder that is scanned for the input CSV files; change it to your own folder name.
data_folder = "data/tariff_data_en"
# Ordinal score per sentiment label: negative -> 0, neutral -> 1, positive -> 2.
sentiment_map = {
    label: score
    for score, label in enumerate(("negative", "neutral", "positive"))
}
# Single VADER analyzer instance, created once at module level.
analyzer = SentimentIntensityAnalyzer()

# === Define the feature-extraction function ===
def extract_day_features(df, date_str):
    """Aggregate one day's tweets into a single dictionary of daily features.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets for one day. Must contain a "Tweet Content" column (tweet text)
        and a "sentiment" column. The labels "negative", "neutral" and
        "positive" are the ones counted for the proportions; labels are also
        mapped to numbers through the module-level ``sentiment_map``.
        The input frame is not modified (a copy is used).
    date_str : str
        Date of the tweets, formatted as "%Y-%m-%d".

    Returns
    -------
    dict
        Keys, in order: "date", "num_tweets", "avg_length", "avg_num_words",
        "avg_hashtags", "avg_mentions", "avg_uppercase_ratio",
        "prop_negative", "prop_neutral", "prop_positive",
        "vader_avg_compound", "vader_std_compound", "vader_max_compound",
        "vader_min_compound", "avg_sentiment_score", "day_of_week"
        (Monday == 0) and "is_weekend" (1 for Saturday/Sunday, else 0).

    Raises
    ------
    KeyError
        If "Tweet Content" or "sentiment" is missing from ``df``.
    ValueError
        If ``date_str`` does not match "%Y-%m-%d".
    """
    # Parse the date once, up front, so a bad date fails before any scoring work.
    weekday = datetime.strptime(date_str, "%Y-%m-%d").weekday()

    df = df.copy()

    # A missing tweet must count as empty text. astype(str) alone turns NaN
    # into the literal string "nan" (3 characters, 1 word), which would skew
    # the length and word-count averages.
    raw_text = df["Tweet Content"]
    text = raw_text.astype(str).where(raw_text.notna(), "")

    df["sentiment_score"] = df["sentiment"].map(sentiment_map)
    df["length"] = text.apply(len)
    df["num_words"] = text.apply(lambda x: len(x.split()))
    df["num_hashtags"] = text.str.count("#")
    df["num_mentions"] = text.str.count("@")
    # Share of uppercase characters; empty text yields 0 instead of dividing by zero.
    df["uppercase_ratio"] = text.apply(
        lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df["vader_compound"] = text.apply(
        lambda x: analyzer.polarity_scores(x)["compound"])

    total = len(df)

    def _share(label):
        # Fraction of rows carrying this sentiment label; 0 for an empty day.
        return (df["sentiment"] == label).sum() / total if total else 0

    return {
        "date": date_str,
        "num_tweets": total,
        "avg_length": df["length"].mean(),
        "avg_num_words": df["num_words"].mean(),
        "avg_hashtags": df["num_hashtags"].mean(),
        "avg_mentions": df["num_mentions"].mean(),
        "avg_uppercase_ratio": df["uppercase_ratio"].mean(),
        "prop_negative": _share("negative"),
        "prop_neutral": _share("neutral"),
        "prop_positive": _share("positive"),
        "vader_avg_compound": df["vader_compound"].mean(),
        "vader_std_compound": df["vader_compound"].std(),
        "vader_max_compound": df["vader_compound"].max(),
        "vader_min_compound": df["vader_compound"].min(),
        "avg_sentiment_score": df["sentiment_score"].mean(),
        "day_of_week": weekday,
        "is_weekend": 1 if weekday >= 5 else 0
    }

# === Read every CSV file and process it ===
feature_rows = []
for filename in sorted(os.listdir(data_folder)):
    # Only CSV files are processed; everything else in the folder is skipped.
    if not filename.endswith(".csv"):
        continue
    filepath = os.path.join(data_folder, filename)
    try:
        df = pd.read_csv(filepath)
        # Infer the date from the file name.
        date_str = filename.replace("tariff_data_", "").replace(".csv", "")
        if "-" not in date_str:  # compact form such as 20250403
            date_str = "-".join((date_str[:4], date_str[4:6], date_str[6:]))
        features = extract_day_features(df, date_str)
        feature_rows.append(features)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"❌ Error reading {filename}: {e}")

# === Build the result DataFrame ===
features_df = pd.DataFrame(feature_rows)
print("✅ 萃取完成，以下是特徵預覽：")
print(features_df.head())

# === Optional: save as CSV ===
features_df.to_csv("daily_sentiment_features.csv", index=False)


✅ 萃取完成，以下是特徵預覽：
         date  num_tweets  avg_length  avg_num_words  avg_hashtags  \
0  2025-03-01          57  186.807018      27.807018      0.526316   
1  2025-03-02         150  183.986667      27.406667      0.386667   
2  2025-03-03         205  163.819512      25.600000      0.165854   
3  2025-03-04         215  178.246512      27.265116      0.344186   
4  2025-03-05         216  172.273148      26.629630      0.435185   

   avg_mentions  avg_uppercase_ratio  prop_negative  prop_neutral  \
0      0.157895             0.061645       0.614035      0.157895   
1      0.153333             0.072549       0.546667      0.173333   
2      0.112195             0.067358       0.478049      0.234146   
3      0.162791             0.066337       0.460465      0.167442   
4      0.236111             0.061758       0.393519      0.324074   

   prop_positive  vader_avg_compound  vader_std_compound  vader_max_compound  \
0       0.228070           -0.200100            0.484145            

In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# === Configuration ===
# Folder that is scanned for the input CSV files; change it to your own folder name.
data_folder = "data/tariff_data_en_test"
# Ordinal score per sentiment label: negative -> 0, neutral -> 1, positive -> 2.
sentiment_map = {
    label: score
    for score, label in enumerate(("negative", "neutral", "positive"))
}
# Single VADER analyzer instance, created once at module level.
analyzer = SentimentIntensityAnalyzer()

# === Define the feature-extraction function ===
def extract_day_features(df, date_str):
    """Aggregate one day's tweets into a single dictionary of daily features.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets for one day. Must contain a "Tweet Content" column (tweet text)
        and a "sentiment" column. The labels "negative", "neutral" and
        "positive" are the ones counted for the proportions; labels are also
        mapped to numbers through the module-level ``sentiment_map``.
        The input frame is not modified (a copy is used).
    date_str : str
        Date of the tweets, formatted as "%Y-%m-%d".

    Returns
    -------
    dict
        Keys, in order: "date", "num_tweets", "avg_length", "avg_num_words",
        "avg_hashtags", "avg_mentions", "avg_uppercase_ratio",
        "prop_negative", "prop_neutral", "prop_positive",
        "vader_avg_compound", "vader_std_compound", "vader_max_compound",
        "vader_min_compound", "avg_sentiment_score", "day_of_week"
        (Monday == 0) and "is_weekend" (1 for Saturday/Sunday, else 0).

    Raises
    ------
    KeyError
        If "Tweet Content" or "sentiment" is missing from ``df``.
    ValueError
        If ``date_str`` does not match "%Y-%m-%d".
    """
    # Parse the date once, up front, so a bad date fails before any scoring work.
    weekday = datetime.strptime(date_str, "%Y-%m-%d").weekday()

    df = df.copy()

    # A missing tweet must count as empty text. astype(str) alone turns NaN
    # into the literal string "nan" (3 characters, 1 word), which would skew
    # the length and word-count averages.
    raw_text = df["Tweet Content"]
    text = raw_text.astype(str).where(raw_text.notna(), "")

    df["sentiment_score"] = df["sentiment"].map(sentiment_map)
    df["length"] = text.apply(len)
    df["num_words"] = text.apply(lambda x: len(x.split()))
    df["num_hashtags"] = text.str.count("#")
    df["num_mentions"] = text.str.count("@")
    # Share of uppercase characters; empty text yields 0 instead of dividing by zero.
    df["uppercase_ratio"] = text.apply(
        lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df["vader_compound"] = text.apply(
        lambda x: analyzer.polarity_scores(x)["compound"])

    total = len(df)

    def _share(label):
        # Fraction of rows carrying this sentiment label; 0 for an empty day.
        return (df["sentiment"] == label).sum() / total if total else 0

    return {
        "date": date_str,
        "num_tweets": total,
        "avg_length": df["length"].mean(),
        "avg_num_words": df["num_words"].mean(),
        "avg_hashtags": df["num_hashtags"].mean(),
        "avg_mentions": df["num_mentions"].mean(),
        "avg_uppercase_ratio": df["uppercase_ratio"].mean(),
        "prop_negative": _share("negative"),
        "prop_neutral": _share("neutral"),
        "prop_positive": _share("positive"),
        "vader_avg_compound": df["vader_compound"].mean(),
        "vader_std_compound": df["vader_compound"].std(),
        "vader_max_compound": df["vader_compound"].max(),
        "vader_min_compound": df["vader_compound"].min(),
        "avg_sentiment_score": df["sentiment_score"].mean(),
        "day_of_week": weekday,
        "is_weekend": 1 if weekday >= 5 else 0
    }

# === Read every CSV file and process it ===
feature_rows = []
for filename in sorted(os.listdir(data_folder)):
    # Only CSV files are processed; everything else in the folder is skipped.
    if not filename.endswith(".csv"):
        continue
    filepath = os.path.join(data_folder, filename)
    try:
        df = pd.read_csv(filepath)
        # Infer the date from the file name.
        date_str = filename.replace("tariff_data_", "").replace(".csv", "")
        if "-" not in date_str:  # compact form such as 20250403
            date_str = "-".join((date_str[:4], date_str[4:6], date_str[6:]))
        features = extract_day_features(df, date_str)
        feature_rows.append(features)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"❌ Error reading {filename}: {e}")

# === Build the result DataFrame ===
features_df = pd.DataFrame(feature_rows)
print("✅ 萃取完成，以下是特徵預覽：")
print(features_df.head())

# === Optional: save as CSV ===
features_df.to_csv("daily_sentiment_features_test.csv", index=False)


✅ 萃取完成，以下是特徵預覽：
         date  num_tweets  avg_length  avg_num_words  avg_hashtags  \
0  2025-05-01         136  174.205882      27.169118      0.308824   
1  2025-05-02          57  173.929825      26.438596      0.368421   
2  2025-05-03         140  179.421429      27.014286      0.428571   
3  2025-05-04         148  143.945946      24.182432      0.148649   
4  2025-05-05         137  160.832117      24.364964      0.357664   

   avg_mentions  avg_uppercase_ratio  prop_negative  prop_neutral  \
0      0.250000             0.072426       0.448529      0.205882   
1      0.210526             0.066371       0.350877      0.210526   
2      0.128571             0.048738       0.471429      0.200000   
3      0.087838             0.068771       0.324324      0.317568   
4      0.175182             0.058099       0.357664      0.306569   

   prop_positive  vader_avg_compound  vader_std_compound  vader_max_compound  \
0       0.345588           -0.056668            0.463832            

In [4]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# === Configuration ===
# Folder that is scanned for the input CSV files; change it to your own folder name.
data_folder = "data/cross_val"
# Ordinal score per sentiment label: negative -> 0, neutral -> 1, positive -> 2.
sentiment_map = {
    label: score
    for score, label in enumerate(("negative", "neutral", "positive"))
}
# Single VADER analyzer instance, created once at module level.
analyzer = SentimentIntensityAnalyzer()

# === Define the feature-extraction function ===
def extract_day_features(df, date_str):
    """Aggregate one day's tweets into a single dictionary of daily features.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets for one day. Must contain a "Tweet Content" column (tweet text)
        and a "sentiment" column. The labels "negative", "neutral" and
        "positive" are the ones counted for the proportions; labels are also
        mapped to numbers through the module-level ``sentiment_map``.
        The input frame is not modified (a copy is used).
    date_str : str
        Date of the tweets, formatted as "%Y-%m-%d".

    Returns
    -------
    dict
        Keys, in order: "date", "num_tweets", "avg_length", "avg_num_words",
        "avg_hashtags", "avg_mentions", "avg_uppercase_ratio",
        "prop_negative", "prop_neutral", "prop_positive",
        "vader_avg_compound", "vader_std_compound", "vader_max_compound",
        "vader_min_compound", "avg_sentiment_score", "day_of_week"
        (Monday == 0) and "is_weekend" (1 for Saturday/Sunday, else 0).

    Raises
    ------
    KeyError
        If "Tweet Content" or "sentiment" is missing from ``df``.
    ValueError
        If ``date_str`` does not match "%Y-%m-%d".
    """
    # Parse the date once, up front, so a bad date fails before any scoring work.
    weekday = datetime.strptime(date_str, "%Y-%m-%d").weekday()

    df = df.copy()

    # A missing tweet must count as empty text. astype(str) alone turns NaN
    # into the literal string "nan" (3 characters, 1 word), which would skew
    # the length and word-count averages.
    raw_text = df["Tweet Content"]
    text = raw_text.astype(str).where(raw_text.notna(), "")

    df["sentiment_score"] = df["sentiment"].map(sentiment_map)
    df["length"] = text.apply(len)
    df["num_words"] = text.apply(lambda x: len(x.split()))
    df["num_hashtags"] = text.str.count("#")
    df["num_mentions"] = text.str.count("@")
    # Share of uppercase characters; empty text yields 0 instead of dividing by zero.
    df["uppercase_ratio"] = text.apply(
        lambda x: sum(1 for c in x if c.isupper()) / len(x) if len(x) > 0 else 0)
    df["vader_compound"] = text.apply(
        lambda x: analyzer.polarity_scores(x)["compound"])

    total = len(df)

    def _share(label):
        # Fraction of rows carrying this sentiment label; 0 for an empty day.
        return (df["sentiment"] == label).sum() / total if total else 0

    return {
        "date": date_str,
        "num_tweets": total,
        "avg_length": df["length"].mean(),
        "avg_num_words": df["num_words"].mean(),
        "avg_hashtags": df["num_hashtags"].mean(),
        "avg_mentions": df["num_mentions"].mean(),
        "avg_uppercase_ratio": df["uppercase_ratio"].mean(),
        "prop_negative": _share("negative"),
        "prop_neutral": _share("neutral"),
        "prop_positive": _share("positive"),
        "vader_avg_compound": df["vader_compound"].mean(),
        "vader_std_compound": df["vader_compound"].std(),
        "vader_max_compound": df["vader_compound"].max(),
        "vader_min_compound": df["vader_compound"].min(),
        "avg_sentiment_score": df["sentiment_score"].mean(),
        "day_of_week": weekday,
        "is_weekend": 1 if weekday >= 5 else 0
    }

# === Read every CSV file and process it ===
feature_rows = []
for filename in sorted(os.listdir(data_folder)):
    # Only CSV files are processed; everything else in the folder is skipped.
    if not filename.endswith(".csv"):
        continue
    filepath = os.path.join(data_folder, filename)
    try:
        df = pd.read_csv(filepath)
        # Infer the date from the file name.
        date_str = filename.replace("tariff_data_", "").replace(".csv", "")
        if "-" not in date_str:  # compact form such as 20250403
            date_str = "-".join((date_str[:4], date_str[4:6], date_str[6:]))
        features = extract_day_features(df, date_str)
        feature_rows.append(features)
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(f"❌ Error reading {filename}: {e}")

# === Build the result DataFrame ===
features_df = pd.DataFrame(feature_rows)
print("✅ 萃取完成，以下是特徵預覽：")
print(features_df.head())

# === Optional: save as CSV ===
features_df.to_csv("cross_val.csv", index=False)


✅ 萃取完成，以下是特徵預覽：
         date  num_tweets  avg_length  avg_num_words  avg_hashtags  \
0  2018-07-01         129  201.410853      24.403101      0.666667   
1  2018-07-02         139  187.489209      23.158273      0.611511   
2  2018-07-03         144  189.986111      22.875000      0.958333   
3  2018-07-04         140  180.064286      22.514286      0.842857   
4  2018-07-05         140  169.135714      20.928571      0.850000   

   avg_mentions  avg_uppercase_ratio  prop_negative  prop_neutral  \
0      0.240310             0.052229       0.426357      0.193798   
1      0.381295             0.059080       0.474820      0.251799   
2      0.409722             0.067989       0.416667      0.312500   
3      0.271429             0.069381       0.521429      0.192857   
4      0.207143             0.063240       0.528571      0.214286   

   prop_positive  vader_avg_compound  vader_std_compound  vader_max_compound  \
0       0.379845           -0.011481            0.497687            