In [100]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [88]:
# Read the labelled training split and the test split from the local dataset folder.
df_train, df_test = map(
    pd.read_csv,
    ('./dataset/train.csv', './dataset/test.csv'),
)

## Preprocess

In [89]:
# Timestamp substituted when an article has no parseable <time datetime="..."> value.
_FALLBACK_DATETIME = 'Wed, 19 Oct 2024 15:00:00'

# Lower-cased weekday / month abbreviations mapped to 1-based ordinals.
_DAY_MAP = {'mon': 1, 'tue': 2, 'wed': 3,
            'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7}
_MONTH_MAP = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
              'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

# Matches strings shaped like 'Wed, 19 Oct 2024 15:00:00'.
# Groups: weekday, day of month, month, year, hour, minute, second.
_DATETIME_RE = re.compile(
    r'([\w]+),\s+([\d]+)\s+([\w]+)\s+([\d]+)\s+([\d]+):([\d]+):([\d]+)')

# Emoticons such as ':)', ';-(', '=D' or 'XP'.
_EMOTICON_RE = re.compile(r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)')


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the tag is missing."""
    if tag is None:
        return ''
    # `.string` is None when the tag has more than one child; fall back to the
    # concatenated text instead of failing on `None.strip()`.
    text = tag.string
    return text if text is not None else tag.get_text()


def preprocessor(text):
    """Extract model features from the raw HTML of one article.

    Parameters
    ----------
    text : str
        HTML source of the article page.

    Returns
    -------
    tuple
        ``(title, author, topic, day, date, month, hour, content_len)`` where

        * ``title`` is the stripped, lower-cased ``<h1>`` text;
        * ``author`` is lower-cased, with ' and ' replaced by ' & ', whitespace
          runs replaced by '_' and a leading 'by_' removed;
        * ``topic`` is the space-joined topic links, each lower-cased with
          whitespace runs replaced by '_';
        * ``day`` (1 = Monday .. 7 = Sunday), ``date`` (day of month),
          ``month`` (1..12) and ``hour`` come from the ``<time datetime>``
          attribute, or from a fixed fallback timestamp when that attribute is
          missing or does not match the expected format;
        * ``content_len`` is the character count of the normalised article
          body (non-word runs collapsed to spaces, emoticons appended).

    Raises
    ------
    AttributeError
        If a required structural element (``body``, ``head``, the
        ``article-info`` div, the ``article-topics`` footer or the
        ``article-content`` section) is absent.
    KeyError
        If the timestamp matches the pattern but its weekday or month
        abbreviation is not in the lookup tables.
    """
    soup = BeautifulSoup(text, 'html.parser')

    # Title
    title = _tag_text(soup.body.h1).strip().lower()

    # Author: prefer the dedicated span, then any span, then the first link.
    article_info = soup.head.find('div', {'class': 'article-info'})
    author_name = article_info.find('span', {'class': 'author_name'})
    if author_name is not None:
        author = author_name.get_text()
    elif article_info.span is not None:
        author = _tag_text(article_info.span)
    else:
        author = _tag_text(article_info.a)

    author = re.sub(r'\s+', '_', author.strip().lower().replace(' and ', ' & '))
    if author.startswith('by_'):
        author = author[3:]

    # Topics: one token per link, so multi-word topics stay together.
    a_list = soup.body.find('footer', {'class': 'article-topics'}).find_all('a')
    topic_list = [_tag_text(a).strip().lower() for a in a_list]
    topic = ' '.join(re.sub(r'\s+', '_', t) for t in topic_list)

    # Publication time. Only a missing or unparseable timestamp triggers the
    # fallback; unrelated errors are no longer swallowed by a bare `except`.
    time_tag = article_info.time
    date_time = time_tag.get('datetime') if time_tag is not None else None
    match_obj = _DATETIME_RE.search(date_time) if isinstance(date_time, str) else None
    if match_obj is None:
        match_obj = _DATETIME_RE.search(_FALLBACK_DATETIME)

    day = _DAY_MAP[match_obj.group(1).lower()]
    date = int(match_obj.group(2))
    month = _MONTH_MAP[match_obj.group(3).lower()]
    hour = int(match_obj.group(5))

    # Content length: emoticons are pulled out before punctuation is collapsed,
    # then appended (without the '-' nose) so they still count.
    content = soup.body.find('section', {'class': 'article-content'}).get_text()
    emoticons = _EMOTICON_RE.findall(content)
    content = _EMOTICON_RE.sub('', content)
    content = re.sub(r'[\W]+', ' ', content.lower()) + ' ' + ' '.join(emoticons).replace('-', '')
    content_len = len(content)

    return title, author, topic, day, date, month, hour, content_len

In [90]:
# Column layout shared by both parsed frames; the order follows the tuple
# returned by preprocessor().
parsed_columns = ['Title', 'Author', 'Topic', 'Day', 'Date', 'Month',
                  'Hour', 'content length']
columns_to_convert = ['Day', 'Date', 'Month', 'Hour', 'content length']

# Parse every article page of each split into a feature tuple.
feature_list_train = [preprocessor(page) for page in df_train['Page content']]
feature_list_test = [preprocessor(page) for page in df_test['Page content']]

df_parse_train = pd.DataFrame(feature_list_train, columns=parsed_columns)
df_parse_test = pd.DataFrame(feature_list_test, columns=parsed_columns)

# Store the numeric features as float32.
df_parse_train[columns_to_convert] = df_parse_train[columns_to_convert].astype(np.float32)
df_parse_test[columns_to_convert] = df_parse_test[columns_to_convert].astype(np.float32)

## BOW

In [91]:
nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer_stem_nostop(text):
    """Tokenize ``text`` on whitespace, filter the tokens, and Porter-stem them.

    A token is kept only if it is not in the module-level ``stop`` list
    (case-sensitive comparison) and it starts with an ASCII letter; kept
    tokens are returned stemmed, in their original order.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Stemmed tokens; empty when nothing survives the filters.
    """
    porter = PorterStemmer()
    # Raw-string patterns avoid the invalid-escape warning that '\s+' triggers.
    return [porter.stem(w) for w in re.split(r'\s+', text.strip())
            if w not in stop and re.match(r'[a-zA-Z]+', w)]

# Unigram bag-of-words vectorizer driven by the custom tokenizer; inputs are
# not lower-cased again here.
count = CountVectorizer(
    tokenizer=tokenizer_stem_nostop,
    ngram_range=(1, 1),
    lowercase=False,
)

# Register the vectorizer for each text column; all remaining columns are
# passed through unchanged.
text_columns = ('Title', 'Author', 'Topic')
trans = ColumnTransformer(
    [(column, count, column) for column in text_columns],
    remainder='passthrough',
    n_jobs=-1,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [92]:
# Feature frames for modelling, built from the parsed frames with the same columns.
X_train_raw = pd.DataFrame(df_parse_train, columns=df_parse_train.columns)
X_test = pd.DataFrame(df_parse_test, columns=df_parse_test.columns)

# Binary target: 1 where Popularity equals 1, otherwise 0.
is_popular = df_train['Popularity'].values == 1
y_train_raw = is_popular.astype(int)

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_raw,
    y_train_raw,
    random_state=0,
    test_size=0.2,
)

In [114]:
# Base learners; both use a fixed seed.
lgbm_classifier = LGBMClassifier(
    n_estimators=400,
    learning_rate=0.007,
    random_state=0,
)
forest_classifier = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
    n_jobs=-1,
)

# Each pipeline runs the column transformer first, then its classifier.
lgbm = Pipeline(steps=[
    ('ColumnTrans', trans),
    ('Classifier', lgbm_classifier),
])
forest = Pipeline(steps=[
    ('ColumnTrans', trans),
    ('Classifier', forest_classifier),
])

In [115]:
# Soft-voting ensemble: LightGBM has weight 1, the random forest 0.25.
voting = VotingClassifier(
    estimators=[('lgbm', lgbm), ('forest', forest)],
    weights=[1, 0.25],
    voting='soft',
)
voting.fit(X_train, y_train)

# Column 1 of predict_proba is the score passed to roc_auc_score.
train_scores = voting.predict_proba(X_train)[:, 1]
valid_scores = voting.predict_proba(X_valid)[:, 1]

# Compare the two numbers to see how far the ensemble overfits.
print('train auc: %.5f' % roc_auc_score(y_train, train_scores))
print('valid auc: %.5f' % roc_auc_score(y_valid, valid_scores))

[LightGBM] [Info] Number of positive: 10885, number of negative: 11229
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.047744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4618
[LightGBM] [Info] Number of data points in the train set: 22114, number of used features: 2021
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492222 -> initscore=-0.031114
[LightGBM] [Info] Start training from score -0.031114
train auc: 0.95906
valid auc: 0.58676


In [116]:
# Score the test set with the fitted ensemble and keep column 1 of predict_proba.
test_proba = voting.predict_proba(X_test)
y_score = test_proba[:, 1]

# Write one row per test article: its Id and the predicted score.
submission = {'Id': df_test['Id'], 'Popularity': y_score}
df_pred = pd.DataFrame(submission)
df_pred.to_csv('test_pred.csv', index=False)

## Report

**Student ID**: 113062657  
**Name**: 黃盛揚

---

### 1. Data Preprocessing


在 Preprocess 階段，我使用 **BeautifulSoup** 來拆解每篇文章的 HTML 結構，提取了 **Title**、**Author**、**Topic**、**Date**（拆成星期、日期、月份與小時四個數值欄位）和 **Content Length** 等欄位。原本曾考慮使用 **Content** 作為特徵，但因為記憶體使用過高且預測效果不佳，最終僅提取 **Content Length** 作為特徵，以減少記憶體消耗並提升模型效率。對於文字型欄位（**Title**、**Author** 和 **Topic**），我使用了 **Bag-of-Words (BOW)** 模型來將其轉換為數值特徵：以 **CountVectorizer** 進行特徵向量化，並搭配自訂的 tokenizer，先過濾掉 **stopwords**，再以 Porter stemming 做詞幹還原來減少詞彙維度，以強調主要資訊並降低模型的負擔。在資料分割方面，我將訓練資料集分為訓練集與驗證集，比例為 80:20。

### 2. Model Building

為了提升預測準確性與穩定性，我選擇了 **Voting Classifier** 作為主模型，並結合了 **LightGBM** 和 **Random Forest** 的優勢：
- **LightGBM** 有高效能和處理大量數據的優點。
- **Random Forest** 與前者的學習方式不同，可為整體模型增加多樣性與穩定性。

最後對 Voting Classifier 使用「soft voting」策略，將各模型的預測機率進行加權平均，以達到更精確的分類結果。
模型的訓練過程中，我首先使用 **Pipeline** 將特徵轉換與分類模型整合，並且對 **LightGBM** 使用較低的 learning rate 和適量的 n_estimators 來控制每一步的學習幅度，確保模型訓練的穩定性。對於 **Random Forest**，則設置較高的樹數以增強模型穩定性，並減少 overfitting 的風險。

### 3. Takeaways

- 使用簡化特徵（例如 **Content Length** 取代全文內容）大幅減少了記憶體需求，且能維持合理的分類表現。
- Voting Classifier 組合了 **LightGBM** 和 **Random Forest** 的優勢，提供更快的預測速度和更高的穩定性。
- 記憶體的限制使得無法處理完整的文章文本，突顯了特徵選擇與簡化的重要性。
- 不同模型的組合與權重選擇需多次測試才能達到最優結果。

### Lessons Learned
preprocess很影響這次競賽的結果，在挑選feature的時候也遇到了不少的問題，原本以為很重要的content不僅很難處理，score還特別差，但我認為更重要的還是運氣。
