# DataLab Cup 1: Text Feature Engineering

Team name: IDUDL

Team members: 110062619 楊淨富 112062611 吳明真 112138502 陳炫妙

## Load datasets

In [3]:
import pandas as pd

# Dataset format
# train.csv contains 27643 data points (news articles) with the attributes 'Id', 'Page content', and 'Popularity'
# test.csv contains 11847 data points with the attributes 'Id' and 'Page content'

# Read the two competition splits; paths are relative to the notebook's working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

## Text Preprocessing

### Feature Engineering

Beautiful Soup是一個 Python library，用於從HTML或XML文檔中抽取data。它透過提取tag、屬性和檔案內容，以便進一步分析或儲存資料。  
以下為抽取出的features。  

* author:文章作者(去掉By/by)
* time:可再細分成Year/Month/Day/Hour/Minute/Second
* weekday:星期幾
* num_image:文章中圖片的數量
* num_link:文章中超連結的數量
* len_title:標題的長度(有幾個英文單字)
* len_article:文章的長度(有幾個英文單字)
* delta_days:抓取發文日期距離年底的天數
* article:文章的實際內容

In [4]:
import re
from bs4 import BeautifulSoup
from datetime import datetime

# Timestamp layout expected inside the <time> tag, e.g. "2013-06-19 15:04:30 UTC".
_DATE_PATTERN = re.compile(r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) UTC')
# Emoticons: eyes (: ; = X), optional nose (-), mouth () ( D P).
_EMOTICON_PATTERN = re.compile(r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)')
# Date parts reported for articles without a usable publish time.
_DEFAULT_DATE_PARTS = (2013, 6, 19, 15, 4, 30)
# Weekday code meaning "no publish date" (0-6 stand for Monday-Sunday).
_NO_WEEKDAY = 7


def _extract_author(soup):
    """Return the lower-cased author name, or '' when no byline is found."""
    author_name = soup.find(attrs={'class': 'author_name'})
    if author_name is not None:
        # Strip only a leading "By"/"by"; a blanket replace() would also
        # damage names that merely contain those letters (e.g. "Abby").
        raw = re.sub(r'^\s*by\b', '', author_name.get_text(), flags=re.IGNORECASE)
        return raw.strip().lower()

    article_info = soup.find('div', {'class': 'article-info'})
    if article_info is None:
        return ''
    # Prefer the first <span>, then the first <a>, inside the info box.
    node = article_info.span or article_info.a
    return node.get_text().strip().lower() if node is not None else ''


def _parse_publish_time(time_text):
    """Turn the <time> text into (year, month, day, hour, minute, second, weekday, delta).

    All eight values are ints. When the text does not contain a valid
    timestamp, the default date parts are returned together with the
    "no publish date" weekday code and a delta of 0.
    """
    match_obj = _DATE_PATTERN.search(time_text)
    if match_obj:
        parts = tuple(int(group) for group in match_obj.groups())
        try:
            published = datetime(*parts)
        except ValueError:
            # Digits matched the layout but do not form a real date/time.
            published = None
        if published is not None:
            year_end = datetime(published.year, 12, 31, 23, 59, 59)
            # .days keeps whole days only, as the original computation did.
            return parts + (published.weekday(), (year_end - published).days)
    return _DEFAULT_DATE_PARTS + (_NO_WEEKDAY, 0)


def preprocessor(text):
    """Extract hand-crafted features from one article's raw HTML.

    Parameters
    ----------
    text : str
        HTML of a single article. Non-string input (e.g. a NaN cell) is
        treated as an empty document instead of raising.

    Returns
    -------
    tuple
        ``(author, year, month, day, weekday, hour, minute, second, topic,
        title, num_image, num_link, len_title, len_article, delta,
        article_str)`` where

        * author, topic, title, article_str are lower-cased strings
          ('' when the corresponding element is missing);
        * year .. second are ints (defaults 2013, 6, 19, 15, 4, 30 when the
          publish time is missing or malformed);
        * weekday is 0-6 for Monday-Sunday, or 7 when there is no date;
        * num_image / num_link count <img> / <a> tags;
        * len_title / len_article are whitespace-separated word counts;
        * delta is the number of whole days from the publish time to
          Dec 31 23:59:59 of the same year (0 when there is no date).
    """
    soup = BeautifulSoup(text if isinstance(text, str) else '', 'html.parser')
    # Fall back to the whole document when the markup has no <body>.
    root = soup.body or soup

    # Author
    author = _extract_author(soup)

    # Publish time -> calendar parts, weekday and days left in the year
    time_tag = soup.find('time')
    time_text = time_tag.get_text() if time_tag is not None else ''
    (year, month, day, hour, minute, second,
     weekday, delta) = _parse_publish_time(time_text)

    # Topics: drop the "Topics:" label and the comma separators
    topic_element = soup.find(attrs={'class': 'article-topics'})
    if topic_element is not None:
        topic = (topic_element.get_text()
                 .replace('Topics', '').replace(':', '').replace(',', '')
                 .strip().lower())
    else:
        topic = ''

    # Article body
    section = soup.find('section', {'class': 'article-content'})
    raw_content = section.get_text() if section is not None else ''
    content = raw_content.lower()
    # NOTE(review): emoticons are searched in the lower-cased text, so the
    # upper-case alternatives (X, D, P) in the pattern can never match.
    # Behaviour is kept as-is; confirm whether that is intended.
    emoticons = _EMOTICON_PATTERN.findall(content)
    # Collapse every run of non-word characters to one space, then append the
    # emoticons with their "nose" (-) removed.
    article_str = (re.sub(r'[\W]+', ' ', content) + ' '
                   + ' '.join(emoticons).replace('-', '')).strip()

    # Title (get_text() also works when <h1> contains nested tags)
    h1 = soup.find('h1')
    title_text = h1.get_text() if h1 is not None else ''
    title = title_text.strip().lower()

    # Simple counts
    num_image = len(root.find_all('img'))
    num_link = len(root.find_all('a'))
    len_title = len(title_text.split())
    len_article = len(raw_content.split())

    return author, year, month, day, weekday, hour, minute, second, topic, \
            title, num_image, num_link, len_title, len_article, delta, article_str

# Extract features for every training article and then every test article, so
# the rows of df_extract are the train rows followed by the test rows.
feature_list = list(map(preprocessor, train['Page content']))
feature_list.extend(map(preprocessor, test['Page content']))

# One column per element of the tuple returned by preprocessor(), same order.
df_extract = pd.DataFrame(
    data=feature_list,
    columns=[
        'Author', 'Year', 'Month', 'Day', 'Weekday', 'Hour', 'Minute', 'Second',
        'Topic', 'Title', 'Num image', 'Num link', 'Len title', 'Len article',
        'Delta', 'Content',
    ],
)

In [5]:
# Preview the first rows of the extracted feature table.
df_extract.head()

Unnamed: 0,Author,Year,Month,Day,Weekday,Hour,Minute,Second,Topic,Title,Num image,Num link,Len title,Len article,Delta,Content
0,clara moskowitz,2013,6,19,2.0,15,4,30,asteroid asteroids challenge earth space u.s. ...,nasa's grand challenge: stop asteroids from de...,1,21,8,577,195,there may be killer asteroids headed for earth...
1,christina warren,2013,3,28,3.0,17,40,55,apps and software google open source opn pledg...,google's new open source patent pledge: we won...,1,16,12,305,278,google took a stand of sorts against patent la...
2,sam laird,2014,5,7,2.0,19,15,20,entertainment nfl nfl draft sports television,ballin': 2014 nfl draft picks get to choose th...,1,9,12,1114,238,you ve spend countless hours training to be an...
3,sam laird,2013,10,11,4.0,2,26,50,sports video videos watercooler,cameraperson fails deliver slapstick laughs,0,11,5,278,81,tired of the same old sports fails and news fa...
4,connor finnegan,2014,4,17,3.0,3,31,43,entertainment instagram instagram video nfl sp...,nfl star helps young fan prove friendship with...,51,14,10,1370,258,at 6 foot 5 and 298 pounds all pro nfl star j ...


### 除掉太過細緻的分、秒

In [6]:
# Work on a copy so df_extract keeps every extracted column, then discard the
# minute and second columns.
df_copy = df_extract.copy()
df_drop = df_copy.drop(['Minute', 'Second'], axis=1)

In [7]:
# Preview the table after the Minute/Second columns were dropped.
df_drop.head()

Unnamed: 0,Author,Year,Month,Day,Weekday,Hour,Topic,Title,Num image,Num link,Len title,Len article,Delta,Content
0,clara moskowitz,2013,6,19,2.0,15,asteroid asteroids challenge earth space u.s. ...,nasa's grand challenge: stop asteroids from de...,1,21,8,577,195,there may be killer asteroids headed for earth...
1,christina warren,2013,3,28,3.0,17,apps and software google open source opn pledg...,google's new open source patent pledge: we won...,1,16,12,305,278,google took a stand of sorts against patent la...
2,sam laird,2014,5,7,2.0,19,entertainment nfl nfl draft sports television,ballin': 2014 nfl draft picks get to choose th...,1,9,12,1114,238,you ve spend countless hours training to be an...
3,sam laird,2013,10,11,4.0,2,sports video videos watercooler,cameraperson fails deliver slapstick laughs,0,11,5,278,81,tired of the same old sports fails and news fa...
4,connor finnegan,2014,4,17,3.0,3,entertainment instagram instagram video nfl sp...,nfl star helps young fan prove friendship with...,51,14,10,1370,258,at 6 foot 5 and 298 pounds all pro nfl star j ...


## Tokenize = Stopwords removal + Word stemming

Tokenize的方式分為: Stopwords removal + Word stemming

* Stopwords removal:

  先將text用space切開後，並且先拿掉常見的單字(即stopwords.words('english'))，無非是因為在各種文件中，stopwords是常見的單字，包含很少有用的資訊，所以無法用來區分不同類別的文件。例如:"is", "and", "has", and "the"。移除stopwords在使用raw或正規化的單字（例如BoW和feature hashing）時可能很有用。

* Word stemming:

  將單字轉換為其原型(root)的過程，它使我們能夠將相關的單字map到相同的詞幹。我們是使用PorterStemmer，但也有嘗試過WordNetLemmatizer來執行詞形還原操作。Word stemming用以簡化文本處理、文本分析或NLP。它有助於減少單字變型，使得更容易分析。

In [18]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# Download the NLTK resources needed by this notebook.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'omw-1.4'):
    nltk.download(nltk_resource)

# English stop words, extended with punctuation tokens and a few extra terms
# (some of them already in stemmed form, e.g. 'imag', 'courtesi').
stop = stopwords.words('english') + [
    ',', ':', '.', '(', ')', "'", 'nt', '>', '<', '?', '-', '_', '*', '%',
    ';', '~', '`', '``', '--', '[', ']', '[]', "'s", 'also', 'imag', 'courtesi',
]

def tokenizer(text):
    """Split text into tokens on runs of whitespace.

    If a NumPy array is passed, only its first element is tokenized.
    Leading and trailing whitespace is stripped before splitting, so an empty
    string yields [''].
    """
    source = text[0] if type(text) == np.ndarray else text
    return re.compile(r'\s+').split(source.strip())

def tokenizer_stem_nostop(text):
    """Tokenize text, drop stop words and Porter-stem the remaining tokens.

    If a NumPy array is passed, only its first element is processed.
    Returns a list of stemmed tokens; tokens that are in `stop` or that do not
    start with an ASCII letter are skipped.
    """
    stemmer = PorterStemmer()
    source = text[0] if type(text) == np.ndarray else text

    # Keep only the part before an apostrophe: "it's" -> "it".
    cleaned = re.sub(r"([\w]+)'[\w]+", (lambda found: found.group(1)), source)
    # Delete periods outright ("u.s." -> "us") ...
    cleaned = re.sub(r'\.', '', cleaned)
    # ... and turn every other run of non-word characters into one space.
    cleaned = re.sub(r'[^\w]+', ' ', cleaned)

    stems = []
    for word in re.split(r'\s+', cleaned.strip()):
        if word in stop:
            continue
        if not re.match('[a-zA-Z]+', word):
            continue
        stems.append(stemmer.stem(word))
    return stems

def tokenizer_lem(text):
    """Tokenize text and lemmatize every token as a verb with WordNet.

    If a NumPy array is passed, only its first element is processed.
    No stop-word filtering is applied.
    """
    lemmatizer = WordNetLemmatizer()
    source = text[0] if type(text) == np.ndarray else text

    # Same cleaning steps as tokenizer_stem_nostop: cut at the apostrophe,
    # delete periods, replace other non-word runs with a space.
    cleaned = re.sub(r"([\w]+)'[\w]+", (lambda found: found.group(1)), source)
    cleaned = re.sub(r'\.', '', cleaned)
    cleaned = re.sub(r'[^\w]+', ' ', cleaned)

    words = re.split(r'\s+', cleaned.strip())
    return [lemmatizer.lemmatize(word, pos="v") for word in words]

def tokenizer_stem_lemma_nostop(text):
    """Filter, stem and lemmatize an already-tokenized document.

    `text` is an iterable of tokens (not a raw string). Tokens found in `stop`
    are removed; each remaining token is Porter-stemmed and the stem is then
    lemmatized as a verb. Returns the resulting list of tokens.
    """
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(stemmer.stem(token), pos="v")
        for token in text
        if token not in stop
    ]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [9]:
# BoW (Bag-Of-Words)
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

def _bow(token_fn):
    """Build a fresh bag-of-words vectorizer that uses `token_fn` and keeps case."""
    return CountVectorizer(tokenizer=token_fn, lowercase=False)


# Column positions refer to the feature matrix: 0 = Author, 6 = Topic, 7 = Title.
# Every other column is passed through untouched.

# Variant that also encodes the author.
trans_att = ColumnTransformer(
    transformers=[
        ('Author', _bow(tokenizer), [0]),
        ('Topic', _bow(tokenizer_stem_nostop), [6]),
        ('Title', _bow(tokenizer_stem_nostop), [7]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

# Variant that discards the author column.
trans_tt = ColumnTransformer(
    transformers=[
        ('Author', 'drop', [0]),
        ('Topic', _bow(tokenizer_stem_nostop), [6]),
        ('Title', _bow(tokenizer_stem_nostop), [7]),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

## TF-IDF
此處對新聞標題和內文做 tf-idf，其中 tf-idf 裡的字詞抓取，先對出現在 stopwords 字典裡字詞做刪減，再以頻率最高的前 20 個詞計算其值，並當作變數來做分類

In [19]:
# Content: original text / Content_sp: text split into individual tokens /
# Content_tok: tokens after stop-word removal, stemming and lemmatization /
# Content_rep: Content_tok joined back into one string
df_drop["Content_sp"] = df_drop["Content"].apply(word_tokenize)
df_drop["Content_tok"] = df_drop["Content_sp"].apply(tokenizer_stem_lemma_nostop)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

def join_string(text):
    """Concatenate an iterable of strings, separated by single spaces."""
    separator = ' '
    return separator.join(text)

# Rebuild one space-separated string per article from its processed tokens.
df_drop["Content_rep"] = df_drop["Content_tok"].apply(join_string)

# TF-IDF restricted to a 20-term vocabulary, with `stop` as the stop-word list.
v = TfidfVectorizer(stop_words=stop, max_features=20)
x = v.fit_transform(df_drop["Content_rep"])

# Pull the transformed matrix into a frame, one column per retained term
# (the count features could perhaps be handled the same way).
dfc = pd.DataFrame(data=x.toarray(), columns=v.get_feature_names_out())
dfc.head(4)

Unnamed: 0,app,compani,first,game,get,go,like,make,mashabl,new,one,peopl,say,see,take,time,use,video,work,year
0,0.0,0.0,0.134245,0.0,0.122743,0.138518,0.116595,0.0,0.0,0.22925,0.109419,0.0,0.687474,0.330908,0.247022,0.117008,0.0,0.150083,0.436172,0.0
1,0.0,0.323454,0.284807,0.0,0.0,0.0,0.0,0.239269,0.0,0.486364,0.0,0.0,0.243084,0.175509,0.262034,0.248238,0.547984,0.0,0.0,0.0
2,0.0,0.0,0.209257,0.495127,0.637757,0.215917,0.060581,0.175798,0.080842,0.178673,0.113705,0.0,0.119068,0.085968,0.12835,0.30398,0.0,0.155962,0.0,0.130283
3,0.0,0.0,0.0,0.0,0.171793,0.0,0.0,0.0,0.0,0.0,0.0,0.202352,0.320733,0.115786,0.0,0.327532,0.0,0.840231,0.0,0.0


只留下經實驗後對結果最重要的前四個詞


*   use, see, one, like

把多餘的文章內容去除



In [21]:
# Append the four selected TF-IDF columns, then discard the raw and
# intermediate text columns.
# Note: this cell rebinds df_drop, so it is meant to be run exactly once.
df_drop = (
    pd.concat([df_drop, dfc[['use', 'see', 'one', 'like']]], axis=1)
    .drop(columns=['Content_sp', 'Content_tok', 'Content_rep', 'Content'])
)

In [22]:
# List the final feature columns; their positions are what the column
# indices used by the transformers and the scaler refer to.
df_drop.columns

Index(['Author', 'Year', 'Month', 'Day', 'Weekday', 'Hour', 'Topic', 'Title',
       'Num image', 'Num link', 'Len title', 'Len article', 'Delta', 'use',
       'see', 'one', 'like'],
      dtype='object')

## Classifier building

先將資料集切割成train和valid set，實驗後使用LGBMClassifier並tune parameters來找出準確率最高的model。

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# df_drop holds the train rows first and the test rows after them.
n_train = train.shape[0]
X_train_all = df_drop.values[:n_train]  # rows [0:27643]
X_test = df_drop.values[n_train:]       # rows [27643:]
# Binary target: 1 where Popularity == 1, otherwise 0.
y_train_all = (train['Popularity'].values == 1).astype(int)

# Numeric columns to standardize: 1-5 = Year, Month, Day, Weekday, Hour;
# 8-12 = Num image, Num link, Len title, Len article, Delta.
columns_to_scale = [1, 2, 3, 4, 5, 8, 9, 10, 11, 12]
# NOTE(review): the scaler is fitted on all training rows before the
# train/validation split below, so validation rows influence its statistics.
scaled_train = sc.fit_transform(X_train_all[:, columns_to_scale])
X_train_all[:, columns_to_scale] = scaled_train
scaled_test = sc.transform(X_test[:, columns_to_scale])
X_test[:, columns_to_scale] = scaled_test

# Hold out 20% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all,
    y_train_all,
    random_state=0,
    test_size=0.2,
)

In [24]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

# Pipeline: bag-of-words column transformer (author dropped) -> LightGBM.
lgbm = Pipeline(steps=[
    ('ct', trans_tt),
    ('clf', LGBMClassifier(
        n_estimators=500,
        learning_rate=0.005,
        random_state=0,
        force_row_wise=True,
        verbose=-1,
    )),
])

# Cross-validation on the complete training data
print('[auc (10-fold cv)]')
scores = cross_val_score(lgbm, X_train_all, y_train_all, scoring='roc_auc', cv=10)
print(f'LGBMClassifier: {scores.mean():.4f} (+/-{scores.std():.4f})')

# Alternative (disabled): also collect the training scores and the fitted estimators
# scores = cross_validate(estimator=clf, X=X_train_all, y=y_train_all, cv =10,scoring='roc_auc', \
#                             return_train_score=True, return_estimator=True)
# print(f"train score: {np.mean(scores['train_score']):.4f} (+/-{np.std(scores['train_score']):.4f}")
# print(f"valid score: {np.mean(scores['test_score']):.4f} (+/-{np.std(scores['test_score']):.4f}")

# Fit on the training split and report AUC on the training and validation splits
lgbm.fit(X_train, y_train)
train_proba = lgbm.predict_proba(X_train)[:, 1]
valid_proba = lgbm.predict_proba(X_valid)[:, 1]
print(f'train scroe: {roc_auc_score(y_train, train_proba):.4f}')
print(f'valid score: {roc_auc_score(y_valid, valid_proba):.4f}')

[auc (10-fold cv)]
LGBMClassifier: 0.6063 (+/-0.0158)
train scroe: 0.6851
valid score: 0.5930


In [25]:
best_clf = lgbm

# Predicted probability of the positive class (label 1) for every test row.
y_score = best_clf.predict_proba(X_test)[:, 1]

# Two-column submission file: the test Id and its predicted popularity score.
test_pred = test[['Id']].assign(Popularity=y_score)
test_pred.to_csv('submission.csv', index=False)

## Conclusion

此次competition，我學習了如何使用Beautiful soup來抓取html檔案中特定tag的資料，可以幫助我抽取出想要的features。  
此外，抽取後還必須透過regular expression等方式來進行data cleaning，所以必須對training data有一定的了解。  
找尋合適的classifier也很重要，我在train RandomForestClassifier的時候CPU都會燒到很高，推測是運算量較大，但結果並不一定更佳。