# DataLab Cup 1: Text Feature Engineering

Team name: IDUDL
Team members: 110062619 楊淨富 112062611 吳明真 112138502 陳炫妙

## Load datasets

In [1]:
import pandas as pd

# Dataset format
# train.csv contains 27643 data points (news articles) with the attributes 'Id', 'Page content', and 'Popularity'
# test.csv contains 11847 data points with the attributes 'Id' and 'Page content'

train  = pd.read_csv('./dataset/train.csv')
test  = pd.read_csv('./dataset/test.csv')

## Text Preprocessing

### Functions for Feature Extraction

* author:文章作者(去掉By/by)
* time:可再細分成Year/Month/Day/Hour/Minute/Second
* weekday:星期幾
* num_image:文章中圖片的數量
* num_link:文章中超連結的數量
* len_title:標題的長度(有幾個英文單字)
* len_article:文章的長度(有幾個英文單字)
* delta_days:抓取發文日期距離年底的天數
* article:文章的實際內容

In [2]:
# import re
# from bs4 import BeautifulSoup
# from datetime import datetime

# def author_extract(text):
#     soup = BeautifulSoup(text, 'html.parser')
#     return(soup.find(attrs = {'class': 'author_name'}).get_text().replace('By','').replace('by','').strip())

# def time_extract(text):
#     soup = BeautifulSoup(text,"html.parser")
#     return(soup.find('time').get_text())

# def datetime_extract(text):
#     # Find date and time 
#     soup = BeautifulSoup(text, 'html.parser')
#     date_time = soup.find('time').get_text()
#     if date_time != None:
#         weekday = (int(pd.Timestamp(date_time.split(' ')[0]).dayofweek)) # 0: Monday, 1:Tuesday, ...
#     else:
#         weekday = 7 # No publish date
#         date_time = '2013-06-19 15:04:30 UTC'

#     # Define a regular expression pattern to match the updated date and time format
#     date_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC)'

#     # Use re.search to find and capture the date and time
#     match_obj = re.search(date_pattern, date_time)

#     if match_obj:
#         date_time = match_obj.group(1)

#     # Split the date and time components
#     date_components = date_time.split(' ')
#     date, time = date_components[0], date_components[1]

#     # Extract individual date and time elements
#     date_parts = date.split('-')
#     year, month, day = date_parts[0], date_parts[1], date_parts[2]

#     time_parts = time.split(':')
#     hour, minute, second = time_parts[0], time_parts[1], time_parts[2]
#     return year, month, day, weekday, hour, minute, second

# def find_weekend(text):
#     date_str = time_extract(text).split(' ')[0]
#     if date_str == '':
#         return(7)
#     date = pd.Timestamp(date_str)
#     weekday = date.dayofweek
#     return(int(weekday))

# def num_image(text):
#     soup = BeautifulSoup(text, 'html.parser')
#     return len(soup.findAll('img'))

# def num_link(text):
#     soup = BeautifulSoup(text, 'html.parser')
#     return len(soup.body.find_all('a'))

# def len_title(text):
#     soup = BeautifulSoup(text,"html.parser")
#     title_list = soup.find('h1').get_text()
#     return len(title_list)

# def len_article(text):
#     soup = BeautifulSoup(text,"html.parser")
#     content = soup.body.find('section', {'class': 'article-content'}).get_text()
#     return len(content)

# def day_delta(text):
#     time_str = time_extract(text)
#     if time_str == '':
#         return 0
#     time_arr = time_str.split(' ')

#     year = time_arr[0].split('-')[0]
#     target_str = year + '-12-31 23:59:59'
#     target = datetime.strptime(target_str, '%Y-%m-%d %H:%M:%S')
#     day = datetime.strptime(time_arr[0] + ' ' + time_arr[1], '%Y-%m-%d %H:%M:%S')
#     delta = (target - day).days
#     return delta

# def article_extract(text):
#     # remove HTML tags
#     soup = BeautifulSoup(text, 'html.parser')

#     paragraphs = soup.find_all('p')
#     article = []
#     for p in paragraphs:
        
#         text = p.get_text()

#         # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
#         r = r'(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
#         emoticons = re.findall(r, text)
#         text = re.sub(r, '', text)

#         # replace('-','') removes nose of emoticons
#         text = re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', '')

#         # Output the text
#         article.append(text.strip())
#     article_str = ''.join(article)

#     return article_str

# # Example usage
# print('[Author]:', author_extract(train['Page content'][1]))
# print('[Datetime]:', time_extract(train['Page content'][1]))
# print('[Weekday]:', find_weekend(train['Page content'][1]))
# print('[# images]:', num_image(train['Page content'][1]))
# print('[Length of titles]:', len_title(train['Page content'][1]))
# print('[Length of articles]:', len_article(train['Page content'][1]))
# print('[Article]:', article_extract(train['Page content'][1]))
# print('[Days to the last day]:', day_delta(train['Page content'][1]))
# print('[DAY]', datetime_extract(train['Page content'][1]))

In [3]:
import re
from bs4 import BeautifulSoup
from datetime import datetime

def preprocessor(text):
    soup = BeautifulSoup(text, 'html.parser')
    
    # Find the author
    article_info = soup.find('div', {'class': 'article-info'})
    author_name = soup.find(attrs={'class': 'author_name'})
    if author_name:
        author = author_name.get_text().replace('By', '').replace('by', '').strip()
    elif article_info.span:
        author = article_info.span.string
    else:
        author = article_info.a.string
    
    author = author.lower()
    
    # Find weekday
    date_time = soup.find('time').get_text()
    if date_time != None:
        date = pd.Timestamp(date_time.split(' ')[0])
        if pd.notna(date_time):
            weekday = date.dayofweek # 0: Monday, 1:Tuesday, ...
        else:
            weekday = 7
    else:
        weekday = 7 # No publish date
        date_time = '2013-06-19 15:04:30 UTC'

    # Define a regular expression pattern to match the updated date and time format
    date_pattern = r'((\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2}) UTC)'

    # Use re.search to find and capture the date and time
    match_obj = re.search(date_pattern, date_time)
    if match_obj:
        year, month, day, hour, minute, second = match_obj.groups()[1:7]
    else:
        year, month, day, hour, minute, second = 2013,6,19,15,4,30
        
#     # Split the date and time components
#     date_components = date_time.split(' ')
#     date, time = date_components[0], date_components[1]

#     # Extract individual date and time elements
#     date_parts = date.split('-')
#     year, month, day = date_parts[0], date_parts[1], date_parts[2]

#     time_parts = time.split(':')
#     hour, minute, second = time_parts[0], time_parts[1], time_parts[2]
    
    # Find the topic
#     topic_element = soup.find(attrs={'class': 'article-topics'})
#     topic = topic_element.get_text().replace('Topics', '').replace(':', '').replace(',', '').strip().lower() if topic_element else ''
    a_list = soup.body.find('footer', {'class': 'article-topics'}).find_all('a')
    topic_list = [a.string.strip().lower() for a in a_list]
    topic = ' '.join([re.sub(r'\s+', '_', t) for t in topic_list])
    
    
    # Find the title
    title = soup.body.h1.string.strip().lower()
    
    # Find the # of images
    num_image = len(soup.body.find_all('img'))
    
    # Find the # of links
    num_link = len(soup.body.find_all('a'))
    
    # Find the len of title
    len_title = len(soup.find('h1').get_text().split())
    
    # Find the Len of article
    len_article = len(soup.body.find('section', {'class': 'article-content'}).get_text().split())
    
    # Fine the delta days to the last day of the year
    time_str = soup.find('time').get_text()
    if time_str == '':
        delta = 0
    else:
        time_arr = time_str.split(' ')

        year = time_arr[0].split('-')[0]
        target_str = year + '-12-31 23:59:59'
        target = datetime.strptime(target_str, '%Y-%m-%d %H:%M:%S')
        day_format = datetime.strptime(time_arr[0] + ' ' + time_arr[1], '%Y-%m-%d %H:%M:%S')
        delta = (target - day_format).days
    
    return author, year, month, day, weekday, hour, minute, second, topic, \
            title, num_image, num_link, len_title, len_article, delta

# feature_list = []
# feature_list.append(preprocessor(train['Page content'][0]))

feature_list = []
for text in train['Page content']:
    feature_list.append(preprocessor(text))
                      
for text in test['Page content']:
    feature_list.append(preprocessor(text))
    
df_extract = pd.DataFrame(
    feature_list,
    columns=['Author', 'Year', 'Month', 'Day', 'Weekday','Hour', 'Minute', 'Second', 'Topic',\
             'Title', 'Num image', 'Num link', 'Len title', 'Len article', 'Delta']
)

In [4]:
df_extract.head()

Unnamed: 0,Author,Year,Month,Day,Weekday,Hour,Minute,Second,Topic,Title,Num image,Num link,Len title,Len article,Delta
0,clara moskowitz,2013,6,19,2.0,15,4,30,asteroid asteroids challenge earth space u.s. ...,nasa's grand challenge: stop asteroids from de...,1,21,8,577,195
1,christina warren,2013,3,28,3.0,17,40,55,apps_and_software google open_source opn_pledg...,google's new open source patent pledge: we won...,1,16,12,305,278
2,sam laird,2014,5,7,2.0,19,15,20,entertainment nfl nfl_draft sports television,ballin': 2014 nfl draft picks get to choose th...,1,9,12,1114,238
3,sam laird,2013,10,11,4.0,2,26,50,sports video videos watercooler,cameraperson fails deliver slapstick laughs,0,11,5,278,81
4,connor finnegan,2014,4,17,3.0,3,31,43,entertainment instagram instagram_video nfl sp...,nfl star helps young fan prove friendship with...,51,14,10,1370,258


In [5]:
df_copy = df_extract.copy()
df_drop = df_copy.drop(columns=['Minute', 'Second'])

In [6]:
df_drop.head()

Unnamed: 0,Author,Year,Month,Day,Weekday,Hour,Topic,Title,Num image,Num link,Len title,Len article,Delta
0,clara moskowitz,2013,6,19,2.0,15,asteroid asteroids challenge earth space u.s. ...,nasa's grand challenge: stop asteroids from de...,1,21,8,577,195
1,christina warren,2013,3,28,3.0,17,apps_and_software google open_source opn_pledg...,google's new open source patent pledge: we won...,1,16,12,305,278
2,sam laird,2014,5,7,2.0,19,entertainment nfl nfl_draft sports television,ballin': 2014 nfl draft picks get to choose th...,1,9,12,1114,238
3,sam laird,2013,10,11,4.0,2,sports video videos watercooler,cameraperson fails deliver slapstick laughs,0,11,5,278,81
4,connor finnegan,2014,4,17,3.0,3,entertainment instagram instagram_video nfl sp...,nfl star helps young fan prove friendship with...,51,14,10,1370,258


In [7]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
stop = stopwords.words('english')

def tokenizer(text):
    if type(text) == np.ndarray:
        text = text[0]
    return re.split(r'\s+', text.strip())

def tokenizer_stem_nostop(text):
    porter= PorterStemmer()
    
    if type(text) == np.ndarray:
        text = text[0]
    text = re.sub(r"([\w]+)'[\w]+", (lambda match_obj: match_obj.group(1)), text) # it's -> its
    text = re.sub(r'\.', '', text)
    text = re.sub(r'[^\w]+', ' ', text)
    
    return [porter.stem(w) for w in re.split(r'\s+', text.strip()) \
           if w not in stop and re.match('[a-zA-Z]+', w)]

def tokenizer_lem(text):
    if type(text) == np.ndarray:
        text = text[0]
    text = re.sub(r"([\w]+)'[\w]+", (lambda match_obj: match_obj.group(1)), text)
    text = re.sub(r'\.', '', text)
    text = re.sub(r'[^\w]+', ' ', text)
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w, pos="v") for w in re.split(r'\s+', text.strip())]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

trans_att = ColumnTransformer(
    [('Author', CountVectorizer(tokenizer=tokenizer, lowercase=False), [0]),
     ('Topic', CountVectorizer(tokenizer=tokenizer_stem_nostop, lowercase=False), [6]),
     ('Title', CountVectorizer(tokenizer=tokenizer_stem_nostop, lowercase=False), [7])],
    n_jobs=-1,
    remainder='passthrough'
)

trans_tt = ColumnTransformer(
    [('Author', 'drop', [0]),
     ('Topic', CountVectorizer(tokenizer=tokenizer_stem_nostop, lowercase=False), [6]),
     ('Title', CountVectorizer(tokenizer=tokenizer_stem_nostop, lowercase=False), [7])],
    n_jobs=-1,
    remainder='passthrough'
)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X_train_all = df_drop.values[:train.shape[0]] # [0:27643]
y_train_all = (train['Popularity'].values == 1).astype(int)
X_test = df_drop.values[train.shape[0]:] # [27643:]

columns_to_scale = [1,2,3,4,5,8,9,10,11,12]
X_train_all[:, columns_to_scale] = sc.fit_transform(X_train_all[:, columns_to_scale])
X_test[:, columns_to_scale] = sc.transform(X_test[:, columns_to_scale])

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_all, y_train_all, test_size=0.2, random_state=0)

In [11]:
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score

lgbm = Pipeline([('ct', trans_tt),
                 ('clf', LGBMClassifier(force_row_wise=True, random_state=0, learning_rate=0.005, n_estimators=500, verbose=-1))])

# CV
# 對完整的訓練數據進行訓練
print('[auc (10-fold cv)]')
scores = cross_val_score(estimator=lgbm, X=X_train_all, y=y_train_all, cv=10, scoring='roc_auc')
print(f'LGBMClassifier: {scores.mean():.4f} (+/-{scores.std():.4f})')

# 同時獲取訓練集的分數和估計器實例
# scores = cross_validate(estimator=clf, X=X_train_all, y=y_train_all, cv =10,scoring='roc_auc', \
#                             return_train_score=True, return_estimator=True)
# print(f"train score: {np.mean(scores['train_score']):.4f} (+/-{np.std(scores['train_score']):.4f}")
# print(f"valid score: {np.mean(scores['test_score']):.4f} (+/-{np.std(scores['test_score']):.4f}")

# 切成訓練集和驗證集分數
lgbm.fit(X_train, y_train)
print(f'train scroe: {roc_auc_score(y_train, lgbm.predict_proba(X_train)[:, 1]):.4f}')
print(f'valid score: {roc_auc_score(y_valid, lgbm.predict_proba(X_valid)[:, 1]):.4f}')

[auc (10-fold cv)]
LGBMClassifier: 0.6056 (+/-0.0157)
train scroe: 0.6817
valid score: 0.5926


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

rf = Pipeline([('ct', trans_tt),
                ('clf', RandomForestClassifier(n_jobs=-1, random_state=0, n_estimators=300))])

print('[auc (10-fold cv)]')
scores = cross_val_score(estimator=rf, X=X_train_all, y=y_train_all, cv=10, scoring='roc_auc')
print(f'RandomForestClassifier: {scores.mean():.4f} (+/-{scores.std():.4f})')

rf.fit(X_train, y_train)
print(f'train scroe: {roc_auc_score(y_train, rf.predict_proba(X_train)[:, 1]):.4f}')
print(f'valid score: {roc_auc_score(y_valid, rf.predict_proba(X_valid)[:, 1]):.4f}')

[auc (10-fold cv)]
RandomForestClassifier: 0.5936 (+/-0.0155)
train scroe: 1.0000
valid score: 0.5884


In [13]:
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

xgb = Pipeline([('ct', trans_tt),
                ('clf', XGBClassifier(verbosity=0, n_estimators=500))])

print('[auc (10-fold cv)]')
scores = cross_val_score(estimator=xgb, X=X_train_all, y=y_train_all, cv=10, scoring='roc_auc')
print(f'XGBoostClassifier: {scores.mean():.4f} (+/-{scores.std():.4f})')

xgb.fit(X_train, y_train)
print(f'train scroe: {roc_auc_score(y_train, xgb.predict_proba(X_train)[:, 1]):.4f}')
print(f'valid score: {roc_auc_score(y_valid, xgb.predict_proba(X_valid)[:, 1]):.4f}')

[auc (10-fold cv)]
XGBoostClassifier: 0.5747 (+/-0.0149)
train scroe: 0.9219
valid score: 0.5635


In [15]:
best_clf = lgbm

y_score = best_clf.predict_proba(X_test)[:, 1]

test_pred = pd.DataFrame({'Id': test['Id'], 'Popularity': y_score})
test_pred.to_csv('submission.csv', index=False)