In [1]:
import pandas as pd

In [2]:
# Read the provided training-set and test-set CSV files into DataFrames.
TRAIN_CSV_PATH = 'offsite-test-material/offsite-tagging-training-set (1).csv'
TEST_CSV_PATH = 'offsite-test-material/offsite-tagging-test-set (1).csv'

train_set = pd.read_csv(TRAIN_CSV_PATH)
test_set = pd.read_csv(TEST_CSV_PATH)

In [3]:
# Preview only the first rows; displaying the bare frame would render the
# whole (3894-row) DataFrame representation.
train_set.head(10)

Unnamed: 0,id,tags,text
0,3443,足球,利物浦重賽擊敗乙組仔　英足盃過關 英格蘭足總盃第三圈今晨重賽，貴為英超勁旅的利物浦上場被乙組...
1,76056,足球,【中超】恒大「暴力戰」絕殺國安　楊智反重力插水惹爭議（有片） 中超首輪賽事重頭戲，廣州恒大主...
2,93405,足球,【歐霸決賽】阿積士控球率起腳佔優　隊長卡拉臣輸波不服氣 阿積士以歐洲主要決賽最年輕、平均22...
3,26767,足球,【歐國盃】韋莫斯澄清更衣室未內訌　盼以團結力量挫愛爾蘭 今晚3場直播\r\r\nE組｜比利時...
4,20843,梁振英,王維基參選　點解？ 王維基在宣布有意出選的記者會上，打出ABC，Anyone But CY的...
5,74481,足球,【港超】大埔5：1炒飛馬　近7仗第5勝　與前四只差兩分 港超周日早場便出現大比數戰果，主場出...
6,52914,美國大選,回看內政兩大建樹　奧巴馬醫保前景難料　亮麗經濟數字埋隱憂 在任近8年，美國總統奧巴馬在內政上...
7,94059,足球,【牛丸退任圖輯】一年半狂破紀錄　由銅紫荊星章到亞洲最佳女教練 從來，女教練執教男子職業足球隊...
8,41325,足球,【足球】拉爾拉拿絕殺　艾拿戴斯︰英格蘭首捷因幸運硬幣 英格蘭主帥艾拿戴斯首戰取得好開始，憑拉...
9,5203,美國大選,"美國民主共和兩黨初選早期重要日子 二月三月 <p><span style=""font-siz..."


In [4]:
# Summary statistics of the raw article length, measured in characters.
train_set['text'].map(len).describe()

count     3894.000000
mean      1257.410375
std        916.709608
min        105.000000
25%        710.000000
50%       1024.500000
75%       1539.750000
max      19005.000000
Name: text, dtype: float64

Strip: 

1) HTML tags,

2) URLs,

3) Datetime,

4) Numbers, 

5) Stray Unicode whitespace characters (e.g. ideographic and non-breaking spaces)


https://docs.python.org/3/library/html.parser.html

# Data Cleansing and Processing

### Check invalid values

In [5]:
# Per-column count of rows whose text (first result) or tags (second result)
# is null; all zeros means nothing is missing.
(
    train_set.loc[train_set['text'].isnull()].count(),
    train_set.loc[train_set['tags'].isnull()].count(),
)

(id      0
 tags    0
 text    0
 dtype: int64, id      0
 tags    0
 text    0
 dtype: int64)

In [6]:
# Per-column count of rows whose text (first result) or tags (second result)
# is empty once every space character has been removed.
(
    train_set.loc[train_set['text'].map(lambda s: len(s.replace(' ', ''))).eq(0)].count(),
    train_set.loc[train_set['tags'].map(lambda s: len(s.replace(' ', ''))).eq(0)].count(),
)

(id      0
 tags    0
 text    0
 dtype: int64, id      0
 tags    0
 text    0
 dtype: int64)

### Text Cleaning

In [7]:
import re
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content.

    Usage: ``feed()`` the HTML, ``close()`` to flush buffered text, then read
    the concatenated text with ``get_data()``.
    """

    def __init__(self):
        # Let the base class build its own state (it also calls reset())
        # instead of hand-setting parser attributes, which depends on stdlib
        # internals that differ between Python versions.
        super().__init__(convert_charrefs=True)
        self.fed = []  # text fragments, in document order

    def handle_data(self, data):
        """Record one run of text found outside of tags."""
        self.fed.append(data)

    def get_data(self):
        """Return everything collected so far, joined without a separator."""
        return ''.join(self.fed)


class TextCleaner():
    """Strip markup and noise tokens from a single article.

    Args:
        html: Raw article text, possibly containing HTML markup.
        text: Optional plain text to clean instead. When it is omitted or
            falsy (``None`` or ``''``), the text is extracted from ``html``.

    Each ``remove_*`` method returns a cleaned copy of ``self.text`` and does
    not modify the instance; ``get_clean_text`` applies them all in sequence
    and stores the result back on ``self.text``.
    """

    # Patterns are kept exactly as they were so cleaned output is unchanged.
    _URL_RE = re.compile(r'https?:\/\/\S+', flags = re.MULTILINE)
    # NOTE(review): both digit runs are optional, so "月日" or "月5日" with no
    # leading digits is matched as well - confirm this is intended.
    _DATE_RE = re.compile(r'（?\d*月\d*日）?', flags = re.MULTILINE)
    # NOTE(review): both digit runs are optional, so a bare ":" is removed too.
    _TIME_RE = re.compile(r'【?\d*:\d*】?', flags = re.MULTILINE)
    # NOTE(review): every part is optional, so a lone ".", "【" or "】" is
    # removed in addition to numbers.
    _DIGITS_RE = re.compile(r'【?\d*\.?\d*】?', flags = re.MULTILINE)
    # NOTE(review): "/" is inside the character class, so forward slashes are
    # removed along with \r, \n and \t.
    _NEWLINES_RE = re.compile(r'[/\r/\n/\t]*', flags = re.MULTILINE)

    def __init__(self, html, text = None):
        self.html = html
        if not text:
            self.text = self.strip_tags()
        else:
            self.text = text

    def strip_tags(self):
        """Return the text content of ``self.html`` with all tags removed."""
        s = MLStripper()
        s.feed(self.html)
        # Without close() the parser keeps waiting for more input and holds
        # back trailing text that contains an unterminated "&" (e.g. "R&D"),
        # so that text would silently vanish from the result.
        s.close()
        return s.get_data()

    def remove_url(self):
        """Return the text without http:// and https:// URLs."""
        return self._URL_RE.sub('', self.text)

    def remove_date(self):
        """Return the text without "<n>月<n>日" dates (optionally in （）)."""
        return self._DATE_RE.sub('', self.text)

    def remove_time(self):
        """Return the text without "<n>:<n>" times (optionally in 【】)."""
        return self._TIME_RE.sub('', self.text)

    def remove_digits(self):
        """Return the text without integer and decimal numbers."""
        return self._DIGITS_RE.sub('', self.text)

    def remove_newlines(self):
        """Return the text without line breaks, tabs, U+3000 and U+00A0."""
        stripped = self._NEWLINES_RE.sub('', self.text)
        return stripped.replace('\u3000', '').replace('\xa0', '')

    def get_clean_text(self):
        """Run every cleaning step in order and return the final text."""
        # Order matters: URLs go first (they contain ":", "." and digits),
        # and times/dates go before the generic digit removal.
        steps = (
            self.remove_url,
            self.remove_time,
            self.remove_date,
            self.remove_digits,
            self.remove_newlines,
        )
        for step in steps:
            self.text = step()
        return self.text


def clean_text(html):
    """Strip the tags from ``html`` and return the fully cleaned text."""
    return TextCleaner(html).get_clean_text()

In [9]:
# Clean every article and keep the result next to the raw text.
train_set['cleaned'] = train_set['text'].map(clean_text)

In [10]:
# Summary statistics of the article length after cleaning, in characters.
train_set['cleaned'].map(len).describe()

count     3894.000000
mean       862.809964
std        522.950909
min         27.000000
25%        541.000000
50%        733.000000
75%       1030.000000
max      12067.000000
Name: cleaned, dtype: float64

### Encoding Labels

In [14]:
from sklearn import preprocessing

In [15]:
# Fit the encoder on the tag column (fit() returns the encoder itself) and
# show the classes it learned; a class's position is its integer id.
le = preprocessing.LabelEncoder().fit(train_set['tags'])
le.classes_

array(['梁振英', '美國大選', '足球'], dtype=object)

In [16]:
# Translate each tag into its integer id using the fitted encoder.
encoded_tags = le.transform(train_set['tags'])
train_set['tags_id'] = encoded_tags

In [17]:
# Class frequencies by tag name and by encoded id; the two should agree.
tuple(train_set[column].value_counts() for column in ("tags", "tags_id"))

(足球      2123
 梁振英      929
 美國大選     842
 Name: tags, dtype: int64, 2    2123
 0     929
 1     842
 Name: tags_id, dtype: int64)

# Split dataset into training set, testing set and validation set (stratified)

Train 60%, Test 20%, Validation 20%

In [24]:
from sklearn.model_selection import StratifiedShuffleSplit

In [54]:
# One stratified shuffle split: 60% of the rows become the training set and
# the remaining 40% are held back to be divided into test and validation.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)

# split() yields positional row numbers, so rows are selected with .iloc;
# label-based .loc is only correct while the index happens to be 0..n-1.
train_index, test_valid_index = next(split.split(train_set, train_set["tags_id"]))
strat_train_set = train_set.iloc[train_index]
strat_test_valid_set = train_set.iloc[test_valid_index]

In [55]:
# Give the held-out rows a fresh 0..n-1 index. drop=False (the default) keeps
# the previous row labels in a new 'index' column.
# NOTE(review): this cell reassigns its own input, so running it a second
# time adds yet another index column.
strat_test_valid_set = strat_test_valid_set.reset_index(drop=False)

In [56]:
# Halve the held-out rows, again stratified by tag, into a test set and a
# validation set.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# split() yields positional row numbers, so rows are selected with .iloc
# rather than label-based .loc. The names say what each half really is.
test_index, valid_index = next(
    split.split(strat_test_valid_set, strat_test_valid_set["tags_id"])
)
strat_test_set = strat_test_valid_set.iloc[test_index]
strat_valid_set = strat_test_valid_set.iloc[valid_index]

In [57]:
# Tag ids present in the training split, in value_counts() order.
strat_train_set['tags_id'].value_counts().keys()

Int64Index([2, 0, 1], dtype='int64')

In [58]:
def cat_proportions(data):
    """Return the share of rows in ``data`` that carry each ``tags_id`` value.

    The result is a Series indexed by tag id whose values are the tag's row
    count divided by the total number of rows in ``data``.
    """
    tag_counts = data["tags_id"].value_counts()
    total_rows = len(data)
    return tag_counts / total_rows

# Class proportions of the full data and of each stratified split, one row
# per tag id (the Series are aligned on their tag-id index).
compare_props = pd.DataFrame({
    "Overall": cat_proportions(train_set),
    "Stratified Train": cat_proportions(strat_train_set),
    "Stratified Test": cat_proportions(strat_test_set),
    "Stratified Valid": cat_proportions(strat_valid_set),
}).sort_index()

# Derive the tag names from the frame's own index. A separately ordered array
# would be attached by position and could end up next to the wrong tag id.
compare_props.insert(0, "Tag", le.inverse_transform(compare_props.index))

# Relative deviation (in percent) of each split's proportion from the overall one.
for error_column, split_column in (
    ("Train. %error", "Stratified Train"),
    ("Test. %error", "Stratified Test"),
    ("Valid. %error", "Stratified Valid"),
):
    compare_props[error_column] = (
        100 * compare_props[split_column] / compare_props["Overall"] - 100
    )

In [59]:
# Display the per-tag proportions and their %error against the overall data.
compare_props

Unnamed: 0,Tag,Overall,Stratified Train,Stratified Test,Stratified Valid,Train. %error,Test. %error,Valid. %error
0,梁振英,0.238572,0.238442,0.238768,0.238768,-0.054651,0.081941,0.081941
1,美國大選,0.21623,0.216182,0.215661,0.216945,-0.022472,-0.263143,0.330529
2,足球,0.545198,0.545377,0.545571,0.544288,0.032827,0.068508,-0.166947


In [62]:
# Persist the full frame and the three stratified splits as pickle files in
# the working directory.
for frame, pickle_path in (
    (train_set, "train_set.pkl"),
    (strat_train_set, 'strat_train_set.pkl'),
    (strat_test_set, 'strat_test_set.pkl'),
    (strat_valid_set, 'strat_valid_set.pkl'),
):
    frame.to_pickle(pickle_path)