# Text feature engineering: turning unstructured text into structured features

In [1]:
import re
import pandas as pd
import numpy as np
import pickle
import os

## move the working directory up one level so that later relative paths
## (e.g. 'data/article_preprocessed.csv') resolve from the parent directory
## NOTE(review): the path is relative to the current directory, so re-running
## this cell moves up one more level each time
os.chdir("../")
## show the resulting working directory
os.getcwd()

'/home/kyoyachuan/class_text_mining'

In [2]:
## read the preprocessed article table (path is relative to the working directory)
csv_path = 'data/article_preprocessed.csv'
df = pd.read_csv(csv_path)

In [74]:
## load the pickled 'article_cutted' object into `sentences`
## NOTE(review): pickle.load can execute arbitrary code; only load files you trust
with open("article_cutted", mode="rb") as pickled_file:
    sentences = pickle.load(pickled_file)

## Define the label y (1 if push > boo, otherwise 0)

In [4]:
## drop articles whose |push - boo| is not larger than the threshold
diff_threshold = 20
vote_gap = (df['push'] - df['boo']).abs()
df = df[vote_gap > diff_threshold].copy()

In [5]:
## define y: clip (push - boo) into [0, 1], so positive gaps become 1 and negative gaps 0
net_votes = df['push'] - df['boo']
df['type'] = net_votes.clip(lower=0, upper=1)
## renumber the rows 0..n-1 after the filtering step
df = df.reset_index(drop=True)

In [6]:
## class balance of the label column
label_counts = df['type'].value_counts()
label_counts

1    17318
0     1134
Name: type, dtype: int64

## simple feature

In [7]:
## word count: each run of ASCII letters/digits counts once,
## and each character in the range U+4E00..U+9FFF counts once
## http://blog.csdn.net/gatieme/article/details/43235791 (Chinese regular expressions)
ascii_runs = df['content'].str.count('[a-zA-Z0-9]+')
cjk_chars = df['content'].str.count('[\u4e00-\u9fff]')
df['word_count'] = ascii_runs + cjk_chars

In [8]:
## punctuation count
## remove every word character and whitespace; whatever remains is treated as punctuation
## - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
##   string by default, which would leave the text untouched
## - the raw string avoids the invalid '\w' / '\s' escape-sequence warnings
df['punctuation'] = df['content'].str.replace(r'[\w\s]', '', regex=True)
df['punctuation_count'] = df['punctuation'].str.len()

In [9]:
## question mark count: the character class holds both '?' variants used in the text
question_mark_pattern = '[?？]'
df['question_count'] = df['punctuation'].str.count(question_mark_pattern)

In [10]:
## the helper 'punctuation' column is no longer needed once the counts exist
df = df.drop(columns=['punctuation'])

In [11]:
## preview the first five rows of the last four columns
df.iloc[:, -4:].head(5)

Unnamed: 0,type,word_count,punctuation_count,question_count
0,1,175,0,0
1,1,145,4,1
2,1,393,17,8
3,1,295,15,6
4,1,41,4,0


In [12]:
## pairwise correlation between the last four columns
last_four_columns = df.iloc[:, -4:]
last_four_columns.corr()

Unnamed: 0,type,word_count,punctuation_count,question_count
type,1.0,-0.0451,-0.024124,-0.056966
word_count,-0.0451,1.0,0.738419,0.5349
punctuation_count,-0.024124,0.738419,1.0,0.34215
question_count,-0.056966,0.5349,0.34215,1.0


## bag of words

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
## define the transformer
vectorizer = CountVectorizer()
## each element of `sentences` is joined with spaces into one document string
documents = [' '.join(tokens) for tokens in sentences]
count = vectorizer.fit_transform(documents)

In [77]:
## display the repr of the matrix returned by fit_transform
count

<252229x372654 sparse matrix of type '<class 'numpy.int64'>'
	with 7700068 stored elements in Compressed Sparse Row format>

In [78]:
## persist the fitted vectorizer together with the count matrix (pickle format)
with open("article_count", mode="wb") as out_file:
    pickle.dump([vectorizer, count], out_file)

### Select the 10 most frequent words

In [79]:
## invert the vectorizer vocabulary (word -> id) into a lookup of id -> word
vocabulary = vectorizer.vocabulary_
id2word = dict(zip(vocabulary.values(), vocabulary.keys()))

In [80]:
## column-wise sum: total frequency of every word across all rows of `count`
column_totals = count.sum(axis=0)
## take the first (only) row of the summed result as a flat array
sum_ = np.asarray(column_totals)[0]

In [81]:
## word ids of the 10 largest totals: ascending argsort, reversed, first ten
ids_by_descending_total = np.argsort(sum_)[::-1]
most_sum_id = ids_by_descending_total[:10].tolist()
most_sum_id

[73627, 198934, 95899, 37001, 243708, 258736, 257519, 305714, 256024, 283981]

In [82]:
## translate the top-10 word ids back into the words themselves
features = [id2word[word_id] for word_id in most_sum_id]
features

['八卦', '有沒有', '台灣', '一個', '現在', '知道', '真的', '覺得', '看到', '肥宅']

In [106]:
## print the data
## df['idx'] is used as row positions into `count`
## (presumably each article's position in `sentences` — TODO confirm)
## Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
row_ids = df['idx'].to_numpy()
## keep only the selected rows, then only the top-10 word columns, and densify
data = pd.DataFrame(count[row_ids, :][:, most_sum_id].toarray(), columns=features)
data.head()

Unnamed: 0,八卦,有沒有,台灣,一個,現在,知道,真的,覺得,看到,肥宅
0,1,1,1,0,2,0,0,1,0,1
1,1,1,0,0,0,0,0,0,0,0
2,0,0,0,1,1,0,0,1,0,0
3,1,2,0,2,0,2,0,0,0,0
4,0,0,0,1,0,0,0,0,1,0


In [107]:
## show the raw text of the first article to compare against the counts above
df['content'].iloc[0]

'反核覺青現在最強招式就是 不然燃料棒放你家 肥宅我覺得 把燃料棒放到其他國家不就好了 一定會有缺錢的國家 台灣塞錢給他們 買他們國家的空間放 一來燃料棒問題解決 核電重啟 台灣缺點問題解決 大家有冷氣吹 台積電不出走 繼續救台灣 二來有買賣就有貪污空間 政客也有賺頭 不會像現在沒糖吃該該叫 送錢出去 邦交國搞不好也會多幾個 簡直是雙贏 核電燃料棒 跟其他國家買空間放不就好了 有沒有相關八卦 '

In [108]:
## compute correlation between the label and the top-10 word counts
## the feature columns are selected explicitly so that re-running this cell
## does not prepend another 'type' column to `data`
data = pd.concat([df['type'], data[features]], axis=1)
data.corr()

Unnamed: 0,type,八卦,有沒有,台灣,一個,現在,知道,真的,覺得,看到,肥宅
type,1.0,-0.056491,0.000898,-0.048109,-0.032585,-0.01812,-0.019432,-0.021593,-0.012116,-0.048172,-0.013955
八卦,-0.056491,1.0,0.431786,-0.001901,0.062416,0.05267,0.091136,0.084516,0.066589,0.066047,0.07721
有沒有,0.000898,0.431786,1.0,0.032712,0.094241,0.105734,0.144891,0.105919,0.095098,0.089291,0.043573
台灣,-0.048109,-0.001901,0.032712,1.0,0.194281,0.188974,0.166716,0.162471,0.125782,0.137087,-0.033735
一個,-0.032585,0.062416,0.094241,0.194281,1.0,0.400985,0.523627,0.398092,0.370977,0.327872,0.02633
現在,-0.01812,0.05267,0.105734,0.188974,0.400985,1.0,0.418126,0.34847,0.30607,0.230148,0.009126
知道,-0.019432,0.091136,0.144891,0.166716,0.523627,0.418126,1.0,0.486961,0.39152,0.336525,0.037975
真的,-0.021593,0.084516,0.105919,0.162471,0.398092,0.34847,0.486961,1.0,0.461401,0.331108,0.039239
覺得,-0.012116,0.066589,0.095098,0.125782,0.370977,0.30607,0.39152,0.461401,1.0,0.270267,0.026513
看到,-0.048172,0.066047,0.089291,0.137087,0.327872,0.230148,0.336525,0.331108,0.270267,1.0,0.020214


# TF-IDF

In [109]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [110]:
## define the transformer
vectorizer = TfidfVectorizer(norm=None) ## norm=None: no normalization is applied
## each element of `sentences` is joined with spaces into one document string
documents = [' '.join(tokens) for tokens in sentences]
tfidf = vectorizer.fit_transform(documents)

In [111]:
## persist the fitted vectorizer together with the tf-idf matrix (pickle format)
with open("article_tfidf", mode="wb") as out_file:
    pickle.dump([vectorizer, tfidf], out_file)

### Select the 10 words with the highest average TF-IDF

In [112]:
## invert the vectorizer vocabulary (word -> id) into a lookup of id -> word
vocabulary = vectorizer.vocabulary_
id2word = dict(zip(vocabulary.values(), vocabulary.keys()))

In [115]:
## column-wise average: a word's summed tf-idf divided by the number of
## documents in which its weight is non-zero
doc_freq = (tfidf!=0).sum(axis=0)
avg = tfidf.sum(axis=0) / doc_freq

## zero out words whose document frequency is below 20
avg[doc_freq<20] = 0

In [126]:
## flatten the averaged row into a 1-D array
## .ravel() is used instead of [0] so the cell is safe to re-run: applying [0]
## to the already-flat array would collapse it to a single scalar
avg = np.array(avg).ravel()

In [127]:
## word ids of the 10 largest averages: ascending argsort, reversed, first ten
ids_by_descending_avg = np.argsort(avg)[::-1]
most_avg_id = ids_by_descending_avg[:10].tolist()
most_avg_id

[90835, 325364, 157970, 263428, 357411, 5490, 47011, 33207, 51405, 183683]

In [128]:
## translate the top-10 word ids back into the words themselves
features = [id2word[word_id] for word_id in most_avg_id]
features

['原告', '轉帳', '忍術', '稅後', '震度', 'charlie', '中山路', 'united', '二段', '支出']

In [129]:
## print the data
## df['idx'] is used as row positions into `tfidf`
## (presumably each article's position in `sentences` — TODO confirm)
## Series.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement
row_ids = df['idx'].to_numpy()
## keep only the selected rows, then only the top-10 word columns, and densify
data = pd.DataFrame(tfidf[row_ids, :][:, most_avg_id].toarray(), columns=features)
data.head()

Unnamed: 0,原告,轉帳,忍術,稅後,震度,charlie,中山路,united,二段,支出
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.395045,0.0


In [130]:
## compute correlation between the label and the top-10 tf-idf columns
## the feature columns are selected explicitly so that re-running this cell
## does not prepend another 'type' column to `data`
data = pd.concat([df['type'], data[features]], axis=1)
data.corr()

Unnamed: 0,type,原告,轉帳,忍術,稅後,震度,charlie,中山路,united,二段,支出
type,1.0,-0.004499,0.00209,0.002664,0.00197,0.007263,0.002013,0.001974,0.003889,0.001882,0.002063
原告,-0.004499,1.0,-0.000221,-0.000177,-0.000238,-0.000483,-0.000134,-0.000131,-0.000258,-0.000132,-0.000226
轉帳,0.00209,-0.000221,1.0,-0.000136,0.999764,-0.00037,-0.000102,-0.0001,-0.000198,-0.000101,0.999931
忍術,0.002664,-0.000177,-0.000136,1.0,-0.000146,-0.000296,-8.2e-05,-8e-05,-0.000158,-8.1e-05,-0.000138
稅後,0.00197,-0.000238,0.999764,-0.000146,1.0,-0.000397,-0.00011,-0.000108,-0.000213,-0.000109,0.999715
震度,0.007263,-0.000483,-0.00037,-0.000296,-0.000397,1.0,-0.000223,-0.000219,-0.000431,-0.000221,-0.000377
charlie,0.002013,-0.000134,-0.000102,-8.2e-05,-0.00011,-0.000223,1.0,-6.1e-05,-0.00012,-6.1e-05,-0.000104
中山路,0.001974,-0.000131,-0.0001,-8e-05,-0.000108,-0.000219,-6.1e-05,1.0,-0.000117,0.999799,-0.000102
united,0.003889,-0.000258,-0.000198,-0.000158,-0.000213,-0.000431,-0.00012,-0.000117,1.0,-0.000118,-0.000202
二段,0.001882,-0.000132,-0.000101,-8.1e-05,-0.000109,-0.000221,-6.1e-05,0.999799,-0.000118,1.0,-0.000103
