# Text feature engineering: turning unstructured text into structured features

In [1]:
import re
import pandas as pd
import numpy as np
import pickle
import os

## Move up to the main directory so the relative paths used below resolve.
## NOTE(review): the path is relative to the current directory, so re-running this
## cell climbs one more level each time — run it once per kernel session.
os.chdir("../")
os.getcwd()

'/home/jovyan/Davis_Practice/NLP'

In [2]:
## Load the preprocessed records; later cells read its 'content', 'type' and 'idx' columns.
df = pd.read_csv('data/1999_preprocessed.csv')

In [3]:
## Load 'article_cutted'. Each item is joined with spaces further down, so it is
## presumably one list of tokens per article — TODO confirm against the preprocessing step.
## NOTE(review): pickle.load can execute arbitrary code; only open files produced by this project.
with open("article_cutted", "rb") as file:
    sentences = pickle.load(file)

## define y (push > boo)

In [4]:
## drop data (disabled: the filter below is commented out, so no rows are removed)
#diff_threshold = 20
#df = df[abs(df['push']-df['boo']) > diff_threshold].copy()

In [5]:
## define y (disabled: the lines below are commented out, so 'type' keeps the values loaded from the CSV)
#df['type'] = np.clip(df['push']-df['boo'], 0, 1)
#df = df.reset_index(drop=True)

In [6]:
## class distribution of the label column
df.type.value_counts()

SAP-168(QCI)     3031
BPM平台系統          1073
SAP-2X8(QSMC)     749
SAP Issue         739
GOC               268
CAMP              143
SAP-GS             44
Name: type, dtype: int64

## Simple features

In [7]:
## word count: runs of ASCII letters/digits plus individual CJK characters
## (regular expressions for Chinese text: http://blog.csdn.net/gatieme/article/details/43235791)
df['word_count'] = (
    df['content'].str.count('[a-zA-Z0-9]+')
    .add(df['content'].str.count('[\u4e00-\u9fff]'))
)

In [8]:
## punctuation count
## Strip every word character and whitespace; whatever remains is punctuation/symbols.
## regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
## literal string by default, which would leave the text untouched. The raw string
## avoids the invalid '\w' / '\s' escape sequences.
df['punctuation'] = df['content'].str.replace(r'[\w\s]', '', regex=True)
df['punctuation_count'] = df['punctuation'].str.len()

In [9]:
## question mark count (half-width '?' and full-width '？')
QUESTION_MARK_PATTERN = '[?？]'
df['question_count'] = df['punctuation'].str.count(QUESTION_MARK_PATTERN)

In [10]:
## the helper 'punctuation' column is no longer needed once the counts exist
df = df.drop(columns='punctuation')

In [11]:
## preview the first rows of the last four columns
df.iloc[:, -4:].head()

Unnamed: 0,idx,word_count,punctuation_count,question_count
0,0,25,1,0
1,1,25,1,0
2,2,22,2,0
3,3,22,2,0
4,4,25,4,0


In [12]:
## pairwise correlation between the last four columns
tail_columns = df.iloc[:, -4:]
tail_columns.corr()

Unnamed: 0,idx,word_count,punctuation_count,question_count
idx,1.0,-0.099505,-0.101816,-0.045965
word_count,-0.099505,1.0,0.656503,0.215064
punctuation_count,-0.101816,0.656503,1.0,0.151011
question_count,-0.045965,0.215064,0.151011,1.0


## bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
## define transformer
## Each item of `sentences` is joined with spaces so the vectorizer receives one string per article.
## NOTE(review): CountVectorizer's default token_pattern keeps only tokens of two or
## more word characters, so single-character tokens are dropped — confirm this is intended.
documents = [' '.join(tokens) for tokens in sentences]
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(documents)

In [15]:
## show the sparse count matrix returned by fit_transform
count

<6047x6176 sparse matrix of type '<class 'numpy.int64'>'
	with 46025 stored elements in Compressed Sparse Row format>

In [16]:
## persist the fitted vectorizer together with its count matrix (pickle format)
with open("article_count", "wb") as out_file:
    pickle.dump([vectorizer, count], out_file)

### Select the 10 most frequent words

In [17]:
## invert the vocabulary (word -> column id) into a lookup of column id -> word
id2word = {
    column_id: word
    for word, column_id in vectorizer.vocabulary_.items()
}

In [18]:
## column-wise sum = total frequency of every word across all documents
column_totals = count.sum(axis=0)
sum_ = np.array(column_totals)[0]

In [19]:
## column ids of the 10 most frequent words, highest first
ranked_sum_ids = np.argsort(sum_)[::-1]
most_sum_id = ranked_sum_ids[:10].tolist()
most_sum_id

[4963, 2047, 2434, 5090, 3566, 5422, 3262, 3733, 3454, 5263]

In [20]:
## translate the top-10 column ids back into their words
features = [id2word[word_id] for word_id in most_sum_id]
features

['無法', 'sap', 'user', '登入', '協助', '處理', '使用', '問題', '分機', '系統']

In [21]:
## build a dense table: rows picked by df.idx, columns restricted to the top-10 words
row_ids = df.idx.values
top_counts = count[row_ids, :][:, most_sum_id]
data = pd.DataFrame(top_counts.toarray(), columns=features)
data.head()

Unnamed: 0,無法,sap,user,登入,協助,處理,使用,問題,分機,系統
0,1,0,0,1,1,0,0,1,0,1
1,1,0,0,1,1,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0


In [22]:
## raw text of the first record, to compare against its row in the table above
df['content'].iloc[0]

'無法透過 CAMP 上SIT 系統 icon 的連接登入. 可否協助看一下這問題'

In [23]:
## compute correlation
## Put the label column next to the top-10 word counts.
data = pd.concat([df.type, data], axis=1)
## 'type' holds category names rather than numbers, so the correlation is restricted
## to numeric columns; DataFrame.corr() raises on string columns in pandas >= 2.0.
data.select_dtypes(include='number').corr()

Unnamed: 0,無法,sap,user,登入,協助,處理,使用,問題,分機,系統
無法,1.0,0.107298,-0.055462,0.350984,-0.000922,0.01202,0.161743,-0.097215,0.038422,0.070975
sap,0.107298,1.0,-0.007743,0.175514,-0.001514,0.009,0.03864,-0.03116,-0.002014,0.069977
user,-0.055462,-0.007743,1.0,0.009306,0.302069,0.312052,0.073052,0.037747,0.078424,0.022229
登入,0.350984,0.175514,0.009306,1.0,-0.009539,-0.010335,0.012937,-0.066555,0.00495,0.061949
協助,-0.000922,-0.001514,0.302069,-0.009539,1.0,0.75128,0.130673,0.098966,-0.016344,0.106479
處理,0.01202,0.009,0.312052,-0.010335,0.75128,1.0,0.134291,0.084447,-0.010419,0.123097
使用,0.161743,0.03864,0.073052,0.012937,0.130673,0.134291,1.0,-0.005337,0.014288,0.140883
問題,-0.097215,-0.03116,0.037747,-0.066555,0.098966,0.084447,-0.005337,1.0,0.052613,0.029043
分機,0.038422,-0.002014,0.078424,0.00495,-0.016344,-0.010419,0.014288,0.052613,1.0,-0.01886
系統,0.070975,0.069977,0.022229,0.061949,0.106479,0.123097,0.140883,0.029043,-0.01886,1.0


## TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
## define transformer
## norm=None: keep the raw tf-idf weights, without per-document normalisation
documents = [' '.join(tokens) for tokens in sentences]
vectorizer = TfidfVectorizer(norm=None)
tfidf = vectorizer.fit_transform(documents)

In [26]:
## persist the fitted vectorizer together with its tf-idf matrix (pickle format)
with open("article_tfidf", "wb") as out_file:
    pickle.dump([vectorizer, tfidf], out_file)

### Select the 10 words with the highest average TF-IDF

In [27]:
## invert the vocabulary (word -> column id) into a lookup of column id -> word
id2word = {
    column_id: word
    for word, column_id in vectorizer.vocabulary_.items()
}

In [28]:
## column-wise average: mean tf-idf of each word over the documents where it is non-zero
doc_freq = (tfidf != 0).sum(axis=0)
avg = tfidf.sum(axis=0) / doc_freq

## zero out words whose document frequency is below 20
avg[doc_freq < 20] = 0

In [29]:
## Flatten the single-row result into a 1-D array. ravel() leaves an already flat
## array unchanged, so re-running this cell is safe (indexing with [0] a second
## time would collapse the array to a scalar and break argsort below).
avg = np.asarray(avg).ravel()

In [30]:
## column ids of the 10 words with the highest average tf-idf, highest first
ranked_avg_ids = np.argsort(avg)[::-1]
most_avg_id = ranked_avg_ids[:10].tolist()
most_avg_id

[1964, 4527, 3193, 1246, 631, 4018, 6145, 4979, 2419, 858]

In [31]:
## translate the top-10 column ids back into their words
features = [id2word[word_id] for word_id in most_avg_id]
features

['request', '数量', '代理人', 'item', 'cs12', '專案', '驗收', '版本', 'upload', 'ec']

In [32]:
## build a dense table: rows picked by df.idx, columns restricted to the top-10 tf-idf words
row_ids = df.idx.values
top_tfidf = tfidf[row_ids, :][:, most_avg_id]
data = pd.DataFrame(top_tfidf.toarray(), columns=features)
data.head()

Unnamed: 0,request,数量,代理人,item,cs12,專案,驗收,版本,upload,ec
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
## compute correlation
## Put the label column next to the top-10 tf-idf features.
data = pd.concat([df.type, data], axis=1)
## 'type' holds category names rather than numbers, so the correlation is restricted
## to numeric columns; DataFrame.corr() raises on string columns in pandas >= 2.0.
data.select_dtypes(include='number').corr()

Unnamed: 0,request,数量,代理人,item,cs12,專案,驗收,版本,upload,ec
request,1.0,-0.002482,-0.002975,-0.003058,-0.001895,-0.002633,-0.002984,-0.002611,-0.002436,-0.004386
数量,-0.002482,1.0,-0.002994,0.035937,0.014522,-0.00265,-0.003003,-0.002628,-0.002452,0.100154
代理人,-0.002975,-0.002994,1.0,-0.003688,-0.002286,-0.003176,-0.003599,-0.003149,-0.002939,-0.00529
item,-0.003058,0.035937,-0.003688,1.0,-0.002349,-0.003265,-0.0037,-0.003237,-0.003021,0.002516
cs12,-0.001895,0.014522,-0.002286,-0.002349,1.0,-0.002023,-0.002293,-0.002006,-0.001872,0.043523
專案,-0.002633,-0.00265,-0.003176,-0.003265,-0.002023,1.0,-0.003186,-0.002788,-0.002601,-0.004683
驗收,-0.002984,-0.003003,-0.003599,-0.0037,-0.002293,-0.003186,1.0,-0.003159,-0.002948,-0.005307
版本,-0.002611,-0.002628,-0.003149,-0.003237,-0.002006,-0.002788,-0.003159,1.0,-0.002579,-0.004643
upload,-0.002436,-0.002452,-0.002939,-0.003021,-0.001872,-0.002601,-0.002948,-0.002579,1.0,-0.004333
ec,-0.004386,0.100154,-0.00529,0.002516,0.043523,-0.004683,-0.005307,-0.004643,-0.004333,1.0
