# TFIDF
* Using tf-idf as our feature vector
* Classify usins RandomForest-Xgboost

In [0]:
# import package
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc, f1_score
from sklearn.model_selection import KFold
from xgboost import XGBClassifier

In [0]:
# parameters
NUM_CLASS = 6

# load data
DATA = pd.read_csv('train_tokenize_nostem.csv')
LABEL = DATA.loc[:,'BACKGROUND':'OTHERS']
ID = DATA.loc[:, 'Id']
LENGTH = DATA.loc[:,'LENGTH']
TOKEN = DATA.loc[:,'TOKEN']
del DATA

## Initiate TfidfVectorizer
* might change **max_features** here, incase too large matrix which lead to curse of dimentionality

In [0]:
# prepare TFIDF and LABEL (ignore examples with LENGTH==1)
TFIDF = TfidfVectorizer(max_features=1000)

train_TFIDF = TFIDF.fit_transform(TOKEN[LENGTH>1])
train_feature_name = TFIDF.get_feature_names()

train_TFIDF = train_TFIDF.todense() # output from TfidfVectorizer is a special sparse matrix format, convert to normal np.matrix
train_LABEL = LABEL[LENGTH>1]

# reset index since we drop some rows
train_LABEL.reset_index(drop=True, inplace=True) # do it inplace and drop the original index

In [0]:
# inspect the result TFIDF
print(train_TFIDF.shape)
print(train_TFIDF[0:5])
print(train_LABEL.shape)
print(train_LABEL.head())
print(train_feature_name[0:10])

(46836, 1000)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(46836, 6)
   BACKGROUND  OBJECTIVES  METHODS  RESULTS  CONCLUSIONS  OTHERS
0           1           0        0        0            0       0
1           0           1        0        0            0       0
2           0           0        1        0            0       0
3           0           0        1        0            0       0
4           0           0        0        1            0       0
['abil', 'abl', 'about', 'abstract', 'acceler', 'access', 'accord', 'account', 'accur', 'accuraci']


In [0]:
# Training stuff
clf = XGBClassifier(max_depth=5, n_estimators=500,learning_rate=0.1, colsample_bytree=1)
CV = KFold(n_splits=5,shuffle=True) 
RESULT = {} # for storing result

## Start training
* Implement 5-fold CV to test the model
* Seperate the task into 6 pieces, train a classifier for each label (since each lablel are not mutually exclusive)
* Haven't store the model here. the below section is just for testing...
* The following training take **REALLY LONG** time

In [0]:
print("RESULT:")
#start training
F1 = [] #store the result F1-score
for label in LABEL.columns:
    train_y = train_LABEL[label]
    
    for train, val in CV.split(train_TFIDF,train_y):
        
        clf.fit(train_TFIDF[train], train_y[train])
        pred = clf.predict(train_TFIDF[val])
        
        f1 = f1_score(pred, train_y[val], average='binary')
        F1.append(f1)
        
    print('{} result f1_score = {}'.format(label, np.mean(F1)))
    RESULT[label] = np.mean(F1)

### RESULT:
* BACKGROUND   f1_score = 0.6736502513183271
* OBJECTIVES   f1_score = 0.5455485249051981
* METHODS      f1_score = 0.52920643474413
* RESULTS      f1_score = 0.5361403224106451
* CONCLUSIONS  f1_score = 0.4580933006792408
* OTHERS       f1_score = 0.16020985052078546

# Then, retrain on whole training set and apply on test data

In [0]:
# load test data
TESTDATA = pd.read_csv('test_tokenize_nostem.csv')

test_TFIDF = TFIDF.transform(TESTDATA['TOKEN'])
test_TFIDF = test_TFIDF.todense()

print(test_TFIDF.shape)

(131166, 1000)


In [0]:
# Retrain on whole training data
for label in LABEL.columns:
    if label != 'OTHERS':
      # init a new classifier
      clf = XGBClassifier(max_depth=5, n_estimators=500,learning_rate=0.1, colsample_bytree=1)
      # training
      train_y = train_LABEL[label]
      clf.fit(train_TFIDF, train_y)
      # Predict
      y_pred = clf.predict_proba(test_TFIDF)
      y_pred[TESTDATA['LENGTH']==1] = 0 # force the length==1 sentences to OTHERS
      # store the result 
      TESTDATA[label] = y_pred

In [0]:
# Assign to OTHERS
TESTDATA['OTHERS'] = 0  # init as 0
TESTDATA.loc[TESTDATA['LENGTH']==1, 'OTHERS'] = 1

for _,row in TESTDATA.iterrows():
  if row['BACKGROUND':'CONCLUSIONS'].sum() == 0:
    row['OTHERS'] = 1

In [0]:
# print some result
TESTDATA.head()

Unnamed: 0,Id,TOKEN,LENGTH,BACKGROUND,OBJECTIVES,METHODS,RESULTS,CONCLUSIONS,OTHERS
0,T00001,"['mobil', 'crowdsens', 'is', 'a', 'promis', 'p...",23.0,1,0,0,0,0,0
1,T00001,"['as', 'a', 'fundament', 'properti', 'of', 'mo...",38.0,1,0,0,0,0,0
2,T00001,"['therefor', 'a', 'mechan', 'is', 'requir', 'f...",34.0,0,0,0,0,0,0
3,T00001,"['in', 'this', 'paper', 'we', 'develop', 'a', ...",28.0,0,1,0,0,0,0
4,T00001,"['via', 'theoret', 'analysi', 'we', 'demonstr'...",10.0,0,0,0,1,0,0


In [0]:
# save to csv file
TESTDATA.to_csv('TFIDF+Xgb.csv', index=False)

## More to do
* Change the TFIDF ```max_features```
* Change the tokenize method
  * Here I use my own method
  * Can use ```nltk.tokenizer``` instead (Then, no need to run **TOKENIZE.ipynb** beforehand)
* Since **OTHERS** is mutually exclusive with other labels, it might not need to be predicted (the version above already implement this thought)

## Change Log
### 2019/12/10
  * Change ```clf.predict``` to ```clf.predict_proba``` to output category probability in order to facilitate the later ensembling
  * Remove the part that directly assign **OTHERS** to the row that with whole zeors vector - to reduce the noise
  * Using *_nostem* version
  * Append Sentences position and Abstract length