In [1]:
import pandas as pd
from sklearn.metrics import classification_report
from skmultilearn.problem_transform import ClassifierChain
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
import nltk
from tools import *
import GetData
from collections import Counter
import sklearn.utils as us

Using TensorFlow backend.


In [2]:
lancaster_stemmer = nltk.LancasterStemmer()
stop_words = GetData.get_stopwords("data/stopwords.txt")


In [109]:
def read_data(filename, shuffle=True):
    rawdata = pd.read_csv(filename, sep="|")
    if shuffle:
        rawdata = us.shuffle(rawdata,random_state=1994)  # shuffle data
    return rawdata

def preprocess(data, content):

    # word tokenize
    X_wt = [word_tokenize(x) for x in GetData.get_content(data, content)]
    # 去停用词及文末句号
    X_st = [[del_tail_dot(word.lower()) for word in document if not (word.lower() in stop_words or word == "")]
                  for document in X_wt]
    X_ls = [[lancaster_stemmer.stem(word) for word in document] for document in X_st]
    # 字符串列表连接
    X_ls_merge = [MergeWord(document) for document in X_ls]
    return X_ls_merge

In [110]:
def data_scaler(data, train=True):
    """
    数据标准化
    """
    if train:
        ss = StandardScaler()
        train_scale = ss.fit_transform(data)
        save_model(ss, "model/flow/ss")
        return train_scale
    elif not train:
        ss = load_model("model/flow/ss")
        data = ss.transform(data)
        return data
    else:
        raise Exception("Input correct values of train: True or false")

def get_TfIdf(data,train=True):
    """
    获取TfIdf特征
    """
    if train:
        TfidfVec = TfidfVectorizer(max_df=0.1, min_df=0.01, ngram_range=(1,2),stop_words='english')
        dataTfIdf = TfidfVec.fit_transform(data)
        save_model(TfidfVec, "model/flow/tfidf")
        return dataTfIdf.toarray()
    elif not train:
        model = load_model("model/flow/tfidf")
        return model.transform(data).toarray()
    else:
        raise Exception("Input correct values of train: True or false")


def get_LDA(data, train=True):
    """
    获取LDA特征
    """
    if train:
        CntVec = CountVectorizer(min_df=0.01, ngram_range=(1, 1))
        lda = LatentDirichletAllocation(n_components=100,learning_method='batch',
                                    random_state=0)
        data = CntVec.fit_transform(data)
        data = lda.fit_transform(data)
        save_model(CntVec, "model/flow/CntVec")
        save_model(lda, "model/flow/LDA")
        return data
    elif not train:
        cntmodel = load_model("model/flow/CntVec")
        ldampdel = load_model("model/flow/LDA")
        data = cntmodel.transform(data)
        data = ldampdel.transform(data)
        return data
    else:
        raise Exception("Input correct values of train: True or false")

In [111]:
def train_model(X, y, strategy):
    X = np.array(X)
    y = np.array(y)
    # clf = SVC(C=1,kernel='rbf',probability=True, gamma='scale') # svc without class_weight
    # clf = SVC(C=10,kernel='rbf',class_weight='balanced',probability=True, gamma='scale')  # svc with class_weight
    clf = XGBClassifier(subsample=0.8, colsample_bytree=0.8, max_depth=5,n_estimators=200)
    # clf = XGBClassifier(learning_rate=0.1, n_estimators=150, max_depth=5,
    #                     min_child_weight=1, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
    #                     objective='binary:logistic', nthread=4, scale_pos_weight=1)
    print(clf)
    if strategy=='ovr':  # OneVsRest strategy also known as BinaryRelevance strategy
        ovr = OneVsRestClassifier(clf)
        ovr.fit(X, y)
        save_model(ovr, "model/flow/ovr")
        return ovr
    elif strategy=='classifier_chains':
        cc = ClassifierChain(clf)
        cc.fit(X, y)
        save_model(cc, "model/flow/cc")
        return cc
    else:
        raise Exception("Correct strategies：ovr or classifier_chains")

def predict(data):
    pass

def evaluation(y_test, preds):
    print(classification_report(y_test, preds))


In [6]:
train_name = "data/Data_flow/trainset.csv"
trainset= read_data(train_name)
save_model(trainset['DOI'], "data/Data_flow/trian_DOIs")
print("preprocess data......")
X_train_abs = preprocess(trainset,'N_ABS')
X_train_titkw = preprocess(trainset,['TITLE', 'KEY_WORDS'])
y_train = [get_multiple_label(x, ['F','I','P']) for x in trainset['FLOW']]
# print(X_train_titkw[0:10])
# print(trainset.TITLE.head(10))
y_train = np.array([get_multiple_label(x, ['F','I','P']) for x in trainset['FLOW']])
# print(y_train[0:10])

test_name = "data/Data_flow/testset.csv"
testset= read_data(test_name, shuffle=False)
X_test_abs = preprocess(testset, 'N_ABS')
X_test_titkw = preprocess(testset, ['TITLE','KEY_WORDS'])
y_test = np.array([get_multiple_label(x, ['F', 'I', 'P']) for x in testset['FLOW']])
# print(testset[0:10])
# print(X_test_titkw[0:10])
# print(y_test)

# 标签计数
print("statistics of labels:")
target = [''.join(list(map(str, e))) for e in y_train]
print(sorted(Counter(target).items()))

print("generating features......")
X_train_abs = get_TfIdf(X_train_abs, train=True)
print("the shape of tfidf features: ", X_train_abs.shape[1])
X_test_abs = get_TfIdf(X_test_abs, train=False)
# print(X_test_abs[0])
# print(X_test_abs.shape)

X_train_titkw = get_LDA(X_train_titkw, train=True)
print("the shape of LDA features: ", X_train_titkw.shape[1])
X_test_titkw = get_LDA(X_test_titkw, train=False)
# print(X_test_titkw[0])
# print(X_test_titkw.shape)

# merge data
# print(X_train_abs.shape, X_train_titkw.shape)
# print(X_test_abs.shape, X_test_titkw.shape)
X_train_merge =  merge_features(X_train_abs , X_train_titkw)
X_test_merge = merge_features(X_test_abs, X_test_titkw)

#scale data
X_train = data_scaler(X_train_merge, train=True)
X_test = data_scaler(X_test_merge,train=False)
print(X_train.shape)
print(X_test.shape)
print(type(X_train))
print(type(X_test))

preprocess data......
statistics of labels:
[('001', 229), ('010', 82), ('011', 16), ('100', 33), ('101', 3), ('110', 2), ('111', 1)]
generating features......
the shape of tfidf features:  1382
the shape of LDA features:  100
(366, 1482)
(41, 1482)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [37]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import linear_model

In [12]:
strategy = 'classifier_chains'
model =  train_model(X_train, y_train, strategy)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.8, verbosity=1)


In [13]:
train_preds = model.predict(X_train)
train_proba = model.predict_proba(X_train)
test_preds = model.predict(X_test)
test_proba = model.predict_proba(X_test)

In [69]:
def model_select(X_train,y_train,X_test,y_test):
    xgb = XGBClassifier(n_estimators=500)
    rf = RandomForestClassifier(max_depth=3,n_estimators=5000,n_jobs=-1)
    svm = SVC()
    gbdt = GradientBoostingClassifier()
    lr = linear_model.LogisticRegression()
    models = {'Xgboost':xgb, 'RandomForests': rf, 'SVM':svm,'GBDT':gbdt,"LogReg":lr}
    for name, model in models.items():
        cc = ClassifierChain(model)
        print(cc)
        cc.fit(X_train,y_train)
        predictions = cc.predict(X_test)
        print("classification report of %s: " % name )
        print(classification_report(y_test, predictions))

In [70]:
model_select(X_train, y_train, X_test, y_test)

ClassifierChain(classifier=XGBClassifier(base_score=0.5, booster='gbtree',
                                         colsample_bylevel=1,
                                         colsample_bynode=1, colsample_bytree=1,
                                         gamma=0, learning_rate=0.1,
                                         max_delta_step=0, max_depth=3,
                                         min_child_weight=1, missing=None,
                                         n_estimators=500, n_jobs=1,
                                         nthread=None,
                                         objective='binary:logistic',
                                         random_state=0, reg_alpha=0,
                                         reg_lambda=1, scale_pos_weight=1,
                                         seed=None, silent=None, subsample=1,
                                         verbosity=1),
                order=None, require_dense=[True, True])
classification report of Xgboost: 
   

  'precision', 'predicted', average, warn_for)


classification report of SVM: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00        12
           2       0.69      1.00      0.82        27

   micro avg       0.69      0.61      0.65        44
   macro avg       0.23      0.33      0.27        44
weighted avg       0.42      0.61      0.50        44
 samples avg       0.66      0.62      0.63        44

ClassifierChain(classifier=GradientBoostingClassifier(criterion='friedman_mse',
                                                      init=None,
                                                      learning_rate=0.1,
                                                      loss='deviance',
                                                      max_depth=3,
                                                      max_features=None,
                                                      max_leaf_nodes=None,
                              

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


classification report of GBDT: 
              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.67      0.33      0.44        12
           2       0.77      0.89      0.83        27

   micro avg       0.78      0.73      0.75        44
   macro avg       0.81      0.67      0.72        44
weighted avg       0.77      0.73      0.73        44
 samples avg       0.78      0.74      0.76        44

ClassifierChain(classifier=LogisticRegression(C=1.0, class_weight=None,
                                              dual=False, fit_intercept=True,
                                              intercept_scaling=1,
                                              l1_ratio=None, max_iter=100,
                                              multi_class='warn', n_jobs=None,
                                              penalty='l2', random_state=None,
                                              solver='warn', tol=0.0001,
          



classification report of LogReg: 
              precision    recall  f1-score   support

           0       0.56      1.00      0.71         5
           1       0.50      0.42      0.45        12
           2       0.77      0.74      0.75        27

   micro avg       0.67      0.68      0.67        44
   macro avg       0.61      0.72      0.64        44
weighted avg       0.67      0.68      0.67        44
 samples avg       0.70      0.70      0.68        44



  'precision', 'predicted', average, warn_for)


In [104]:
def params_seach(data, target, model,params):
    model_tosearch = ClassifierChain(model)
    model_tunning = GridSearchCV(model_tosearch, cv=5, param_grid=params,
                                 scoring='f1_weighted',verbose=5, n_jobs=-1)
    model_tunning.fit(data,target)
    print(model_tunning.best_score_)
    print(model_tunning.best_params_)
    print(model_tunning.best_estimator_)

In [86]:
params = {'classifier__max_depth':range(3,10,2),
              'classifier__subsample':[0.6,0.7,0.8,0.9,1],
              'classifier__colsample_bytree':[0.6,0.7,0.8,0.9,1]}
model = XGBClassifier(learning_rate =0.1, n_estimators=150)
params_seach(X_train,y_train,model,params)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 24.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 28.7min finished


0.6883003959738369
{'classifier__colsample_bytree': 0.8, 'classifier__max_depth': 7, 'classifier__subsample': 0.8}
ClassifierChain(classifier=XGBClassifier(base_score=0.5, booster='gbtree',
                                         colsample_bylevel=1,
                                         colsample_bynode=1,
                                         colsample_bytree=0.8, gamma=0,
                                         learning_rate=0.1, max_delta_step=0,
                                         max_depth=7, min_child_weight=1,
                                         missing=None, n_estimators=150,
                                         n_jobs=1, nthread=None,
                                         objective='binary:logistic',
                                         random_state=0, reg_alpha=0,
                                         reg_lambda=1, scale_pos_weight=1,
                                         seed=None, silent=None, subsample=0.8,
                              

In [92]:
params = {'classifier__learning_rate)':[0.001,0.01,0.1],
              'classifier__n_estimators':[150]}
model = XGBClassifier(max_depth=7, subsample=0.8, colsample_bytree=0.8)
params_seach(X_train,y_train,model,params)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed:   49.1s remaining:   12.2s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.0min finished


   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0      15.614905      0.356668         0.037699        0.007035   
1      14.792997      0.080744         0.037700        0.004610   
2      13.759011      0.817348         0.039096        0.008472   

  param_classifier__learning_rate) param_classifier__n_estimators  \
0                            0.001                            150   
1                             0.01                            150   
2                              0.1                            150   

                                              params  split0_test_score  \
0  {'classifier__learning_rate)': 0.001, 'classif...           0.738273   
1  {'classifier__learning_rate)': 0.01, 'classifi...           0.738273   
2  {'classifier__learning_rate)': 0.1, 'classifie...           0.738273   

   split1_test_score  split2_test_score  split3_test_score  split4_test_score  \
0           0.730431           0.525307           0.766314           0.6

In [102]:
xgb = XGBClassifier(max_depth=7, subsample=0.8, colsample_bytree=0.8, learning_rate=0.001, n_estimators=150)
cc = ClassifierChain(xgb)
cc.fit(X_train,y_train)
preds = cc.predict(X_test)

In [103]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.67      0.17      0.27        12
           2       0.72      0.96      0.83        27

   micro avg       0.73      0.68      0.71        44
   macro avg       0.80      0.51      0.55        44
weighted avg       0.74      0.68      0.64        44
 samples avg       0.73      0.70      0.71        44



In [106]:
params = {'classifier__max_depth':range(3,10,2),
          'classifier__n_estimators':[100,200,300,400],
          'classifier__max_features':['auto','log2','sqrt']
         }
model = RandomForestClassifier(n_jobs=-1)
params_seach(X_train,y_train,model, params)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.8min finished


0.580700623874039
{'classifier__max_depth': 9, 'classifier__max_features': 'auto', 'classifier__n_estimators': 100}
ClassifierChain(classifier=RandomForestClassifier(bootstrap=True,
                                                  class_weight=None,
                                                  criterion='gini', max_depth=9,
                                                  max_features='auto',
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100, n_jobs=-1,
                                                  oob_score=False,
       