# 读入数据

In [2]:
from instrument import read_bunch
try:
    # Public import path for Bunch in current scikit-learn releases.
    from sklearn.utils import Bunch
except ImportError:
    # Fallback for old releases that only expose the original private path.
    from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the training and validation bunches from local files using the
# project's read_bunch helper.
train_bunch_path = './data_bunch/train_bunch.dat'
validate_bunch_path = './data_bunch/validate_bunch.dat'
train_bunch = read_bunch(train_bunch_path)
validate_bunch = read_bunch(validate_bunch_path)

# 创建词向量空间（训练集和验证集）

In [4]:
stop_words_list = None
max_df = 0.8

# Build the TF-IDF vector space from the training data. The vocabulary and the
# IDF weights are learned here and only here.
tfidf_train = Bunch(Id=train_bunch.news_id, Label=train_bunch.news_pic_label, tdm=[], vocabulary={})
train_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True, max_df=max_df)
tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_jieba)  # jieba word-segmentation output
tfidf_train.vocabulary = train_vectorizer.vocabulary_

# Project the validation data into the SAME vector space as the training data.
tfidf_validate = Bunch(Id=validate_bunch.news_id, tdm=[], vocabulary={})
tfidf_validate.vocabulary = tfidf_train.vocabulary
# Reuse the fitted training vectorizer and call transform() only. Fitting a
# second vectorizer on the validation text would recompute the IDF weights from
# validation documents, so the same term would get a different weight than the
# one the classifiers were trained on.
validate_vectorizer = train_vectorizer
tfidf_validate.tdm = validate_vectorizer.transform(validate_bunch.news_words_jieba)  # jieba word-segmentation output

# 将训练数据划分为训练集和测试集

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled TF-IDF matrix as a local test split; the fixed
# random_state makes the split repeatable. The four names bound here are used
# by every model-evaluation cell below.
x_train, x_test, y_train, y_test = train_test_split(tfidf_train.tdm,
                                                    tfidf_train.Label,
                                                    test_size=0.3,
                                                    random_state=33)

# 构建模型

In [7]:
from sklearn.metrics import classification_report

## 1、Naive Bayes

### 1、模型测试

In [None]:
### Multinomial Naive Bayes Classifier
def classifier_naive_bayes(x_data, y_labels):
    """Fit a multinomial Naive Bayes classifier and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``MultinomialNB`` instance, built with ``alpha=0.1453``.
    """
    from sklearn.naive_bayes import MultinomialNB

    nb_clf = MultinomialNB(alpha=0.1453)
    nb_clf.fit(x_data, y_labels)
    return nb_clf
# Fit on the training split, then report accuracy and per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_naive_bayes = classifier_naive_bayes(x_train, y_train)
print('The accuracy of classifying training data with Naive Bayes is :',
      model_naive_bayes.score(x_test, y_test))
print(classification_report(y_test,model_naive_bayes.predict(x_test)))

### 2、验证集预测

In [9]:
# Refit Naive Bayes on the full labelled training matrix (this rebinds
# model_naive_bayes), then predict labels for the validation matrix.
model_naive_bayes = classifier_naive_bayes(tfidf_train.tdm,tfidf_train.Label)
predict_naive_bayes = model_naive_bayes.predict(tfidf_validate.tdm)

## 2、KNN

### 1、模型测试

In [20]:
# KNN Classifier
def classifier_knn(x_data, y_labels):
    """Fit a k-nearest-neighbours classifier and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``KNeighborsClassifier`` (30 neighbours, ``leaf_size=40``,
        ``n_jobs=-1``).
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn_settings = dict(n_neighbors=30, leaf_size=40, n_jobs=-1)
    neighbors_clf = KNeighborsClassifier(**knn_settings)
    neighbors_clf.fit(x_data, y_labels)
    return neighbors_clf

# Fit KNN on the training split and report accuracy plus per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_knn = classifier_knn(x_train, y_train)
print('The accuracy of classifying training data with KNN is :',
      model_knn.score(x_test, y_test))
print(classification_report(y_test, model_knn.predict(x_test)))

The accuracy of classifying training data with KNN is : 0.6628162816281629
             precision    recall  f1-score   support

          0       0.65      0.84      0.73      7192
          1       0.44      0.07      0.13      2357
          2       0.71      0.68      0.69      4995

avg / total       0.64      0.66      0.62     14544



### 2、验证集预测

In [None]:
# Refit KNN on the full labelled training matrix (this rebinds model_knn),
# then predict labels for the validation matrix.
model_knn = classifier_knn(tfidf_train.tdm,tfidf_train.Label)
predict_knn = model_knn.predict(tfidf_validate.tdm)

In [72]:
# 7. Build the submission rows from the KNN predictions; the next cell writes
# them to result_knn.txt.
# Placeholder value used for the two trailing columns of every row.
bayes_text = ['NULL' for _ in range(len(validate_bunch.news_id))]

# Use the KNN predictions here. The original cell read predict_lr, which put
# another model's output (or a NameError on a fresh kernel) into the KNN file.
label_predict = predict_knn
# Row layout: news id, predicted label, then the placeholder twice, tab-separated.
bayes_result = [
    validate_bunch.news_id[i] + '\t' + label_predict[i] + '\t' + bayes_text[i] + '\t' + bayes_text[i]
    for i in range(len(validate_bunch.news_id))
]

In [73]:
from instrument import save_text

# Write the rows currently held in bayes_result to the KNN submission file.
# save_text is a project helper; its exact output format is not visible here.
save_path = './submittion/result_knn.txt'
save_text(save_path, bayes_result)

## 3、Logistic Regression

In [25]:
# Logistic Regression Classifier (takes a long time to train)
def classifier_logistic_regression(x_data, y_labels):
    """Fit a cross-validated logistic regression classifier and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``LogisticRegressionCV`` instance.
    """
    from sklearn.linear_model import LogisticRegressionCV

    # Solver names noted by the original author: lbfgs, newton-cg, sag
    lr_settings = {
        'Cs': 5,
        'max_iter': 500,
        'multi_class': 'multinomial',
        'class_weight': 'balanced',
        'n_jobs': -1,
    }
    lr_cv = LogisticRegressionCV(**lr_settings)
    lr_cv.fit(x_data, y_labels)
    return lr_cv

# Fit logistic regression on the training split and report accuracy plus
# per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_lr = classifier_logistic_regression(x_train, y_train)
print('The accuracy of classifying training data with Logistic Regression is :',
      model_lr.score(x_test, y_test))
print(classification_report(y_test, model_lr.predict(x_test)))

The accuracy of classifying training data with Logistic Regression is : 0.6747799779977998
             precision    recall  f1-score   support

          0       0.74      0.69      0.72      7192
          1       0.40      0.47      0.43      2357
          2       0.73      0.75      0.74      4995

avg / total       0.68      0.67      0.68     14544



### 验证集预测

In [65]:
# Refit logistic regression on the full labelled training matrix and predict
# the validation set. The model must come from classifier_logistic_regression:
# the original cell called classifier_knn, so the rows later saved as
# result_lr.txt were really KNN predictions.
model_lr = classifier_logistic_regression(tfidf_train.tdm, tfidf_train.Label)
predict_lr = model_lr.predict(tfidf_validate.tdm)

In [66]:
# 7. Build the submission rows from the logistic-regression predictions so
# they can be stored locally and used for submission.
# Placeholder value used for the two trailing columns of every row.
bayes_text = ['NULL' for _ in range(len(validate_bunch.news_id))]

label_predict = predict_lr
# Row layout: news id, predicted label, then the placeholder twice, tab-separated.
bayes_result = [
    validate_bunch.news_id[i] + '\t' + label_predict[i] + '\t' + bayes_text[i] + '\t' + bayes_text[i]
    for i in range(len(validate_bunch.news_id))
]

In [68]:
from instrument import save_text

# Write the rows currently held in bayes_result to the logistic-regression
# submission file. save_text is a project helper; its exact output format is
# not visible here.
save_path = './submittion/result_lr.txt'
save_text(save_path, bayes_result)

## 4、Random Forest

In [64]:
# Random Forest Classifier
def classifier_random_forest(x_data, y_labels):
    """Fit a random forest classifier and return it.

    No ``random_state`` is passed, so the estimator's default is used.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    from sklearn.ensemble import RandomForestClassifier

    forest_settings = dict(n_estimators=60, max_depth=30, max_features=0.8, n_jobs=-1)
    forest_clf = RandomForestClassifier(**forest_settings)
    forest_clf.fit(x_data, y_labels)
    return forest_clf

# Fit the random forest on the training split and report accuracy plus
# per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_random_forest = classifier_random_forest(x_train, y_train)
print('The accuracy of classifying training data with Random Forest is :',
      model_random_forest.score(x_test, y_test))
print(classification_report(y_test,model_random_forest.predict(x_test)))

The accuracy of classifying training data with Random Forest is : 0.661991199119912
             precision    recall  f1-score   support

          0       0.67      0.82      0.74      7192
          1       0.53      0.16      0.25      2357
          2       0.67      0.68      0.67      4995

avg / total       0.65      0.66      0.63     14544



## 5、Decision Tree

In [71]:
# Decision Tree Classifier
def classifier_decision_tree(x_data, y_labels):
    """Fit a decision tree classifier with default settings and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``DecisionTreeClassifier`` instance.
    """
    from sklearn.tree import DecisionTreeClassifier

    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(x_data, y_labels)
    return tree_clf

# Fit the decision tree on the training split and report accuracy plus
# per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_decision_tree = classifier_decision_tree(x_train, y_train)
print('The accuracy of classifying training data with Decision Tree is :',
      model_decision_tree.score(x_test, y_test))
print(classification_report(y_test, model_decision_tree.predict(x_test)))

The accuracy of classifying training data with Decision Tree is : 0.5731573157315731
             precision    recall  f1-score   support

          0       0.65      0.67      0.66      7192
          1       0.28      0.24      0.26      2357
          2       0.58      0.59      0.58      4995

avg / total       0.56      0.57      0.57     14544



## 6、GBDT

In [None]:
# GBDT (Gradient Boosting Decision Tree) Classifier
def classifier_gradient_boosting(x_data, y_labels):
    """Fit a gradient boosting classifier with 200 estimators and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(x_data, y_labels)
    return model


# Backward-compatible alias: existing cells call the helper by its original,
# misspelled name, so keep that name bound to the same function.
calssifier_gradient_boosting = classifier_gradient_boosting

# Fit GBDT on the training split and report accuracy plus per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_gbdt = calssifier_gradient_boosting(x_train, y_train)
print('The accuracy of classifying training data with GBDT is :',
      model_gbdt.score(x_test, y_test))
print(classification_report(y_test,model_gbdt.predict(x_test)))

In [None]:
# Refit GBDT on the full labelled training matrix (this rebinds model_gbdt),
# then predict labels for the validation matrix.
model_gbdt = calssifier_gradient_boosting(tfidf_train.tdm,tfidf_train.Label)
predict_gbdt = model_gbdt.predict(tfidf_validate.tdm)

In [None]:
# 7. Build the submission rows from the GBDT predictions so they can be stored
# locally and used for submission.
# Placeholder value used for the two trailing columns of every row.
bayes_text = ['NULL' for _ in range(len(validate_bunch.news_id))]

# The GBDT predictions are bound to predict_gbdt; the original cell read the
# undefined name predict_gdbt and raised NameError.
label_predict = predict_gbdt
# Row layout: news id, predicted label, then the placeholder twice, tab-separated.
bayes_result = [
    validate_bunch.news_id[i] + '\t' + label_predict[i] + '\t' + bayes_text[i] + '\t' + bayes_text[i]
    for i in range(len(validate_bunch.news_id))
]

In [None]:
from instrument import save_text

# Write the rows currently held in bayes_result to the GBDT submission file.
# save_text is a project helper; its exact output format is not visible here.
save_path = './submittion/result_gbdt.txt'
save_text(save_path, bayes_result)

## 7、Adaboost

In [74]:
# AdaBoost
def classifier_adaboost(x_data,y_labels):
    """Fit an AdaBoost classifier with default settings and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``AdaBoostClassifier`` instance.
    """
    from sklearn.ensemble import AdaBoostClassifier

    boosted_clf = AdaBoostClassifier()
    boosted_clf.fit(x_data, y_labels)
    return boosted_clf

# Fit AdaBoost on the training split and report accuracy plus per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_adaboost = classifier_adaboost(x_train,y_train)
print('The accuracy of classifying training data with AdaBoost is :',
      model_adaboost.score(x_test, y_test))
print(classification_report(y_test,model_adaboost.predict(x_test)))

The accuracy of classifying training data with AdaBoost is : 0.6330445544554455
             precision    recall  f1-score   support

          0       0.63      0.82      0.71      7192
          1       0.47      0.19      0.27      2357
          2       0.67      0.57      0.62      4995

avg / total       0.62      0.63      0.61     14544



In [78]:
# Refit AdaBoost on the full labelled training matrix (this rebinds
# model_adaboost), then predict labels for the validation matrix.
model_adaboost = classifier_adaboost(tfidf_train.tdm,tfidf_train.Label)
predict_adaboost = model_adaboost.predict(tfidf_validate.tdm)

In [79]:
# 7. Build the submission rows from the AdaBoost predictions so they can be
# stored locally and used for submission.
# Placeholder value used for the two trailing columns of every row.
bayes_text = ['NULL' for _ in range(len(validate_bunch.news_id))]

label_predict = predict_adaboost
# Row layout: news id, predicted label, then the placeholder twice, tab-separated.
bayes_result = [
    validate_bunch.news_id[i] + '\t' + label_predict[i] + '\t' + bayes_text[i] + '\t' + bayes_text[i]
    for i in range(len(validate_bunch.news_id))
]

In [80]:
from instrument import save_text

# Write the rows currently held in bayes_result to the AdaBoost submission
# file. save_text is a project helper; its exact output format is not visible
# here.
save_path = './submittion/result_adaboost.txt'
save_text(save_path, bayes_result)

## 8、SVM

In [75]:
# SVM (Support Vector Machine) Classifier
def classifier_svm(x_data, y_labels):
    """Fit an RBF-kernel support vector classifier and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``SVC`` instance, created with ``probability=True``.
    """
    from sklearn.svm import SVC

    svc_settings = {'kernel': 'rbf', 'probability': True}
    svm_clf = SVC(**svc_settings)
    svm_clf.fit(x_data, y_labels)
    return svm_clf

# Fit the SVM on the training split and report accuracy plus per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split. The recorded report for this cell shows
# zero precision and recall for classes 1 and 2.
model_svm = classifier_svm(x_train, y_train)
print('The accuracy of classifying training data with SVM classifier',
      model_svm.score(x_test, y_test))
print(classification_report(y_test, model_svm.predict(x_test)))

The accuracy of classifying training data with SVM classifier 0.4944994499449945
             precision    recall  f1-score   support

          0       0.49      1.00      0.66      7192
          1       0.00      0.00      0.00      2357
          2       0.00      0.00      0.00      4995

avg / total       0.24      0.49      0.33     14544



  'precision', 'predicted', average, warn_for)


## 9、xgboost

In [76]:
# XGBoost (eXtreme Gradient Boosting) Classifier
def classifier_xgboost(x_data, y_labels):
    """Fit an XGBoost classifier for the three-class problem and return it.

    Args:
        x_data: Feature matrix handed to ``fit``.
        y_labels: Target labels handed to ``fit``.

    Returns:
        The fitted ``XGBClassifier`` instance.
    """
    from xgboost import XGBClassifier

    booster_settings = {
        'max_depth': 8,
        'learning_rate': 0.5,
        'min_child_weight': 1,
        'scale_pos_weight': 1,
        'n_estimators': 1000,
        'reg_lambda': 4,
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'merror',
    }
    xgb_clf = XGBClassifier(**booster_settings)
    xgb_clf.fit(x_data, y_labels)
    return xgb_clf

# Fit XGBoost on the training split and report accuracy plus per-class metrics.
# NOTE: the printed label says "training data", but the score is computed on
# the held-out x_test / y_test split.
model_xgboost = classifier_xgboost(x_train, y_train)
print('The accuracy of classifying training data with XGBoost Classifier',
      model_xgboost.score(x_test, y_test))
print(classification_report(y_test, model_xgboost.predict(x_test)))

  if diff:


The accuracy of classifying training data with XGBoost Classifier 0.6817931793179318
             precision    recall  f1-score   support

          0       0.70      0.79      0.74      7192
          1       0.46      0.25      0.32      2357
          2       0.71      0.73      0.72      4995

avg / total       0.66      0.68      0.67     14544



  if diff:


In [None]:
# Refit XGBoost on the full labelled training matrix (this rebinds
# model_xgboost), then predict labels for the validation matrix.
# The helper defined in this notebook is classifier_xgboost; the original
# cell called the undefined name calssifier_xgboost and raised NameError.
model_xgboost = classifier_xgboost(tfidf_train.tdm, tfidf_train.Label)
predict_xgboost = model_xgboost.predict(tfidf_validate.tdm)