# 数据读取并存入excel

writer = pd.ExcelWriter('news_chinese.xlsx')
    datas = NewsChinese.query.all()
    contents = []
    sources = []
    for data in datas:
        try:
            content = data.content
            source = data.source
            if content and source:
                contents.append(content)
                sources.append(source)
            else:
                continue
        except Exception as e:
            continue
    df1 = pd.DataFrame(data={'source': sources, 'content': contents})
    df1.to_excel(writer, 'Sheet1', engine='xlsxwriter')
    writer.save()
    return render_template('index.html')

In [139]:
import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets

In [53]:
news = pd.read_excel('news_chinese.xlsx')

In [38]:
def stopwords(sentence):
    with open('stopwords.txt', 'r') as f:
        stopwords = f.read()
    stopwords = [s for s in stopwords if s != '\n']
    return ' '.join([s for s in sentence if s not in stopwords])

In [47]:
contents = []
sources = []
for i, content in enumerate(news.content):
    content = jieba.lcut(content)
    content = stopwords(content)
    source = news.source[i]
    sources.append(source)
    contents.append(content)
    with open('content.txt', 'a') as f:
        f.write(content + '\n')
    with open('source.txt', 'a') as f:
        f.write(source + '\n')

In [236]:
with open('source.txt', 'r') as f:
    sources = [s.replace('\n', '') for s in f.readlines()]

In [238]:
sources = [1 if source == '新华社' else 0 for source in sources ] # 将新华社类别标签置为0
train_x, test_x, train_y, test_y = train_test_split(contents, sources, train_size=0.75, random_state=2)

In [242]:
# 预测得分
def train_model(model, MAX_FEATURES=None, NGRAM_RANGE=(1, 2), CLASS_WEIGHT=None):
    tf_vector = TfidfVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
    train_x_tf = tf_vector.fit_transform(train_x)
    test_x_tf = tf_vector.transform(test_x)
    model.fit(train_x_tf, train_y) # fit model
    y_hat = model.predict(test_x_tf) # predict test_y
    s = model.score(test_x_tf, test_y)  # acc
    f1 = f1_score(test_y, y_hat) # f1_score
    precision = precision_score(test_y, y_hat) # precision_score
    recall = recall_score(test_y, y_hat)   # 召回率
    accuracy = accuracy_score(test_y, y_hat)
    res_score = {'score': s, 'f1_score':f1, 'precision_score': precision, 'recall_score': recall, 'accuracy_score': accuracy}
    return res_score

In [243]:
# 逻辑回归
model = LogisticRegression()
res_score = train_model(model, MAX_FEATURES=8000)
for k, v in res_score.items():
    print('{}: {}'.format(k, v))

score: 0.9762440839957727
f1_score: 0.9869645243438139
precision_score: 0.9777200519532421
recall_score: 0.9963854808328667
accuracy_score: 0.9762440839957727


In [246]:
# 逻辑回归，给定不同的tfidf参数
model = LogisticRegression()
res_score = train_model(model, MAX_FEATURES=5000, NGRAM_RANGE=(1, 2))
for k, v in res_score.items():
    print('{}: {}'.format(k, v))

score: 0.9773009235859027
f1_score: 0.9875403551251009
precision_score: 0.9786053486628343
recall_score: 0.9966400244361859
accuracy_score: 0.9773009235859027


In [247]:
# 逻辑回归，给定不懂的惩罚项
for i in np.linspace(0.1, 2, 5):
    model = LogisticRegression(C=i)
    res_score = train_model(model, MAX_FEATURES=5000, NGRAM_RANGE=(1, 2))
    for k, v in res_score.items():
        print('{}: {}'.format(k, v))
    print('*'*100)

score: 0.9314892248311354
f1_score: 0.963356189633562
precision_score: 0.9312458424403687
recall_score: 0.9977600162907906
accuracy_score: 0.9314892248311354
****************************************************************************************************
score: 0.970408491476359
f1_score: 0.9838190954773869
precision_score: 0.9712754874237237
recall_score: 0.9966909331568498
accuracy_score: 0.970408491476359
****************************************************************************************************
score: 0.9778063686072692
f1_score: 0.9878138009335184
precision_score: 0.9791916766706683
recall_score: 0.9965891157155221
accuracy_score: 0.9778063686072692
****************************************************************************************************
score: 0.9800578964297202
f1_score: 0.989035420140468
precision_score: 0.9816941672099905
recall_score: 0.9964872982741944
accuracy_score: 0.9800578964297202
*****************************************************************

In [248]:
# 逻辑回归，给定一定的权重，
model = LogisticRegression(C=2, class_weight='balanced')
res_score = train_model(model, MAX_FEATURES=5000, NGRAM_RANGE=(1, 2))
for k, v in res_score.items():
    print('{}: {}'.format(k, v))

score: 0.9722464733722372
f1_score: 0.9844281736619572
precision_score: 0.9972316531731522
recall_score: 0.9719492949142188
accuracy_score: 0.9722464733722372


In [250]:
# 多项式朴素贝叶斯
model = MultinomialNB()
res_score = train_model(model, MAX_FEATURES=5000, NGRAM_RANGE=(1, 2))
for k, v in res_score.items():
    print('{}: {}'.format(k, v))

score: 0.8798878831043514
f1_score: 0.9299158131803313
precision_score: 0.9822713112432738
recall_score: 0.8828590337524818
accuracy_score: 0.8798878831043514


In [251]:
# 决策树分类
model = DecisionTreeClassifier(max_depth=50, max_features=3000)
res_score = train_model(model, MAX_FEATURES=5000, NGRAM_RANGE=(1, 2))
for k, v in res_score.items():
    print('{}: {}'.format(k, v))

score: 0.9879152690346
f1_score: 0.9932937246602237
precision_score: 0.9950444467150301
recall_score: 0.9915491523698009
accuracy_score: 0.9879152690346


In [218]:
help(MultinomialNB())

Help on MultinomialNB in module sklearn.naive_bayes object:

class MultinomialNB(BaseDiscreteNB)
 |  MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
 |  
 |  Naive Bayes classifier for multinomial models
 |  
 |  The multinomial Naive Bayes classifier is suitable for classification with
 |  discrete features (e.g., word counts for text classification). The
 |  multinomial distribution normally requires integer feature counts. However,
 |  in practice, fractional counts such as tf-idf may also work.
 |  
 |  Read more in the :ref:`User Guide <multinomial_naive_bayes>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : float, optional (default=1.0)
 |      Additive (Laplace/Lidstone) smoothing parameter
 |      (0 for no smoothing).
 |  
 |  fit_prior : boolean, optional (default=True)
 |      Whether to learn class prior probabilities or not.
 |      If false, a uniform prior will be used.
 |  
 |  class_prior : array-like, size (n_classes,), optional (default=None)
 |      Prio

In [140]:
help(DecisionTreeClassifier())

Help on DecisionTreeClassifier in module sklearn.tree.tree object:

class DecisionTreeClassifier(BaseDecisionTree, sklearn.base.ClassifierMixin)
 |  DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False)
 |  
 |  A decision tree classifier.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : string, optional (default="gini")
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "entropy" for the information gain.
 |  
 |  splitter : string, optional (default="best")
 |      The strategy used to choose the split at each node. Supported
 |      strategies are "best" to choose the best split and "random" to choose
 |      the best random split