In [1]:
import pandas as pd
import numpy as np
import re
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

def preprocess(contents):
    new_contents=[]
    for content in contents:
        content = ''.join(re.findall(r'[\d|\w]+', content))
        content = ' '.join(jieba.cut(content))
        new_contents.append(content)
    return new_contents

def convert(data):
    if data['source'] == '新华社':
        return 1
    else:
        return 0


fname = '../lesson05/sqlResult_1558435.csv'
database = pd.read_csv(fname, encoding='gb18030', usecols=['source', 'content'])
database = database.fillna('')

database['xinhua'] = database.apply(convert, axis=1)

contents = database['content'].tolist()
new_contents = preprocess(contents)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(new_contents)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\DINGLI~1\AppData\Local\Temp\jieba.cache
Loading model cost 1.020 seconds.
Prefix dict has been built succesfully.


In [2]:
X.shape

(89611, 268668)

In [3]:
y = database['xinhua']
y.shape

(89611,)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [5]:
from time import time
from sklearn import metrics


def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    precision = metrics.precision_score(y_test, pred)
    print("precision_score:   %0.3f" % precision)

    recall = metrics.recall_score(y_test, pred)
    print("recall_score:   %0.3f" % recall)
    
    f1 = metrics.f1_score(y_test, pred)
    print("f1_score:   %0.3f" % f1)


## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
benchmark(LogisticRegression())

________________________________________________________________________________
Training: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)




train time: 3.377s
test time:  0.013s
precision_score:   0.976
recall_score:   0.997
f1_score:   0.986


## SVM

In [8]:
from sklearn.svm import LinearSVC
benchmark(LinearSVC(C=1, loss="hinge"))

________________________________________________________________________________
Training: 
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)
train time: 11.628s
test time:  0.016s
precision_score:   0.991
recall_score:   0.995
f1_score:   0.993




## Bayers

In [10]:
from sklearn.naive_bayes import MultinomialNB
benchmark(MultinomialNB())

________________________________________________________________________________
Training: 
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
train time: 0.117s
test time:  0.046s
precision_score:   0.885
recall_score:   1.000
f1_score:   0.939


## Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier
benchmark(DecisionTreeClassifier(max_depth=3))

________________________________________________________________________________
Training: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
train time: 17.735s
test time:  0.172s
precision_score:   0.995
recall_score:   0.994
f1_score:   0.994


## Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
benchmark(RandomForestClassifier())

________________________________________________________________________________
Training: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)




train time: 35.289s
test time:  0.448s
precision_score:   0.953
recall_score:   0.994
f1_score:   0.973
