In [3]:
import jieba
import pandas as pd
df_technology = pd.read_csv("./data/technology_news.csv", encoding='utf-8')
df_technology = df_technology.dropna()

df_car = pd.read_csv("./data/car_news.csv", encoding='utf-8')
df_car = df_car.dropna()

df_entertainment = pd.read_csv("./data/entertainment_news.csv", encoding='utf-8')
df_entertainment = df_entertainment.dropna()

df_military = pd.read_csv("./data/military_news.csv", encoding='utf-8')
df_military = df_military.dropna()

df_sports = pd.read_csv("./data/sports_news.csv", encoding='utf-8')
df_sports = df_sports.dropna()

technology = df_technology.content.values.tolist()[1000:21000]
car = df_car.content.values.tolist()[1000:21000]
entertainment = df_entertainment.content.values.tolist()[:20000]
military = df_military.content.values.tolist()[:20000]
sports = df_sports.content.values.tolist()[:20000]

In [5]:
stopwords = pd.read_csv("data/stopwords.txt", index_col=False, quoting=3, sep="\t", names=["stopword"], encoding="utf-8")
stopwords = stopwords.stopword.values

In [6]:
def preprocess_text(content_lines, sentences, category):
    for line in content_lines:
        try:
            segs = jieba.lcut(line)
            segs = filter(lambda x: len(x) > 1, segs)
            segs = filter(lambda x: x not in stopwords, segs)
            sentences.append((" ".join(segs), category))
        except Exception as e:
            print(line)
            continue

In [7]:
sentences = []

preprocess_text(technology, sentences, "technology")
preprocess_text(car, sentences, "car")
preprocess_text(entertainment, sentences, "entertainment")
preprocess_text(military, sentences, "military")
preprocess_text(sports, sentences, "sports")

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\wyn\AppData\Local\Temp\jieba.cache
Loading model cost 0.714 seconds.
Prefix dict has been built succesfully.


In [9]:
import random
random.shuffle(sentences)

In [10]:
for sentence in sentences[:5]:
    print(sentence[0], sentence[1])

新兵 连队 换装 每次 装备 试训 罗华平 挑大梁 装备 手里 如孙 猴子 金箍棒 关二爷 偃月 他闭 眼睛 触摸 每一处 细微 脾性 气息 那天 孤独 避雷针 内心 隐秘 孤独 无助 正像 蒲公英 摇曳 military
中新网 长春 日电 世界 职业 滑雪 巡回赛 经典 滑雪赛 首次 走出 欧洲 昨天 来到 长春 净月潭 中国 本届 经典 滑雪赛 长春 第十个 举办 经典 滑雪赛 城市 世界 余个 国家 地区 千余 选手 蜿蜒 起伏 赛道 展开 一场 精彩 林中 竞速 sports
人民网 北京 日电 邱越 近日 香港 南华早报 网站 报道 中国 建造 新一代 大型 两栖 攻击 海外 投放 中国军力 发挥 更具 主导性 作用 军事 专家 卫东 接受 央视 采访 该型 两栖 攻击 岛礁 作战 时其 作用 船坞 登陆舰 更大 航母 编队 配合 远海 作战 发挥 作用 military
据介绍 编程 小时 公开课 面向全国 中小学 开放 陆续 进驻 全国 开放性 科学实践 活动 开放 课堂 四点 课堂 全国 中小学 信息技术 课堂 预计 超过 一千万 少年儿童 机会 体验 编程 趣味 编程 technology
美女 野兽 风头 一时无两 影片 自然 显得 稍逊风骚 entertainment


In [12]:
from sklearn.model_selection import train_test_split
x, y = zip(*sentences)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1111)

In [13]:
len(x_train)

65696

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="word", max_features=4000)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=4000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [15]:
def get_features(x):
    vec.transform(x)

In [16]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
classifier.score(vec.transform(x_test), y_test)

0.8375268277090278

In [18]:
len(x_test)

21899

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(analyzer="word", ngram_range=(1, 4), max_features=20000)
vec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 4), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
def get_features(x):
    vec.transform(x)

In [21]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)
classifier.score(vec.transform(x_test), y_test)

0.876432713822549

In [22]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score
import numpy as np

In [23]:
def stratifiedkfold_cv(x, y, clf_class, shuffle=True, n_folds=5, **kwargs):
    stratified_fold = StratifiedKFold(y, n_folds=n_folds, shuffle=shuffle)
    y_pred = y[:]
    for train_index, test_index in stratified_fold:
        X_train, X_test = x[train_index], x[test_index]
        y_train = y[train_index]
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [25]:
NB = MultinomialNB
print(precision_score(y, stratifiedkfold_cv(vec.transform(x), np.array(y), NB), average="macro"))

0.8813169380194305


In [26]:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [30]:
class TextClassifier():
    def __init__(self, classifier=MultinomialNB()):
        self.classifier = classifier
        self.vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 4), max_features=20000)
        
    def features(self, X):
        return self.vectorizer.transform(X)
    
    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.features(X), y)
        
    def predict(self, x):
        return self.classifier.predict(self.features([x]))
    
    def score(self, X, y):
        return self.classifier.score(self.features(X), y)

In [31]:
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict("这 是 有史以来 最 大 的 一 次 军舰 演习"))
print(text_classifier.score(x_test, y_test))

['military']
0.876432713822549


In [None]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(vec.transform(x_train), y_train)
svm.score(vec.transform(x_test), y_test)

In [None]:
svm = SVC(kernel="linear")
svm.fit(vec.transform(x_train), y_train)
svm.score(vec.transform(x_test), y_test)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [3]:
class TextClassifier():
    def __init__(self, classifier=SVC(kernel="linear")):
        self.classifier = classifier
        self.vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 3), max_features=120000)
        
    def featurs(self, X):
        return self.vectorizer.transform(X)
    
    def fit(self, X, y):
        self.vectorizer.fit(X)
        self.classifier.fit(self.featurs(X), y)
        
    def predict(self, x):
        return self.classifier.predict(self.featurs([x]))
    
    def score(self, X, y):
        return self.classifier.score(self.featurs(X), y)

In [None]:
text_classifier = TextClassifier()
text_classifier.fit(x_train, y_train)
print(text_classifier.predict("这 是 有史以来 最 大 的 一 次 军舰 演习"))
print(text_classifier.score(x_test, y_test))