利用朴素贝叶斯模型进行语种的判断

#### 流式处理

##### 读入数据

In [2]:
data = [(line.strip()[:-3], line.strip()[-2:]) for line in open('./data/language.csv').readlines()]

In [3]:
data[:5]

[('1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme',
  'nl'),
 ('1 millón de afectados ante las inundaciones en sri lanka unicef está distribuyendo ayuda de emergencia srilanka',
  'es'),
 ('1 millón de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paracaídas qué harías tú porunmillondefans',
  'es'),
 ('1 satellite galileo sottoposto ai test presso lesaestec nl galileo navigation space in inglese',
  'it'),
 ('10 der welt sind bei', 'de')]

##### 去处数据噪音

In [5]:
import re

def remove_noise(sent):
    pattern = re.compile("|".join(["http\S+", "\S*www\S*", "\@\w+", "\#\w+"]))
    cleaned_sent = re.sub(pattern, "", sent)
    return cleaned_sent

remove_noise("Trump images www are www.gg now more popular than cat gifs. @trump #trends http://www.trumptrends.html")

'Trump images  are  now more popular than cat gifs.   '

##### 划分训练集

In [11]:
from sklearn.model_selection import train_test_split
x, y = zip(*data)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [12]:
len(x_train)

6799

##### 在降噪数据上提取n-gram特征，这里采用1-gram和2-gram的统计特征

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer(lowercase=True,    # 取小写
                         analyzer="char_wb", # 对于n-gram策略，对word范围内的char级别进行分词
                         ngram_range=(1,2),
                         max_features=1000,
                         preprocessor=remove_noise    # 消除噪声预处理
                        )

vector.fit(x_train)

CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2),
        preprocessor=<function remove_noise at 0x10515b488>,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [14]:
vector.vocabulary_

{' ': 0,
 'i': 393,
 'o': 571,
 ' i': 19,
 'io': 409,
 'o ': 572,
 'n': 537,
 ' n': 24,
 'no': 554,
 'on': 589,
 'n ': 538,
 'h': 362,
 ' h': 18,
 'ho': 376,
 'p': 604,
 'a': 151,
 'd': 240,
 'ip': 410,
 'pa': 607,
 'ad': 157,
 'd ': 241,
 'm': 503,
 ' m': 23,
 'ma': 506,
 'a ': 152,
 'mi': 514,
 'i ': 394,
 's': 683,
 ' s': 29,
 'sa': 687,
 'c': 214,
 'e': 274,
 ' c': 13,
 'ch': 222,
 'he': 368,
 'e ': 275,
 'è': 902,
 ' è': 38,
 'è ': 903,
 'f': 305,
 't': 718,
 ' f': 16,
 'fa': 308,
 'at': 173,
 'tt': 738,
 'ta': 720,
 'u': 755,
 ' u': 31,
 'un': 770,
 ' p': 26,
 'po': 620,
 'g': 333,
 'l': 467,
 'me': 510,
 'eg': 283,
 'gl': 347,
 'li': 478,
 'r': 645,
 'sf': 692,
 'fr': 323,
 'ru': 668,
 'ut': 776,
 ' l': 22,
 'la': 470,
 'su': 707,
 'up': 772,
 'pe': 611,
 'er': 294,
 'rf': 653,
 'fi': 316,
 'ic': 397,
 'ci': 223,
 'ie': 399,
 'w': 810,
 ' w': 33,
 'wp': 825,
 'p ': 605,
 ' d': 14,
 'de': 247,
 'eu': 297,
 'ts': 737,
 'sc': 689,
 'v': 788,
 ' v': 32,
 've': 794,
 'rs': 666,
 'si'

##### 训练模型，并测试结果

In [15]:
# 采用多项式的朴素贝叶斯方法
from sklearn.naive_bayes import MultinomialNB
classifer = MultinomialNB()
classifer.fit(vector.transform(x_train), y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
classifer.score(vector.transform(x_test), y_test)

0.9770621967357741

#### 封装为类

In [18]:
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

class LanguageDetector(object):
    def __init__(self, classifer=MultinomialNB()):
        self.classifer = classifer
        self.vector = CountVectorizer(lowercase=True,analyzer="char_wb",ngram_range=(1,2),max_features=1000,preprocessor=self._remove_noise)
        
    def _remove_noise(self, sent):
        pattern = re.compile("|".join(["http\S+", "\S*www\S*", "\@\w+", "\#\w+"]))
        cleaned_sent = re.sub(pattern, "", sent)
        return cleaned_sent
    
    def features(self, X):
        return self.vector.transform(X)
    
    def fit(self, X, y):
        self.vector.fit(X)
        self.classifer.fit(self.features(X), y)
        
    def predict(self, x):
        return self.classifer.predict(self.features([x]))
    
    def score(self, x, y):
        return self.classifer.score(self.features(x), y)
        

In [19]:
dataset = [(line.strip()[:-3], line.strip()[-2:]) for line in open('./data/language.csv').readlines()]
x, y = zip(*dataset)
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

language_detector = LanguageDetector()
language_detector.fit(x_train, y_train)
print(language_detector.predict('This is an English sentence'))
print(language_detector.score(x_test, y_test))

['en']
0.9770621967357741
