In [41]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [42]:
def _remove_noise(document):
    """去除和语种信息无关的一些噪音"""
    noise_pattern = re.compile("|".join(["http\S+", "\@\w+", "\#\w+", '\d+']))
    clean_text = re.sub(noise_pattern, "", document)
    return clean_text

In [43]:
# Quick manual check of _remove_noise: the digits, @mention, #hashtag and URL
# in the sample should all disappear from the displayed result.
sample_tweet = "1234 Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html"
_remove_noise(sample_tweet)

' Trump images are now more popular than cat gifs.   '

In [44]:
class LanguageDetector:
    """Language classifier built on n-gram counts.

    Documents are cleaned with ``_remove_noise``, converted to unigram/bigram
    counts by a ``CountVectorizer`` limited to 1000 features, and then handed
    to the wrapped classifier.
    """

    def __init__(self, classifier=None):
        """Set up the vectorizer and the classifier.

        Args:
            classifier: Estimator providing ``fit``, ``predict`` and ``score``.
                When omitted, a new ``MultinomialNB`` is created for this
                detector.
        """
        # The default is built here rather than in the signature: a default
        # argument is evaluated once at definition time, so every detector
        # would otherwise share (and refit) the very same estimator object.
        self.classifier = MultinomialNB() if classifier is None else classifier
        self.vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000, preprocessor=_remove_noise)

    def features(self, X):
        """Return the count features of the documents ``X``.

        The vectorizer must already have been fitted (see ``fit``).
        """
        return self.vectorizer.transform(X)

    def fit(self, X, y):
        """Learn the vocabulary from ``X`` and train the classifier.

        Args:
            X: Iterable of training documents.
            y: Labels aligned with ``X``.
        """
        # fit_transform learns the vocabulary and vectorizes in one pass
        # instead of scanning the corpus twice (fit, then transform).
        self.classifier.fit(self.vectorizer.fit_transform(X), y)

    def predict(self, x):
        """Predict the label of a single document ``x``.

        Returns whatever the classifier's ``predict`` returns for a
        one-document batch.
        """
        return self.classifier.predict(self.features([x]))

    def score(self, X, y):
        """Return the classifier's ``score`` on documents ``X`` with labels ``y``."""
        return self.classifier.score(self.features(X), y)

In [45]:
# Read every line of the corpus. The context manager closes the file even if
# reading raises, which the manual open()/close() pair did not guarantee.
with open('data.csv', encoding='UTF-8') as in_f:
    lines = in_f.readlines()  # list with one string per line of the file

In [46]:
# Each stripped line is split by position: the last two characters are the
# label, the character before them is dropped, and the rest is the text.
stripped_lines = map(str.strip, lines)
dataset = [(record[:-3], record[-2:]) for record in stripped_lines]
x, y = zip(*dataset)
# Fixed random_state so the train/test partition is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [47]:
# Build a detector with its default classifier and train it on the training split.
language_detector = LanguageDetector()
language_detector.fit(X=x_train, y=y_train)

In [48]:
# Sanity-check one hand-written sentence, then report the score on the held-out split.
sample_prediction = language_detector.predict('This is an English sentence')
print(sample_prediction)
test_score = language_detector.score(x_test, y_test)
print(test_score)

['en']
0.9788266431407145


In [49]:
# Features extracted by the CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# The result is converted to a plain list so it displays as before.
fitted_vectorizer = language_detector.vectorizer
if hasattr(fitted_vectorizer, "get_feature_names_out"):
    feature_names = fitted_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = fitted_vectorizer.get_feature_names()
feature_names

['aan',
 'ab',
 'abbiamo',
 'aber',
 'able',
 'able to',
 'about',
 'account',
 'ad',
 'adam',
 'admits',
 'admitted',
 'af',
 'afp',
 'after',
 'against',
 'ahora',
 'ai',
 'airport',
 'al',
 'alaska',
 'algerie',
 'algo',
 'all',
 'alla',
 'alle',
 'alles',
 'als',
 'also',
 'altri',
 'am',
 'am besten',
 'americanchopper',
 'amsterdam',
 'an',
 'anche',
 'ancora',
 'and',
 'android',
 'ans',
 'any',
 'apoyo',
 'app',
 'apps',
 'appwednesday',
 'après',
 'aquí',
 'are',
 'as',
 'assange',
 'at',
 'at ep',
 'at least',
 'at pm',
 'at the',
 'au',
 'auch',
 'auf',
 'aujourdhui',
 'aus',
 'aussi',
 'auto',
 'aux',
 'avant',
 'avec',
 'avete',
 'ayudamos',
 'ayudar',
 'año',
 'back',
 'bbc',
 'be',
 'bedankt',
 'bedankt voor',
 'been',
 'bei',
 'bei uns',
 'beim',
 'believes',
 'ben',
 'bene',
 'berlin',
 'berlusconi',
 'best',
 'beste',
 'besten',
 'between',
 'bien',
 'bij',
 'bij het',
 'bin',
 'bitte',
 'blog',
 'bon',
 'bonne',
 'boss',
 'brand',
 'buenas',
 'buenas tardes',
 'bueno