In [1]:
# -*- coding:utf-8 -*-
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
data_f = open('language_detector.csv')
lines = data_f.readlines()
data_f.close()

dataset = [(line.strip()[:-3],line.strip()[-2:]) for line in lines]

print(dataset[:3])

[('1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme', 'nl'), ('1 mill贸n de afectados ante las inundaciones en sri lanka unicef est谩 distribuyendo ayuda de emergencia srilanka', 'es'), ('1 mill贸n de fans en facebook antes del 14 de febrero y paty miki dani y berta se tiran en paraca铆das qu茅 har铆as t煤 porunmillondefans', 'es')]


In [3]:
x,y = zip(*dataset)  #x,y为list,x包含所有句子，y包含对应标签
print(x[0])
print(y[0])

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)

1 december wereld aids dag voorlichting in zuidafrika over bieten taboes en optimisme
nl


In [4]:
def remove_noise(document):
    noise_pattern = re.compile('|'.join(['http\S+','\@\w+','\#\w+']))
    clean_text = re.sub(noise_pattern,"",document)
    return clean_text.strip()
print(remove_noise("Trump images are now more popular than cat gifs. @trump #trends http://www.trumptrends.html"))


Trump images are now more popular than cat gifs.


In [5]:
vector = CountVectorizer(
    lowercase=True,
    analyzer='char_wb',  #tokenise by character ngrams
    ngram_range=(1,2),   # use ngrams of size 1 and 2
    max_features=1000,   # keep the most common 1000 ngrams
    preprocessor=remove_noise
)
vector.fit(x_train)

CountVectorizer(analyzer='char_wb', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 2),
        preprocessor=<function remove_noise at 0x00000000168B6E18>,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [6]:
vector.transform(x_train)

<6346x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 503113 stored elements in Compressed Sparse Row format>

In [7]:
len(y_train)

6346

In [8]:
classifier = MultinomialNB()
classifier.fit(vector.transform(x_train),y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
classifier.score(vector.transform(x_test),y_test)

0.97463235294117645

In [10]:
classifier.predict(vector.transform(['This is an English sentence']))

array(['en'],
      dtype='<U2')