In [2]:
import nltk
import csv

Посмотрим на данные: это отзывы о ресторанах и оценка. Будем решать многоклассовую классификацию

In [3]:
! head -n 2 train.data

Id	Sentiment	Text
0	1	Incredibly disappointing service. I mean really, really bad.\n\nWe placed an order for delivery at 6:30 pm on a Tuesday night, not the busiest night of the week, I'm sure. We were given an estimate of 30-40 minutes. After an hour my husband called to make sure our order wasn't forgotten. The young girl on the phone said that they were very busy and the driver was on his way to our house (less than a mile from the restaurant) at that time and should arrive in 10 minutes. After another 30 minutes we called back and asked to please cancel the order, after 1 1/2 hours we no longer wanted the food. The girl on the phone shouted at my husband that none of this was her fault and was reluctant to cancel our order. She wanted to charge us for food we never received!\n\nThe food is just not good enough for such poor service. If 18 year old college students can't answers phones and take simple orders don't hire them. It's simple.


In [4]:
! wc -l train.data

  102583 train.data


Считаем выборку, поделим на трейн и тест так, чтобы в x_train был raw text

In [8]:
train_file = csv.reader(open('train.data'), delimiter='\t')
next(train_file)
train_set = [x for x in train_file]

train_data, train_label = [line[2] for line in train_set], [line[1] for line in train_set]
from sklearn.cross_validation import train_test_split

x_train, x_validate, y_train, y_validate = train_test_split(train_data, train_label, test_size=0.2, random_state=0)

In [6]:
x_train[0]

'I like this location because they have a drive-thru. Even though there is almost always a long line, they get you on your way fast. The staff is friendly and competent. Also, they rarely run out of anything (other locations seem to go through their entire inventory of breakfast sandwiches and scones by 9am).\\n\\nIf you are the type that does not drink your morning coffee inside a moving vehicle, they also have comfy chairs inside and decent patio seating.  The patio faces the parking lot and drive-thru but it does have shade umbrellas so it can be very pleasant in the morning.'

In [7]:
y_train[0]

'3'

"Тупое" решение:

Посмотрим что будет, если применить самое простое решение: найти 100 самых частотных слов и использовать их в качестве признаков.

In [9]:
from collections import Counter

def create_bow_with_freq(data):
    result = Counter()
    for s in data:
        result.update(s.strip().split())
    return list(result.items())

In [10]:
train_bow = create_bow_with_freq(x_train)
print('Number of unique "words": ', len(train_bow))

('Number of unique "words": ', 484082)


In [11]:
most_frequent_word = sorted(train_bow, key=lambda x: x[1], reverse=True)[:100]
most_frequent_word[:10]

[('the', 654951),
 ('and', 492240),
 ('a', 411657),
 ('I', 385802),
 ('to', 359227),
 ('of', 248340),
 ('was', 240102),
 ('is', 184703),
 ('for', 167194),
 ('in', 162966)]

In [12]:
def make_bow_sample(bow, sample):
    for s in sample:
        s = s.strip().split()
        yield { word:word in s for word, _ in bow}

In [13]:
bow_train = [(x, y) for x, y in zip(make_bow_sample(most_frequent_word, x_train), y_train)]
bow_validate = [x for x in make_bow_sample(most_frequent_word, x_validate)]

In [15]:
bow_train[0]

({'-': False,
  'I': True,
  "I'm": False,
  "I've": False,
  'It': False,
  'My': False,
  'The': True,
  'They': False,
  'This': False,
  'We': False,
  'a': True,
  'about': False,
  'all': False,
  'also': True,
  'always': True,
  'an': False,
  'and': True,
  'are': True,
  'as': False,
  'at': False,
  'back': False,
  'be': True,
  'because': True,
  'been': False,
  'but': True,
  'by': True,
  'can': True,
  'could': False,
  "didn't": False,
  'do': False,
  "don't": False,
  'even': False,
  'food': False,
  'for': False,
  'from': False,
  'get': True,
  'go': True,
  'good': False,
  'got': False,
  'great': False,
  'had': False,
  'has': False,
  'have': True,
  'he': False,
  'here': False,
  'if': False,
  'in': True,
  'is': True,
  'it': True,
  "it's": False,
  'just': False,
  'like': True,
  'little': False,
  'love': False,
  'me': False,
  'more': False,
  'much': False,
  'my': False,
  'nice': False,
  'no': False,
  'not': True,
  'of': True,
  'on': True,


Воспользуемся наивным байесовским классификатором. 
Плюс данного классификатора - можно посмотреть какиме слова оказались наиболее полезными.

In [16]:
nb = nltk.NaiveBayesClassifier.train(bow_train)
print(nb.show_most_informative_features())
predicted = [nb.classify(o) for o in bow_validate]

Most Informative Features
                    love = True                5 : 1      =      3.5 : 1.0
                   great = True                5 : 1      =      3.1 : 1.0
                     was = False               5 : 1      =      2.7 : 1.0
                  always = True                5 : 1      =      2.6 : 1.0
                  pretty = True                3 : 1      =      2.5 : 1.0
                      no = True                1 : 5      =      2.4 : 1.0
                      he = True                1 : 4      =      2.4 : 1.0
                      to = False               4 : 1      =      2.3 : 1.0
                  didn't = True                2 : 5      =      2.3 : 1.0
                    nice = True                4 : 1      =      2.2 : 1.0
None


0

In [18]:
import numpy as np
np.array(map(float,predicted)).shape, np.array(y_validate).shape

((20509,), (20509,))

In [25]:
print 'accuracy', np.mean(np.array(map(float,predicted))== np.array(map(float, y_validate)))

accuracy 0.3787605441513482


In [26]:
import csv
import pandas as pd
from sklearn.cross_validation import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
import numpy as np
import glob

In [27]:
y_train = map(float, y_train)
y_validate = map(float, y_validate)

In [28]:
tfidf = TfidfVectorizer(encoding=u'utf-8', ngram_range=(1, 2), analyzer='word')
Xtrain = tfidf.fit_transform(x_train)
Xtest = tfidf.transform(x_validate)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


- Попробуйте LodisticRegression, LinearSVC, SGDClassifier с какой-нибудь функцией потерь.
- При обучении  SGDClassifier не забудьте поставить побольше итераций, так как это итеративный метод
- параметр class_weight='balanced' может быть полезен. Что он означает?
- Можете повариьировать так же параметры TF-IDF vectorizer
- Попробовать прологарифмировать частоты, или другое нелинейное преобразование.

SVM vs LinearSVC
(LinearSVC быстрее, но не выадет вероятностей, а лишь расстояние до решающей границы. Перевести в вероятности можно, откалибровав)

Кроме того для достаижения качества полезно логарифмировать np.log1p()

In [None]:
lr = LogisticRegression(C=1, random_state=3,  n_jobs=-1)
lr.fit(Xtrain, y_train)
lr_pr = lr.predict(Xtest)

In [35]:
print 'accuracy', np.mean(np.array(map(float,lr_pr))== np.array(map(float, y_validate)))

accuracy 0.5904724754985616


In [None]:
from sklearn.svm import LinearSVC

In [None]:
# clf = [LogisticRegression(n_jobs=-1)
# LinearSVC(C=1, loss='hinge', class_weight='balanced'),
# SGDClassifier(loss='modified_huber', class_weight='balanced', alpha=1e-2, n_iter=50, n_jobs=-1),
# SGDClassifier(loss='squared_hinge', class_weight='balanced', alpha=1e-2, n_iter=50, n_jobs=-1),
# SGDClassifier(loss='hinge',class_weight='balanced', alpha=1e-2, random_state=3, n_iter=50, n_jobs=-1)]

Какой алгоритм сработал лучше свего?

Когда обучаем многоклассовую классификацию для такой задачи, не учитываем то, что метки 1 и 2 более похожи между собой, чем 4 и 5. Как это можно было бы учесть при обучении модели?

Переходим к нейросетевым подходам

https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
архитектуры отсюда

In [37]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras.utils import np_utils
import numpy as np

Using TensorFlow backend.


In [39]:
TEXT_LENGTH = 100
VOCABULARY_SIZE = 50000
EMBEDDING_DIM = 30
DIMS = 56
MAX_FEATURES = 5000
batch_size = 32

nb_filter = 50
filter_length = 3
hidden_dims = 50
nb_epoch = 3

In [40]:
x_train[0]

'I like this location because they have a drive-thru. Even though there is almost always a long line, they get you on your way fast. The staff is friendly and competent. Also, they rarely run out of anything (other locations seem to go through their entire inventory of breakfast sandwiches and scones by 9am).\\n\\nIf you are the type that does not drink your morning coffee inside a moving vehicle, they also have comfy chairs inside and decent patio seating.  The patio faces the parking lot and drive-thru but it does have shade umbrellas so it can be very pleasant in the morning.'

In [41]:
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(x_train)
tokenizer

<keras.preprocessing.text.Tokenizer at 0x113f64610>

In [42]:
sequences = tokenizer.texts_to_sequences(x_train)
X_train = tokenizer.sequences_to_matrix(sequences, mode='count')
sequences = tokenizer.texts_to_sequences(x_validate)
X_test = tokenizer.sequences_to_matrix(sequences, mode='count')

In [44]:
X_train.shape, X_test.shape

((82035, 5000), (20509, 5000))

In [None]:
y_train = np_utils.to_categorical(y_train)[:, 1:]
y_test = np_utils.to_categorical(y_validate)[:, 1:]

In [48]:
model = Sequential()
model.add(Dense(64, input_shape=(MAX_FEATURES,), activation = 'relu'))
model.add(Dropout(0.1))
model.add(Dense(5, activation = 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=nb_epoch, batch_size=batch_size,  validation_split=0.1)

Train on 73831 samples, validate on 8204 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1290a0cd0>

In [49]:
pr = model.predict(X_test)
np.mean(np.argmax(pr, axis=1) == np.argmax(y_test, axis=1))

0.5656053439953191

Теперь пробуем LSTM

In [54]:
from keras.preprocessing import sequence
sequences = tokenizer.texts_to_sequences(x_train)
X_train = sequence.pad_sequences(sequences, maxlen=TEXT_LENGTH)
sequences = tokenizer.texts_to_sequences(x_validate)
X_test = sequence.pad_sequences(sequences, maxlen=TEXT_LENGTH)

In [55]:
from keras.layers import Flatten

In [58]:
max_review_length = max([len(el) for el in X_train])
top_words = 5000

In [60]:
# embedding_vecor_length = 32
# model = Sequential()
# model.add(Embedding(50000, embedding_vecor_length, input_length=500))
# model.add(LSTM(100))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model.summary())
# model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)


embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(100))
model.add(Dense(5, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 32)           160000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 505       
Total params: 213,705
Trainable params: 213,705
Non-trainable params: 0
_________________________________________________________________
None
Train on 82035 samples, validate on 20509 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x133122ed0>

In [61]:
pr = model.predict(X_test)
np.mean(np.argmax(pr, axis=1) == np.argmax(y_test, axis=1))

0.5473694475596079

Теперь добавляем сверточный слой

In [None]:
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, epochs=3, batch_size=64)