In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the tab-separated training file, then pull out the label column as a
# NumPy array and the text column as a pandas Series.
train_tsv = pd.read_csv(
    './data/train.tsv',
    sep='\t',
)
y_train = np.array(train_tsv['Sentiment'])
x_train = train_tsv['Phrase']

In [3]:
from collections import Counter
from scipy.sparse import csr_matrix
import re

class Vectorizer(object):
    """Bag-of-n-grams vectorizer producing sparse count or TF-IDF features.

    Parameters
    ----------
    ngram_range : tuple (min_n, max_n)
        Inclusive range of n-gram lengths (in tokens) to extract.
    max_tf : float
        Stored on the instance but not used by any method of this class.
    use_tfidf : bool
        If True, ``transform`` emits L2-normalised TF-IDF weights; otherwise
        raw n-gram counts.
    max_features : int
        Keep only this many n-grams, ranked by total count over the fitted
        corpus (ties broken by reverse lexicographic order of the n-gram).

    Attributes set by ``fit``
    -------------------------
    vocabulary_ : dict mapping n-gram -> column index (lexicographic order)
    counter_    : Counter of total corpus counts for the kept n-grams
    df_         : array, number of fitted documents containing each n-gram
    idf_        : array, ``log((1 + n_docs) / (1 + df)) + 1``
    """

    def __init__(self, ngram_range=(1,1), max_tf=1.0, use_tfidf=False, max_features=50000):
        self.ngram_range = ngram_range
        self.max_tf = max_tf
        self.use_tfidf = use_tfidf
        self.max_features = max_features

        self.vocabulary_ = dict()
        self.counter_ = Counter()
        self.df_ = []
        self.idf_ = []

    def tokenize(self, text):
        """Lower-case ``text`` and return its maximal runs of word characters."""
        # Equivalent to splitting on every non-word character and dropping the
        # empty pieces; the raw string avoids the invalid '\W' escape warning.
        return re.findall(r'\w+', text.lower())

    def get_ngrams(self, tokens):
        """Return all space-joined n-grams of ``tokens``, shortest lengths first."""
        minlen, maxlen = self.ngram_range[0], self.ngram_range[1]
        return [
            ' '.join(tokens[start:start + length])
            for length in range(minlen, maxlen + 1)
            for start in range(len(tokens) - length + 1)
        ]

    def fit(self, raw_documents):
        """Learn the vocabulary, document frequencies and IDF weights.

        Parameters
        ----------
        raw_documents : iterable of str

        Returns
        -------
        self, so calls can be chained.
        """
        totals = Counter()
        per_doc_counts = []
        for doc in raw_documents:
            doc_counts = Counter(self.get_ngrams(self.tokenize(doc)))
            totals.update(doc_counts)
            per_doc_counts.append(doc_counts)

        # Rank by (count, n-gram) descending, truncate, then order the kept
        # n-grams lexicographically so column indices are deterministic.
        ranked = sorted(totals.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        kept = sorted(ranked[:self.max_features])
        self.counter_ = Counter(dict(kept))
        self.vocabulary_ = {term: col for col, term in enumerate(self.counter_.keys())}

        doc_freq = [0] * len(self.vocabulary_)
        for doc_counts in per_doc_counts:
            for term in doc_counts.keys():
                col = self.vocabulary_.get(term)
                if col is not None:
                    doc_freq[col] += 1
        self.df_ = np.array(doc_freq)
        self.idf_ = np.log((1.0 + len(per_doc_counts)) / (1.0 + self.df_)) + 1.0
        return self

    def transform(self, raw_documents):
        """Vectorize documents into a CSR matrix using the fitted vocabulary.

        Parameters
        ----------
        raw_documents : iterable of str
            Any iterable is accepted (it is consumed once); it does not need
            to support ``len``.

        Returns
        -------
        scipy.sparse.csr_matrix of shape (n_documents, len(vocabulary_)).
        N-grams not in the vocabulary are ignored.
        """
        indptr = [0]
        indices = []
        data = []

        for doc in raw_documents:
            doc_counts = Counter(self.get_ngrams(self.tokenize(doc)))
            # Hoisted out of the per-term loop: it is constant for the document.
            total = sum(doc_counts.values())

            row = []
            for term, freq in doc_counts.items():
                col = self.vocabulary_.get(term)
                if col is None:
                    continue
                indices.append(col)
                if self.use_tfidf:
                    row.append(freq / total * self.idf_[col])
                else:
                    row.append(freq)

            # L2-normalise TF-IDF rows; rows without any known n-gram stay empty.
            if self.use_tfidf and row:
                weights = np.array(row)
                weights /= np.sqrt((weights ** 2).sum())
                row = weights.tolist()

            data += row
            indptr.append(len(data))

        # One indptr entry is appended per document, so this is the row count
        # even when raw_documents has no len().
        n_docs = len(indptr) - 1
        return csr_matrix((data, indices, indptr), shape=(n_docs, len(self.vocabulary_)))

    def fit_transform(self, raw_documents):
        """Fit on ``raw_documents`` and return their vectorized form.

        The input is iterated twice, so it must be re-iterable (not a generator).
        """
        self.fit(raw_documents)
        return self.transform(raw_documents)

In [4]:
# Unigram raw-count features: fit the vocabulary on the training phrases,
# then vectorize those same phrases.
counter = Vectorizer(use_tfidf=False, ngram_range=(1,1))
counter.fit(x_train)
train_counts = counter.transform(x_train)

In [5]:
# Bigram + trigram TF-IDF features, limited to the 50000 most frequent n-grams.
tfidf = Vectorizer(
    ngram_range=(2,3),
    use_tfidf=True,
    max_features=50000,
)
train_features = tfidf.fit_transform(x_train)

In [6]:
from scipy.sparse import hstack
# Place the count columns and the TF-IDF columns side by side and convert to
# CSR, which the later row-indexing code relies on.
train_ft = hstack((train_counts, train_features)).tocsr()
print(train_ft.shape)

(156060, 65275)


In [7]:
# Vectorize the test phrases with the two vectorizers already fitted on the
# training phrases, then stack the results in the same column order as train_ft.
test_tsv = pd.read_csv(
    './data/test.tsv',
    sep='\t',
)
x_test = test_tsv['Phrase']
test_counts, test_features = (vec.transform(x_test) for vec in (counter, tfidf))
test_ft = hstack((test_counts, test_features)).tocsr()

In [8]:
def train_test_split(x, y, test_size=0.1, shuffle=True, random_state=None):
    """Split paired samples ``x`` / ``y`` into a training part and a held-out part.

    Parameters
    ----------
    x : array-like with ``.shape`` supporting integer-array row indexing
        (e.g. ``numpy.ndarray`` or ``scipy.sparse.csr_matrix``).
    y : array-like supporting integer-array indexing, same number of rows as ``x``.
    test_size : float
        Fraction of rows held out; the count is ``int(test_size * n_rows)``.
    shuffle : bool
        Shuffle row order before splitting. If False the last rows are held out.
    random_state : int, optional
        Seed for a private generator so the shuffle is reproducible. When
        None (default) the global ``np.random`` state is used, as before.

    Returns
    -------
    (x_train, x_test, y_train, y_test)

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same number of rows. Without this
        check mismatched inputs would be paired up silently.
    """
    num = x.shape[0]
    num_labels = y.shape[0] if hasattr(y, 'shape') else len(y)
    if num_labels != num:
        raise ValueError(
            'x and y must have the same number of rows, got {} and {}'.format(num, num_labels)
        )

    idx = np.arange(0, num)
    if shuffle:
        if random_state is None:
            np.random.shuffle(idx)
        else:
            np.random.default_rng(random_state).shuffle(idx)

    num_test = int(test_size * num)
    num_train = num - num_test
    train_idx, test_idx = idx[:num_train], idx[num_train:]
    return x[train_idx], x[test_idx], y[train_idx], y[test_idx]

In [9]:
# Rebuild the full training matrix here instead of splitting `train_ft` in
# place: this cell overwrites `train_ft`, so re-running it would otherwise split
# an already-reduced matrix against the full `y_train` and pair rows with the
# wrong labels.
full_train_ft = hstack([train_counts, train_features]).tocsr()
train_ft, val_ft, train_label, val_label = train_test_split(full_train_ft, y_train)

In [10]:
def get_minibatch(data, batch_size=32, shuffle=True):
    """Yield successive mini-batches of rows from ``data``.

    Parameters
    ----------
    data : array-like, or list/tuple of array-likes
        A single indexable container (``numpy.ndarray``, scipy sparse matrix,
        ...) or a list/tuple of such containers that share the same number of
        rows. Each container must support indexing with an integer array.
    batch_size : int
        Maximum number of rows per batch; the last batch may be smaller.
    shuffle : bool
        Visit rows in a random order (drawn from the global ``np.random`` state).

    Yields
    ------
    ``data[batch_idx]`` for a single container, or a list with one batch per
    container when ``data`` is a list/tuple. The row count is taken from the
    first container.
    """
    # isinstance (not an exact type check) so tuples and list subclasses work.
    is_group = isinstance(data, (list, tuple))
    first = data[0] if is_group else data
    # Anything exposing .shape (dense or any sparse format) reports its rows
    # there; plain sequences fall back to len().
    num = first.shape[0] if hasattr(first, 'shape') else len(first)

    idx = np.arange(num)
    if shuffle:
        np.random.shuffle(idx)

    for left in range(0, num, batch_size):
        batch_idx = idx[left:left + batch_size]
        if is_group:
            yield [d[batch_idx] for d in data]
        else:
            yield data[batch_idx]

In [11]:
import numpy as np

def to_onehot(x, num_classes):
    """Turn integer class indices into one-hot rows.

    The rows of a ``num_classes`` x ``num_classes`` identity matrix are
    selected by ``x``, so the result gains a trailing axis of length
    ``num_classes``.
    """
    identity = np.eye(num_classes)
    return identity[x]

def softmax(x):
    """Softmax over the last axis of ``x``.

    The per-row maximum is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (which would turn the output into NaN) for large inputs.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(-1, keepdims=True)

class SoftmaxRegression(object):
    """Linear softmax classifier updated by explicit gradient steps.

    ``w`` has shape (num_features, num_classes) and is initialised with
    samples from ``np.random.uniform`` (default range [0, 1)).
    """

    def __init__(self, num_features, num_classes, lr=1e-3, weight_decay=0.0):
        weight_shape = (num_features, num_classes)
        self.w = np.random.uniform(size=weight_shape)
        self.num_features, self.num_classes = num_features, num_classes
        self.lr, self.weight_decay = lr, weight_decay

    def predict_prob(self, x):
        """Return the softmax of ``x.dot(w)``: one row of class probabilities per sample."""
        logits = x.dot(self.w)
        return softmax(logits)

    def predict(self, x):
        """Return the index of the most probable class for each row of ``x``."""
        return self.predict_prob(x).argmax(-1)

    def gradient_descent(self, x, y):
        """Apply one in-place update of ``w`` from the batch ``(x, y)``.

        The gradient is summed (not averaged) over the rows of the batch, and
        ``weight_decay * w`` is added to it before scaling by ``lr``.
        """
        residual = self.predict_prob(x) - to_onehot(y, self.num_classes)
        gradients = x.transpose().dot(residual)
        step = gradients + self.weight_decay * self.w
        self.w -= self.lr * step

In [12]:
# One weight row per feature column of train_ft, five output classes.
model = SoftmaxRegression(
    num_features=train_ft.shape[1],
    num_classes=5,
    lr=1e-3,
    weight_decay=1e-2,
)

In [13]:
for epoch in range(300):
    # One pass of mini-batch updates over the reshuffled training split.
    for x, y in get_minibatch([train_ft, train_label], batch_size=256):
        model.gradient_descent(x, y)

    # Validation accuracy is only reported on every 10th epoch.
    if (epoch + 1) % 10 != 0:
        continue

    num_correct = 0
    for x, y in get_minibatch([val_ft, val_label], batch_size=256, shuffle=False):
        num_correct += (model.predict(x) == y).sum()
    print(
        'Epoch {}: Accuracy = {}%'.format(
            epoch + 1,
            100.0 * num_correct / len(val_label),
        )
    )

Epoch 10: Accuracy = 39.5040369088812%
Epoch 20: Accuracy = 43.60502370882994%
Epoch 30: Accuracy = 46.46930667691913%
Epoch 40: Accuracy = 47.99436114315007%
Epoch 50: Accuracy = 49.45533769063181%
Epoch 60: Accuracy = 50.46136101499423%
Epoch 70: Accuracy = 51.31359733435858%
Epoch 80: Accuracy = 51.94796872997565%
Epoch 90: Accuracy = 52.5438933743432%
Epoch 100: Accuracy = 53.08214789183647%
Epoch 110: Accuracy = 53.37690631808279%
Epoch 120: Accuracy = 53.71011149557862%
Epoch 130: Accuracy = 54.395745226195054%
Epoch 140: Accuracy = 54.555940023068054%
Epoch 150: Accuracy = 54.64564910931693%
Epoch 160: Accuracy = 54.97885428681276%
Epoch 170: Accuracy = 55.164680251185445%
Epoch 180: Accuracy = 55.4081763424324%
Epoch 190: Accuracy = 55.5683711393054%
Epoch 200: Accuracy = 55.65167243367935%
Epoch 210: Accuracy = 55.74778931180315%
Epoch 220: Accuracy = 55.86953735742663%
Epoch 230: Accuracy = 56.10662565679867%
Epoch 240: Accuracy = 55.99769319492503%
Epoch 250: Accuracy = 56.1

In [14]:
# Predict the test set batch by batch, in the original row order so that the
# predictions line up with the rows of test_tsv.
y_test = []
for x in get_minibatch(test_ft, batch_size=256, shuffle=False):
    y_test.extend(model.predict(x).tolist())

# Attach the predictions and write the two-column result file.
test_tsv['Sentiment'] = y_test
test_tsv[['PhraseId', 'Sentiment']].to_csv(
    'numpy_result.csv',
    index=False,
)

lr=3e-3, weight_decay=1e-3: 0.49380

lr=3e-3, weight_decay=1e-2: 0.50517

lr=1e-3, weight_decay=1e-2: 0.50526