In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import scipy
from scipy.sparse import csr_matrix
from scipy.special import logsumexp, expit
import time
import matplotlib.pyplot as plt
from scipy import sparse
from matplotlib.ticker import MaxNLocator

In [0]:
class BaseSmoothOracle:

    def func(self, w):
        if(sparse.issparse(self.X)):
            if(w.ndim == 1):
                return np.sum(logsumexp(
                    np.vstack((
                            -1 * np.asarray(self.X.dot(w)) * self.y,
                            np.zeros(np.size(self.y))
                            )).T,
                    axis=1
                    )) / np.size(self.y) + self.l2_coef * np.dot(w, w) / 2
            else:
                max_val = np.amax(np.asarray(self.X.dot(w.T)), axis=1)
                return -1 * np.sum(
                    np.squeeze(np.asarray(
                            self.X.multiply(w[self.y.astype(int)]).sum(axis=1)), axis=1) -
                    max_val -
                    logsumexp(
                            np.asarray(self.X.dot(w.T)) -
                            max_val[:, np.newaxis],
                            axis=1)
                    ) / np.size(self.y) + self.l2_coef * np.sum(w * w) / 2
        else:
            if(w.ndim == 1):
                return np.sum(logsumexp(
                        np.vstack((
                                -1 * np.dot(self.X, w) * self.y,
                                np.zeros(np.size(self.y))
                                )).T,
                        axis=1
                        )) / np.size(self.y) + self.l2_coef * np.dot(w, w) / 2
            else:
                max_val = np.amax(np.dot(self.X, w.T), axis=1)
                return -1 * np.sum(
                        np.sum(self.X * w[self.y.astype(int)], axis=1) - max_val -
                        logsumexp(
                                np.dot(self.X, w.T) -
                                max_val[:, np.newaxis],
                                axis=1)
                        ) / np.size(self.y) + self.l2_coef * np.sum(w * w) / 2

    def grad(self, w):
        if(self.X.dtype == int):
            min_val = np.iinfo(self.X.dtype).min
            max_val = np.iinfo(self.X.dtype).max
        elif(self.X.dtype == float):
            min_val = np.finfo(self.X.dtype).min
            max_val = np.finfo(self.X.dtype).max
        if(sparse.issparse(self.X)):
            if(w.ndim == 1):
                arg = np.asarray(self.X.dot(w)) * self.y
                return np.squeeze(np.asarray(
                    self.X.multiply(self.y[:, np.newaxis]).multiply(
                        -1 * (np.clip(
                                    np.exp(-1 * arg),
                                    min_val, max_val
                            ) * expit(arg))[:, np.newaxis]
                    ).sum(axis=0)), axis=0) / \
                    np.size(self.y) + self.l2_coef * w
            else:
                mask = np.arange(np.size(w, 0))
                mask = mask[:, np.newaxis] == self.y[np.newaxis, :]
                max_arg = np.amax(np.asarray(self.X.dot(w.T)), axis=1)
                arg = np.asarray(self.X.dot(w.T)) - max_arg[:, np.newaxis]
                return self.X.transpose().dot( 
                    -1 * mask.T +
                    np.clip(np.exp(arg), min_val, max_val) /
                    np.clip(
                        np.sum(np.exp(arg), axis=1),
                        min_val, max_val)[:, np.newaxis]
                ).T / np.size(self.y) + self.l2_coef * w
        else:
            if(w.ndim == 1):
                arg = np.dot(self.X, w) * self.y
                return np.sum(
                        -1 * (np.clip(
                                np.exp(-1 * arg),
                                min_val, max_val
                        ) * expit(arg))[:, np.newaxis] *
                        self.X * self.y[:, np.newaxis],
                        axis=0
                        ) / np.size(self.y) + self.l2_coef * w
            else: 
                mask = np.arange(np.size(w, 0))
                mask = mask[:, np.newaxis] == self.y[np.newaxis, :]
                max_arg = np.amax(np.dot(self.X, w.T), axis=1)
                arg = np.dot(self.X, w.T) - max_arg[:, np.newaxis]
                return np.dot(
                    self.X.T, 
                    -1 * mask.T + 
                    np.clip(np.exp(arg), min_val, max_val) /
                    np.clip(
                        np.sum(np.exp(arg), axis=1),
                        min_val, max_val
                    )[:, np.newaxis]).T / np.size(self.y) + self.l2_coef * w
                


class BinaryLogistic(BaseSmoothOracle):

    def __init__(self, l2_coef):
        self.l2_coef = l2_coef

    def func(self, X, y, w):
        self.X = X
        self.y = y
        return super().func(w)

    def grad(self, X, y, w):
        self.X = X
        self.y = y
        return super().grad(w)


class MulticlassLogistic(BaseSmoothOracle):

    def __init__(self, l2_coef):
        self.l2_coef = l2_coef

    def func(self, X, y, w):
        self.X = X
        self.y = y
        return super().func(w)

    def grad(self, X, y, w):
        self.X = X
        self.y = y
        return super().grad(w)


In [0]:
class GDClassifier:

    def __init__(self, loss_function, step_alpha=0.1, step_beta=1,
                 tolerance=1e-5, max_iter=1000, **kwargs):
        self.loss_function = loss_function
        self.step_alpha = step_alpha
        self.step_beta = step_beta
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.kwargs = kwargs

    def fit(self, X, y, X_test=np.zeros(1), y_test=np.zeros(1), w_0=None, trace=False):
        if(self.loss_function == 'binary_logistic'):
            if(w_0 is None):
                w_0 = np.zeros(np.size(X, 1))
            self.lr = BinaryLogistic(**self.kwargs)
        elif(self.loss_function == 'multinomial_logistic'):
            if(w_0 is None):
                w_0 = np.zeros((np.size(np.unique(y)), np.size(X, 1)))
            self.lr = MulticlassLogistic(**self.kwargs)
        self.w = w_0.copy()
        last_func = self.lr.func(X, y, self.w)
        curr_func = last_func
        if(trace):
            self.history = dict()
            self.history['time'] = [0.0]
            self.history['func'] = [last_func]
            self.history['acc'] = [np.sum(np.equal(y_test, self.predict(X_test))) / np.size(y_test)]
            start = time.time()
        num_iter = 0
        while(num_iter == 0 or
              (np.abs(curr_func - last_func) >= self.tolerance and
               num_iter < self.max_iter)):
            num_iter += 1
            self.w -= self.lr.grad(X, y, self.w) * \
                self.step_alpha / num_iter ** self.step_beta
            last_func = curr_func
            curr_func = self.lr.func(X, y, self.w)
            if(trace):
                end = time.time()
                self.history['time'].append(end - start)
                self.history['func'].append(curr_func)
                self.history['acc'].append(np.sum(np.equal(y_test, self.predict(X_test))) / np.size(y_test))
        if(trace):
            return self.history

    def predict(self, X):
        if(self.loss_function == 'binary_logistic'):
            if(sparse.issparse(X)):
                return np.sign(np.asarray(X.dot(self.w.T)))
            else:
                return np.sign(np.dot(X, self.w.T))
        else:
            if(sparse.issparse(X)):
                return np.argmax(np.asarray(X.dot(self.w.T)), axis=1)
            else:
                return np.argmax(np.dot(X, self.w.T), axis=1)


    def predict_proba(self, X):
        if(self.loss_function == 'binary_logistic'):
            if(sparse.issparse(X)):
                return expit(np.asarray(X.dot(self.w.T)))
            else:
                return expit(np.dot(X, self.w.T))
        elif(self.loss_function == 'multinomial_logistic'):
            if(sparse.issparse(X)):
                softmax = np.exp(np.asarray(X.dot(self.w.T)) -
                                 np.amax(
                                         np.asarray(X.dot(self.w.T)),
                                         axis=1
                                         )[:, np.newaxis]
                                 )
            else:
                softmax = np.exp(np.dot(X, self.w.T) -
                                 np.amax(
                                         np.dot(X, self.w.T),
                                         axis=1
                                         )[:, np.newaxis]
                                 )
            return softmax / np.sum(softmax, axis=1)[:, np.newaxis]

    def get_objective(self, X, y):
        return self.lr.func(X, y, self.w)

    def get_gradient(self, X, y):
        return self.lr.grad(X, y, self.w)

    def get_weights(self):
        return self.w


class SGDClassifier(GDClassifier):

    def __init__(self, loss_function, batch_size, step_alpha=1, step_beta=0,
                 tolerance=1e-5, max_iter=100000, random_seed=153, **kwargs):
        GDClassifier.__init__(self, loss_function=loss_function, step_alpha=step_alpha, step_beta=step_beta,
                             tolerance=tolerance, max_iter=max_iter, **kwargs)
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.step_alpha = step_alpha
        self.step_beta = step_beta
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.random_seed = random_seed
        self.kwargs = kwargs

    def fit(self, X, y, X_test=np.zeros(1), y_test=np.zeros(1), w_0=None, trace=False, log_freq=1):
        np.random.seed(self.random_seed)
        if(self.loss_function == 'binary_logistic'):
            if(w_0 is None):
                w_0 = np.zeros(np.size(X, 1))
            self.lr = BinaryLogistic(**self.kwargs)
        elif(self.loss_function == 'multinomial_logistic'):
            if(w_0 is None):
                w_0 = np.zeros((np.size(np.unique(y)), np.size(X, 1)))
            self.lr = MulticlassLogistic(**self.kwargs)
        self.w = w_0.copy()
        last_func = self.lr.func(X, y, self.w)
        curr_func = last_func
        if(trace):
            self.history = dict()
            self.history['epoch_num'] = [0.0]
            self.history['time'] = [0.0]
            self.history['func'] = [last_func]
            self.history['weights_diff'] = [0.0]
            self.history['acc'] = [np.sum(np.equal(y_test, self.predict(X_test))) / np.size(y_test)]
            start = time.time()
            last_epoch_num = 0
            curr_epoch_num = 0
            last_w = self.w.copy()
            curr_w = last_w.copy()
        num_iter = 0
        ind_list = np.arange(np.size(X, 0))
        np.random.shuffle(ind_list)
        curr_ind = 0
        while(num_iter == 0 or
              (np.abs(curr_func - last_func) >= self.tolerance and
               num_iter < self.max_iter)):
            if(curr_ind >= np.size(X, 0)):
                np.random.shuffle(ind_list)
                curr_ind = 0
            num_iter += 1
            self.w -= self.lr.grad(
                    X[curr_ind:curr_ind + self.batch_size, :], 
                    y[curr_ind:curr_ind + self.batch_size], self.w
                    ) * self.step_alpha / num_iter ** self.step_beta
            last_func = curr_func.copy()
            curr_func = self.lr.func(X, y, self.w)
            if(trace):
                if(curr_ind  + self.batch_size >= np.size(ind_list)):
                    curr_epoch_num += (np.size(ind_list) - curr_ind) / \
                        np.size(ind_list)
                else:
                    curr_epoch_num += self.batch_size / np.size(ind_list)
                if(curr_epoch_num - last_epoch_num >= log_freq):
                    end = time.time()
                    last_w = curr_w.copy()
                    curr_w = self.w
                    self.history['epoch_num'].append(curr_epoch_num)
                    self.history['time'].append(end - start)
                    self.history['func'] .append(curr_func)
                    self.history['acc'].append(np.sum(np.equal(y_test, self.predict(X_test))) / np.size(y_test))
                    self.history['weights_diff'].append(
                            np.sum(
                                    (last_w - curr_w) ** 2, 
                                    axis=-1))
                    last_epoch_num = curr_epoch_num
            curr_ind += self.batch_size
        if(trace):
            return self.history
        
    def predict(self, X):
        return super().predict(X)

In [0]:
class MulticlassStrategy:   
    def __init__(self, classifier, mode, **kwargs):
        self.classifier = classifier
        self.mode = mode
        self.kwargs = kwargs
        pass
        
        
    def fit(self, X, y):
        self.num_classes = np.size(np.unique(y))
        if(self.mode == 'one_vs_all'):
            self.w = np.zeros((self.num_classes, X.shape[1]))
            for i in range(self.num_classes):
                mask = y == i
                curr_y = 2 * mask - 1 # 1 if y == i, -1 if not
                curr_lr = self.classifier(**self.kwargs)
                curr_lr.fit(X, curr_y)
                self.w[i] = curr_lr.get_weights()
        elif(self.mode == 'all_vs_all'):
            self.w = np.zeros((self.num_classes * (self.num_classes - 1) // 2, 
                              X.shape[1]))
            num = 0
            i_list = []
            j_list = []
            for i in range(self.num_classes - 1):
                for j in range(i + 1, self.num_classes):
                    i_list.append(i)
                    j_list.append(j)
                    matr_i = y == i
                    matr_j = y == j
                    matr = np.logical_or(matr_i, matr_j)
                    curr_x = X[matr]
                    curr_y = y.copy()
                    curr_y[matr_i] = 1
                    curr_y[matr_j] = -1
                    curr_y = curr_y[matr]
                    curr_lr = self.classifier(**self.kwargs)
                    curr_lr.fit(curr_x, curr_y)
                    self.w[num] = curr_lr.get_weights() 
                    num += 1
            self.i_arr = np.array(i_list)
            self.j_arr = np.array(j_list)
        
    def predict(self, X):
        if(self.mode == 'one_vs_all'):
            if(sparse.issparse(X)):
                return np.argmax(np.asarray(X.dot(self.w.T)), axis=1)
            else:
                return np.argmax(np.dot(X, self.w.T), axis=1)
        elif(self.mode == 'all_vs_all'):
            if(sparse.issparse(X)):
                pred = np.asarray(X.dot(self.w.T))
            else:
                pred = np.dot(X, self.w.T)
            mask = pred > 0
            self.i_arr = np.broadcast_to(self.i_arr, pred.shape)
            self.j_arr = np.broadcast_to(self.j_arr, pred.shape)
            pred[mask] = self.i_arr[mask]                            
            mask = np.logical_not(mask)
            pred[mask] = self.j_arr[mask]
            return np.argmax(np.apply_along_axis(
                lambda a:np.bincount(a, minlength=self.num_classes), 
                -1, pred.astype(int)), axis=1)

In [0]:
with open('/content/gdrive/My Drive/task2/news_test.json') as data_file:    
    test = pd.read_json(data_file)
with open('/content/gdrive/My Drive/task2/news_train.json') as data_file:    
    train = pd.read_json(data_file)

In [0]:
x_test = test['text'].values
x_train = train['text'].values
y_test = test['sentiment'].values
y_train = train['sentiment'].values

In [0]:
for i in range(np.size(x_test)):
    x_test[i] = x_test[i].lower()
    x_test[i] = re.sub('[^0-9а-я]+', ' ', x_test[i])
for i in range(np.size(x_train)):
    x_train[i] = x_train[i].lower()
    x_train[i] = re.sub('[^0-9а-я]+', ' ', x_train[i])

In [0]:
y_train_mult = np.zeros(y_train.shape)
neg_matr = y_train == 'negative'
y_train_mult[neg_matr] = 0
neutral_matr = y_train == 'neutral'
y_train_mult[neutral_matr] = 1
pos_matr = y_train == 'positive'
y_train_mult[pos_matr] = 2

y_test_mult = np.zeros(y_test.shape)
neg_matr = y_test == 'negative'
y_test_mult[neg_matr] = 0
neutral_matr = y_test == 'neutral'
y_test_mult[neutral_matr] = 1
pos_matr = y_test == 'positive'
y_test_mult[pos_matr] = 2

In [0]:
vectorizer = CountVectorizer(max_df=5)
x_train_mult = vectorizer.fit_transform(x_train_file)
x_test_mult = vectorizer.transform(x_test_file)
print(x_test_mult.shape)

(1240, 39335)


In [16]:
start = time.time()
lr = GDClassifier(loss_function='multinomial_logistic', step_alpha=1, step_beta=0.1, 
            tolerance=1e-4, l2_coef=0.001)
lr.fit(x_train_mult, y_train_mult)
y_pred = lr.predict(x_test_mult)
print('multinomial', np.sum(np.equal(y_pred, y_test_mult)) / np.size(y_test_mult))
end = time.time()
print('wall time, sec:', end - start)

multinomial 0.6266129032258064
wall time, sec: 530.5039427280426


In [0]:
bin_lr = SGDClassifier
args_dict = dict()
args_dict['batch_size'] = 100
args_dict['loss_function'] = 'binary_logistic'
args_dict['max_iter'] = 1000
args_dict['step_alpha'] = 0.1
args_dict['step_beta'] = 0.1
args_dict['l2_coef'] = 1
start = time.time()
lr = MulticlassStrategy(classifier=bin_lr, mode='all_vs_all', **args_dict)
lr.fit(x_train_mult, y_train_mult)
y_pred = lr.predict(x_test_mult)
print('all vs all', np.sum(np.equal(y_pred, y_test_mult)) / np.size(y_test_mult))
end = time.time()
print('wall time, sec:', end - start)

all vs all 0.5701612903225807
wall time, sec: 78.14098167419434


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
x_train_mult = vectorizer.fit_transform(x_train_file)
x_test_mult = vectorizer.transform(x_test_file)

In [0]:
start = time.time()
lr = GDClassifier(loss_function='multinomial_logistic', step_alpha=10, step_beta=2, l2_coef=1)
lr.fit(x_train_mult, y_train_mult, w_0=np.ones((3, np.size(x_train_mult, 1))))
y_pred = lr.predict(x_test_mult)
print('multinomial', np.sum(np.equal(y_pred, y_test_mult)) / np.size(y_test_mult))
end = time.time()
print('wall time, sec:', end - start)

In [0]:
import pickle

with open('/content/gdrive/My Drive/task2/test.txt', 'rb') as data_file:    
    x_test_file = pickle.load(data_file)
    
with open('/content/gdrive/My Drive/task2/train.txt', 'rb') as data_file:    
    x_train_file = pickle.load(data_file)

In [0]:
x_test_mult.shape

(1240, 56532)

In [248]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test_mult, y_pred)

array([[ 85, 106,  24],
       [ 22, 475, 108],
       [  1, 143, 276]])

In [264]:
mask_2 = y_pred == 2
mask_0 = y_test_mult == 0
mask = np.logical_and(mask_2, mask_0)
print(np.sum(mask))
print(x_test[mask][7])

24



По данным союза предприятий автомобильной отрасли Казахстана «КазАвтоПром» за  9 месяцев 2016 года  казахстанцы приобрели 30,8 тыс. новых  легковых автомобилей на сумму $569 млн, что на 59% ниже аналогичного показателя минувшего года.
По результатам января-сентября официальными дилерами было реализовано 30 759  легковых и легких коммерческих автомобилей, что на 59% ниже аналогичного показателя минувшего года (74 911 ед.). Продажи в стоимостном выражении в отчетный период составили $569 млн., уступив 58,7% значению соответствующего периода 2015 года.
В сентябре объем продаж сократился до отметки 3 207 автомобилей против 8129 ед. в том же месяце 2015 г. (-60%).
Отрицательная динамика сбыта на автомобильном рынке республики сохраняется начиная со II квартала 2014 года. Наряду с другими сегментами казахстанского ретейла авторынок продолжает испытывать негативные последствия девальвации национальной валюты и снижения доступности кредитных ресурсов.
Казахстанский автопарк «помолодел»… 