In [7]:
%matplotlib inline
# Numerical and plotting stack used by every cell below.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Dark seaborn style and a 16 x 12 default figure size for all plots.
sns.set_style("dark")
plt.rcParams['figure.figsize'] = 16, 12
# NOTE(review): recent tqdm releases deprecate `tqdm.tqdm_notebook` in favour of
# `tqdm.notebook.tqdm` (a deprecation warning is emitted on use) - confirm against
# the installed version before switching.
from tqdm import tqdm_notebook
import pandas as pd
from collections import defaultdict
# Input files, given relative to the notebook's working directory.
# NOTE(review): TAGS_FILE_NAME is not read in any visible cell; presumably `top_tags`
# (used as a default argument below) is loaded from it in another cell - confirm.
TAGS_FILE_NAME = 'desktop/data/top10_tags.tsv'
DS_FILE_NAME = 'desktop/data/stackoverflow_sample_125k.tsv'

In [None]:
class LogRegressor():
    """Online one-vs-rest logistic regression trained while streaming a file.

    One independent binary classifier is kept per tag. Weights are stored
    sparsely, keyed by the integer id each word receives in ``self._vocab``.
    """

    def __init__(self, tags=top_tags):
        # word -> integer id, assigned in order of first appearance
        self._vocab = {}
        # tag -> {word id -> weight}; ids never written read as 0
        self._w = {tag: defaultdict(int) for tag in tags}
        # tag -> bias term
        self._b = {tag: 0 for tag in tags}
        self._tags = set(tags)

    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16):
        """Stream ``fname`` once, updating the model on the first samples.

        Each line must hold exactly two tab-separated fields: the text and
        the tags, both space-separated. Other lines are skipped and not counted.

        Parameters
        ----------
        fname : path of the file to read.
        top_n_train : number of leading valid lines used for weight updates;
            later lines only contribute to the recorded loss.
        total : forwarded to the progress bar only.
        learning_rate : gradient step size.
        tolerance : clamp keeping the argument of ``log`` away from zero.

        The per-sample loss (summed over tags) is stored in ``self._loss``.
        """
        self._loss = []
        n_seen = 0

        with open(fname, 'r') as source:
            for raw_line in tqdm_notebook(source, total=total, mininterval=1):
                fields = raw_line.strip().split('\t')
                if len(fields) != 2:
                    continue
                words = fields[0].split(' ')
                true_tags = set(fields[1].split(' '))
                is_training = n_seen < top_n_train

                sample_loss = 0
                for tag in self._tags:
                    y = int(tag in true_tags)
                    weights = self._w[tag]

                    # Linear score: bias plus the weight of every known word.
                    z = self._b[tag]
                    for word in words:
                        if word not in self._vocab:
                            if not is_training:
                                # Unknown words are ignored outside training.
                                continue
                            self._vocab[word] = len(self._vocab)
                        z += weights[self._vocab[word]]

                    # Sigmoid evaluated so that exp() never sees a large
                    # positive argument.
                    if z >= 0:
                        sigma = 1 / (1 + np.exp(-z))
                    else:
                        sigma = 1 - 1 / (1 + np.exp(z))

                    # Log-loss with the probability clamped by `tolerance`.
                    if y == 1:
                        sample_loss += -np.log(np.max([tolerance, sigma]))
                    else:
                        sample_loss += -np.log(1 - np.min([1 - tolerance, sigma]))

                    if is_training:
                        dLdw = y - sigma
                        step = -learning_rate * dLdw
                        for word in words:
                            weights[self._vocab[word]] -= step
                        self._b[tag] -= step

                n_seen += 1
                self._loss.append(sample_loss)

In [None]:
# Build a classifier with the default tag list and stream the default data file
# through it; the per-sample losses are left in model._loss.
model = LogRegressor()
model.iterate_file()

In [None]:
# Plot a 10,000-sample rolling mean of the recorded loss. The slice drops the last
# 25,000 entries - presumably the non-training tail (125000 - 100000 with the
# default arguments, assuming no line was skipped as malformed).
plt.plot(pd.Series(model._loss[:-25000]).rolling(10000).mean());

In [None]:
# Average the loss over entries [-35000:-25000], i.e. the 10,000 samples that
# immediately precede the final 25,000 entries.
print('Mean of the loss function on the last 10k train samples: %0.2f' % np.mean(model._loss[-35000:-25000]))

3: 19.74

In [None]:
class LogRegressor():
    """Online one-vs-rest logistic regression that also scores held-out samples.

    One binary classifier is kept per tag, with sparse weights keyed by the
    integer id each word receives in ``self._vocab``.
    """

    def __init__(self, tags=top_tags):
        # word -> integer id, assigned in order of first appearance
        self._vocab = {}
        # tag -> {word id -> weight}; ids never written read as 0
        self._w = {tag: defaultdict(int) for tag in tags}
        # tag -> bias term
        self._b = {tag: 0 for tag in tags}
        self._tags = set(tags)

    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16,
                     accuracy_level=0.9):
        """Train on the first ``top_n_train`` valid lines and score the rest.

        Each line must hold exactly two tab-separated fields: the text and
        the tags, both space-separated. Other lines are skipped and not counted.

        Parameters
        ----------
        fname : path of the file to read.
        top_n_train : number of leading valid lines used for weight updates.
        total : forwarded to the progress bar only.
        learning_rate : gradient step size.
        tolerance : clamp keeping the argument of ``log`` away from zero.
        accuracy_level : a tag is predicted when its probability exceeds this.

        Returns
        -------
        Mean Jaccard score between true and predicted tag sets over the
        samples that follow the training part. The per-sample loss is stored
        in ``self._loss``.
        """
        self._loss = []
        jaccard_scores = []
        n_seen = 0

        with open(fname, 'r') as source:
            for raw_line in tqdm_notebook(source, total=total, mininterval=1):
                fields = raw_line.strip().split('\t')
                if len(fields) != 2:
                    continue
                words = fields[0].split(' ')
                true_tags = set(fields[1].split(' '))
                is_training = n_seen < top_n_train

                sample_loss = 0
                # Stays None for training samples so they are not scored.
                predicted = None

                for tag in self._tags:
                    y = int(tag in true_tags)
                    weights = self._w[tag]

                    # Linear score: bias plus the weight of every known word.
                    z = self._b[tag]
                    for word in words:
                        if word not in self._vocab:
                            if not is_training:
                                # Unknown words are ignored outside training.
                                continue
                            self._vocab[word] = len(self._vocab)
                        z += weights[self._vocab[word]]

                    # Sigmoid evaluated so that exp() never sees a large
                    # positive argument.
                    if z >= 0:
                        sigma = 1/(1 + np.exp(-z))
                    else:
                        sigma = 1 - 1/(1 + np.exp(z))

                    # Log-loss with the probability clamped by `tolerance`.
                    if y == 1:
                        sample_loss += -np.log(np.max([tolerance, sigma]))
                    else:
                        sample_loss += -np.log(1 - np.min([1 - tolerance, sigma]))

                    if is_training:
                        dLdw = y - sigma
                        step = -learning_rate*dLdw
                        for word in words:
                            weights[self._vocab[word]] -= step
                        self._b[tag] -= step
                    else:
                        if predicted is None:
                            predicted = set()
                        if sigma > accuracy_level:
                            predicted.add(tag)

                n_seen += 1
                self._loss.append(sample_loss)

                if predicted is not None:
                    shared = len(true_tags & predicted)
                    combined = len(true_tags | predicted)
                    jaccard_scores.append(shared/combined)

        return np.mean(jaccard_scores)

In [None]:
# Re-create the model from the class defined just above, run it over the default
# file and print the value returned by iterate_file (mean Jaccard score on the
# samples after the training part) with two decimals.
model = LogRegressor()
acc = model.iterate_file()
print('%0.2f' % acc)

4: 0.59

In [None]:
class LogRegressor():
    """Online one-vs-rest logistic regression with an L2 weight penalty.

    One binary classifier is kept per tag, with sparse weights keyed by the
    integer id each word receives in ``self._vocab``.
    """

    def __init__(self, tags=top_tags):
        # word -> integer id, assigned in order of first appearance
        self._vocab = {}
        # tag -> {word id -> weight}; ids never written read as 0
        self._w = {tag: defaultdict(int) for tag in tags}
        # tag -> bias term (not regularised)
        self._b = {tag: 0 for tag in tags}
        self._tags = set(tags)

    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16,
                     accuracy_level=0.9,
                     lmbda=0.01):
        """Train on the first ``top_n_train`` valid lines and score the rest.

        Each line must hold exactly two tab-separated fields: the text and
        the tags, both space-separated. Other lines are skipped and not counted.

        Parameters
        ----------
        fname : path of the file to read.
        top_n_train : number of leading valid lines used for weight updates.
        total : forwarded to the progress bar only.
        learning_rate : gradient step size.
        tolerance : clamp keeping the argument of ``log`` away from zero.
        accuracy_level : a tag is predicted when its probability exceeds this.
        lmbda : L2 coefficient; the penalty step is
            ``learning_rate * lmbda * w`` per distinct word of a sample.

        Returns
        -------
        Mean Jaccard score between true and predicted tag sets over the
        samples that follow the training part. The per-sample loss is stored
        in ``self._loss``.
        """
        self._loss = []
        jaccard_scores = []
        n_seen = 0

        with open(fname, 'r') as source:
            for raw_line in tqdm_notebook(source, total=total, mininterval=1):
                fields = raw_line.strip().split('\t')
                if len(fields) != 2:
                    continue
                words = fields[0].split(' ')
                true_tags = set(fields[1].split(' '))
                is_training = n_seen < top_n_train

                sample_loss = 0
                # Stays None for training samples so they are not scored.
                predicted = None

                for tag in self._tags:
                    y = int(tag in true_tags)
                    weights = self._w[tag]

                    # Linear score: bias plus the weight of every known word.
                    z = self._b[tag]
                    for word in words:
                        if word not in self._vocab:
                            if not is_training:
                                # Unknown words are ignored outside training.
                                continue
                            self._vocab[word] = len(self._vocab)
                        z += weights[self._vocab[word]]

                    # Sigmoid evaluated so that exp() never sees a large
                    # positive argument.
                    if z >= 0:
                        sigma = 1/(1 + np.exp(-z))
                    else:
                        sigma = 1 - 1/(1 + np.exp(z))

                    # Log-loss with the probability clamped by `tolerance`.
                    if y == 1:
                        sample_loss += -np.log(np.max([tolerance, sigma]))
                    else:
                        sample_loss += -np.log(1 - np.min([1 - tolerance, sigma]))

                    if is_training:
                        dLdw = y - sigma
                        grad_step = -learning_rate*dLdw

                        # The penalty is applied on a word's first occurrence
                        # in the sample only; repeats get the gradient step alone.
                        penalised = set()
                        for word in words:
                            word_id = self._vocab[word]
                            if word in penalised:
                                penalty = 0
                            else:
                                penalty = learning_rate*lmbda*weights[word_id]
                                penalised.add(word)
                            weights[word_id] -= grad_step + penalty
                        self._b[tag] -= grad_step
                    else:
                        if predicted is None:
                            predicted = set()
                        if sigma > accuracy_level:
                            predicted.add(tag)

                n_seen += 1
                self._loss.append(sample_loss)

                if predicted is not None:
                    shared = len(true_tags & predicted)
                    combined = len(true_tags | predicted)
                    jaccard_scores.append(shared/combined)

        return np.mean(jaccard_scores)

In [None]:
# Re-create the model from the class defined just above, print the value returned
# by iterate_file with two decimals, then plot a 10,000-sample rolling mean of the
# recorded loss with the last 25,000 entries dropped.
model = LogRegressor()
acc = model.iterate_file()
print('%0.2f' % acc)
plt.plot(pd.Series(model._loss[:-25000]).rolling(10000).mean());

5: 0.52

In [None]:
class LogRegressor():
    """Online one-vs-rest logistic regression with an elastic-net penalty.

    One binary classifier is kept per tag, with sparse weights keyed by the
    integer id each word receives in ``self._vocab``.
    """

    def __init__(self, tags=top_tags):
        # word -> integer id, assigned in order of first appearance
        self._vocab = {}
        # tag -> {word id -> weight}; ids never written read as 0
        self._w = dict([(t, defaultdict(int)) for t in tags])
        # tag -> bias term (not regularised)
        self._b = dict([(t, 0) for t in tags])
        self._tags = set(tags)

    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16,
                     accuracy_level=0.9,
                     lmbda=0.0002,
                     gamma=0.1):
        """Train on the first ``top_n_train`` valid lines and score the rest.

        Each line must hold exactly two tab-separated fields: the text and
        the tags, both space-separated. Other lines are skipped and not counted.

        Parameters
        ----------
        fname : path of the file to read.
        top_n_train : number of leading valid lines used for weight updates.
        total : forwarded to the progress bar only.
        learning_rate : gradient step size.
        tolerance : clamp keeping the argument of ``log`` away from zero.
        accuracy_level : a tag is predicted when its probability exceeds this.
        lmbda : overall regularisation coefficient.
        gamma : mixing weight; ``gamma`` scales the L2 term (``2 * w``) and
            ``1 - gamma`` scales the L1 term (``sign(w)``).

        Returns
        -------
        Mean Jaccard score between true and predicted tag sets over the
        samples that follow the training part. The per-sample loss is stored
        in ``self._loss``.
        """
        self._loss = []
        n = 0
        accuracy = []
        with open(fname, 'r') as f:
            for line in tqdm_notebook(f, total=total, mininterval=1):
                pair = line.strip().split('\t')
                if len(pair) != 2:
                    continue
                sentence, tags = pair
                sentence = sentence.split(' ')
                tags = set(tags.split(' '))

                sample_loss = 0
                # Stays None for training samples so they are not scored.
                predicted_tags = None

                for tag in self._tags:
                    y = int(tag in tags)

                    # Linear score: bias plus the weight of every known word.
                    z = self._b[tag]
                    for word in sentence:
                        if n >= top_n_train and word not in self._vocab:
                            # Unknown words are ignored outside training.
                            continue
                        if word not in self._vocab:
                            self._vocab[word] = len(self._vocab)
                        z += self._w[tag][self._vocab[word]]

                    # Sigmoid evaluated so that exp() never sees a large
                    # positive argument.
                    sigma = 1/(1 + np.exp(-z)) if z >= 0 else 1 - 1/(1 + np.exp(z))

                    # Log-loss with the probability clamped by `tolerance`.
                    sample_loss += -y * np.log(np.max([tolerance, sigma])) if y == 1 else \
                                   -(1 - y) * np.log(1 - np.min([1 - tolerance, sigma]))

                    if n < top_n_train:
                        dLdw = y - sigma

                        # The penalty is applied on a word's first occurrence
                        # in the sample only; repeats get the gradient step alone.
                        r_buf = {}
                        for word in sentence:
                            if word not in r_buf:
                                r = 2 * learning_rate * lmbda * gamma * self._w[tag][self._vocab[word]] + \
                                    learning_rate * lmbda*(1 - gamma) * np.sign(self._w[tag][self._vocab[word]])
                                r_buf[word] = True
                            else:
                                r = 0

                            self._w[tag][self._vocab[word]] -= -learning_rate * dLdw + r
                        self._b[tag] -= -learning_rate * dLdw
                    else:
                        if predicted_tags is None:
                            predicted_tags = []
                        if sigma > accuracy_level:
                            predicted_tags.append(tag)

                n += 1

                self._loss.append(sample_loss)
                if predicted_tags is not None:
                    accuracy.append(len(tags.intersection(predicted_tags))/len(tags.union(predicted_tags)))

        # Must run only after the whole file has been consumed: returning from
        # inside the loop would stop after the first valid line.
        return(np.mean(accuracy))

In [None]:
# Re-create the model from the class defined just above, print the value returned
# by iterate_file with two decimals, then plot a 10,000-sample rolling mean of the
# recorded loss with the last 25,000 entries dropped.
model = LogRegressor()
acc = model.iterate_file()
print('%0.2f' % acc)
plt.plot(pd.Series(model._loss[:-25000]).rolling(10000).mean());

7: 0.59 

In [None]:
# Reverse lookup: integer word id -> word.
model._vocab_inv = {word_id: word for word, word_id in model._vocab.items()}

# For every tag, print the five words with the largest weights.
for tag in model._tags:
    print(tag, ':', ', '.join(
        model._vocab_inv[word_id]
        for word_id, _weight in sorted(model._w[tag].items(),
                                       key=lambda item: item[1],
                                       reverse=True)[:5]
    ))

8: c#

In [None]:
class LogRegressor():
    """Online one-vs-rest logistic regression with elastic net and vocabulary pruning.

    One binary classifier is kept per tag, with sparse weights keyed by the
    integer id each word receives in ``self._vocab``. Word frequencies are
    collected during training so that ``filter_vocab`` can keep only the
    most frequent words.
    """

    def __init__(self, tags=top_tags):
        # word -> integer id, assigned in order of first appearance
        self._vocab = {}
        # tag -> {word id -> weight}; ids never written read as 0
        self._w = dict([(t, defaultdict(int)) for t in tags])
        # tag -> bias term (not regularised)
        self._b = dict([(t, 0) for t in tags])
        self._tags = set(tags)
        # word id -> number of occurrences seen while training with update_vocab
        self._word_stats = defaultdict(int)

    def iterate_file(self,
                     fname=DS_FILE_NAME,
                     top_n_train=100000,
                     total=125000,
                     learning_rate=0.1,
                     tolerance=1e-16,
                     accuracy_level=0.9,
                     lmbda=0.0002,
                     gamma=0.1,
                     update_vocab=True):
        """Train on the first ``top_n_train`` valid lines and score the rest.

        Each line must hold exactly two tab-separated fields: the text and
        the tags, both space-separated. Other lines are skipped and not counted.

        Parameters
        ----------
        fname : path of the file to read.
        top_n_train : number of leading valid lines used for weight updates.
        total : forwarded to the progress bar only.
        learning_rate : gradient step size.
        tolerance : clamp keeping the argument of ``log`` away from zero.
        accuracy_level : a tag is predicted when its probability exceeds this.
        lmbda : overall regularisation coefficient.
        gamma : mixing weight; ``gamma`` scales the L2 term (``2 * w``) and
            ``1 - gamma`` scales the L1 term (``sign(w)``).
        update_vocab : when True, unseen training words are added to the
            vocabulary and word counts are accumulated; when False, words
            outside the current vocabulary are ignored everywhere.

        Returns
        -------
        Mean Jaccard score between true and predicted tag sets over the
        samples that follow the training part. The per-sample loss is stored
        in ``self._loss``.
        """
        self._loss = []
        n = 0
        accuracy = []
        with open(fname, 'r') as f:
            for line in tqdm_notebook(f, total=total, mininterval=1):
                pair = line.strip().split('\t')
                if len(pair) != 2:
                    continue
                sentence, tags = pair
                sentence = sentence.split(' ')
                tags = set(tags.split(' '))
                is_training = n < top_n_train

                sample_loss = 0
                # Stays None for training samples so they are not scored.
                predicted_tags = None

                for ix_tag, tag in enumerate(self._tags):
                    y = int(tag in tags)

                    # Linear score: bias plus the weight of every known word.
                    z = self._b[tag]
                    for word in sentence:
                        if word not in self._vocab:
                            # New words are admitted only while training with
                            # a growing vocabulary; otherwise they are skipped.
                            if not (is_training and update_vocab):
                                continue
                            self._vocab[word] = len(self._vocab)
                        # Count each occurrence once per sample, not once per tag.
                        if update_vocab and ix_tag == 0 and is_training:
                            self._word_stats[self._vocab[word]] += 1
                        z += self._w[tag][self._vocab[word]]

                    # Sigmoid evaluated so that exp() never sees a large
                    # positive argument.
                    sigma = 1/(1 + np.exp(-z)) if z >= 0 else 1 - 1/(1 + np.exp(z))

                    # Log-loss with the probability clamped by `tolerance`.
                    sample_loss += -y*np.log(np.max([tolerance, sigma])) if y == 1 else \
                                   -(1 - y)*np.log(1 - np.min([1 - tolerance, sigma]))

                    if is_training:
                        dLdw = y - sigma

                        for word in sentence:
                            if word not in self._vocab:
                                continue
                            # Gradient step plus elastic-net penalty, applied
                            # on every occurrence of the word.
                            self._w[tag][self._vocab[word]] -= -learning_rate * dLdw \
                                + 2 * learning_rate * lmbda * gamma * self._w[tag][self._vocab[word]] \
                                + learning_rate * lmbda *(1 - gamma) * np.sign(self._w[tag][self._vocab[word]])
                        self._b[tag] -= -learning_rate * dLdw
                    else:
                        if predicted_tags is None:
                            predicted_tags = []
                        if sigma > accuracy_level:
                            predicted_tags.append(tag)

                n += 1

                self._loss.append(sample_loss)
                if predicted_tags is not None:
                    accuracy.append(len(tags.intersection(predicted_tags))/len(tags.union(predicted_tags)))

        return(np.mean(accuracy))

    def filter_vocab(self, n=10000):
        """Keep only the ``n`` most frequent words in the vocabulary and weights.

        Frequencies come from ``self._word_stats``. Word ids are not
        renumbered, so surviving weights keep their keys.
        """
        ranked = sorted(self._word_stats.items(), key=lambda t: t[1], reverse=True)
        keep_words = set(wid for (wid, wn) in ranked[:n])
        self._vocab = dict((k, v) for (k, v) in self._vocab.items() if v in keep_words)
        for tag in self._tags:
            # Rebuild as defaultdict(int), matching __init__, so that reading an
            # id without a stored weight still yields 0 instead of a KeyError.
            self._w[tag] = defaultdict(
                int,
                ((k, v) for (k, v) in self._w[tag].items() if k in keep_words))

In [None]:
# First pass: train with a growing vocabulary (update_vocab=True), print the
# returned score with two decimals and plot a 10,000-sample rolling mean of the
# recorded loss with the last 25,000 entries dropped.
model = LogRegressor()
acc = model.iterate_file(update_vocab=True)
print('%0.2f' % acc)
plt.plot(pd.Series(model._loss[:-25000]).rolling(10000).mean());

In [None]:
# Keep only the 10,000 most frequent words (by the counts gathered in the first pass).
model.filter_vocab(n=10000)

In [None]:
# Second pass over the same file with the pruned vocabulary frozen
# (update_vocab=False) and a smaller learning rate; iterate_file resets
# model._loss, so the plot shows this pass only.
acc = model.iterate_file(update_vocab=False, learning_rate=0.01)
print('%0.2f' % acc)
plt.plot(pd.Series(model._loss[:-25000]).rolling(10000).mean());

In [None]:
9: 0.68

In [None]:
10: ios, php