In [132]:
import numpy as np
import pandas as pd
import re
from oracles import BinaryLogistic
from optimization import GDClassifier, SGDClassifier
train = pd.read_csv('toxic_train.csv')
test = pd.read_csv('toxic_test.csv')

### Эксперимент №1

**Условие**: произведите предварительную обработку текста. Приведите все тексты к нижнему регистру. Замените в тексте все символы, не являющиеся буквами и цифрами, на пробелы.

In [2]:
train['comment_text'] = train['comment_text'].apply(lambda x: re.sub('[^a-z1-9]', ' ', str.lower(x)))

In [17]:
train

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,explanation why the edits made under my userna...,False
1,1,d aww he matches this background colour i m s...,False
2,2,hey man i m really not trying to edit war it...,False
3,3,more i can t make any real suggestions on im...,False
4,4,you sir are my hero any chance you remember...,False
...,...,...,...
52056,159494,our previous conversation you fucking sh...,True
52057,159514,you are a mischievious pubic hair,True
52058,159541,your absurd edits your absurd edits on great...,True
52059,159546,hey listen don t you ever delete my edi...,True


### Эксперимент №2

**Условие**: преобразуйте выборку в разреженную матрицу scipy.sparse.csr_matrix, где значение x в позиции (i,j) означает, что в документе i слово j встретилось x раз. Разрешается воспользоваться конструктором sklearn.feature_extraction.text.CountVectorizer.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.001)
X = vectorizer.fit_transform(s)#.toarray()
X

<52061x3686 sparse matrix of type '<class 'numpy.int64'>'
	with 1787785 stored elements in Compressed Sparse Row format>

In [4]:
X.shape

(52061, 3686)

In [7]:
np.unique(np.array(train['is_toxic']), return_counts=True)

(array([False,  True]), array([35836, 16225]))

### Эксперимент №3

**Условие**: Исследуйте поведение градиентного спуска для задачи логистической регрессии в зависимости от следующих параметров:
* параметр размера шага step_alpha
* параметр размера шага step_beta
* начального приближения

## зависимость от alpha

In [5]:
from sklearn.model_selection import train_test_split
y = -1 * np.ones(X.shape[0])
y[train['is_toxic']] = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for a in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train,  trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(2, 11,2):
    plt.plot(np.cumsum(times[i]), funcs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('loss')
plt.savefig('gr.pdf')

<Figure size 720x432 with 1 Axes>

In [29]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for a in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train,  trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(2, 11,2):
    plt.plot(np.arange(len(funcs[i])), funcs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('iteration number')
plt.ylabel('loss')
plt.savefig('gr2.pdf')

<Figure size 720x432 with 1 Axes>

In [32]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for a in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append(story['time'])
    accs.append(story['acc'])

# рисуем график
for i in range(2, 11,2):
    plt.plot(np.cumsum(times[i]), accs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr3.pdf')

<Figure size 720x432 with 1 Axes>

In [34]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for a in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train, X_test, y_test,  trace=True)
    times.append((story['time']))
    accs.append(story['acc'])

# рисуем график
for i in range(2, 11,2):
    plt.plot(np.arange(len(accs[i])), accs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr4.pdf')

<Figure size 720x432 with 1 Axes>

## зависимость от beta

In [46]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for b in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=b * 0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train,  trace=True)
    times.append(story['time'])
    funcs.append(story['func'])

# рисуем график
for i in range(0, 11, 2):
    plt.plot(np.cumsum(times[i]), funcs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('gr5.pdf')

<Figure size 720x432 with 1 Axes>

In [52]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=0,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, trace=True)
times.append(story['time'])
funcs.append(story['func'])

# рисуем график
plt.plot(np.cumsum(times[0]), funcs[0], 'b')

plt.grid(True)
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('loss.pdf')

<Figure size 720x432 with 1 Axes>

In [45]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for b in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train,  trace=True)
    times.append(story['time'])
    funcs.append(story['func'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(np.arange(len(funcs[i])), funcs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('gr6.pdf')

<Figure size 720x432 with 1 Axes>

In [42]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for b in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append(story['time'])
    accs.append(story['acc'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(np.cumsum(times[i]), accs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr7.pdf')

<Figure size 720x432 with 1 Axes>

In [None]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for b in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = clf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append(story['time'])
    accs.append(story['acc'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(np.arange(len(accs[i])), accs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr8.pdf')

## зависимость от начального приближения

In [280]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, trace=True)
times.append(story['time'])
funcs.append(story['func'])

# рисуем график
plt.plot(np.arange(len(funcs[0])), funcs[0], 'g')

plt.grid(True)
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('w_0.pdf')

<Figure size 720x432 with 1 Axes>

In [287]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, w_0 = w1, trace=True)
times.append(story['time'])
funcs.append(story['func'])

# рисуем график
plt.plot(np.arange(len(funcs[0])), funcs[0], 'g')

plt.grid(True)
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('w_0_norm.pdf')

<Figure size 720x432 with 1 Axes>

In [283]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, w_0 = w2, trace=True)
times.append(story['time'])
funcs.append(story['func'])

# рисуем график
plt.plot(np.arange(len(funcs[0])), funcs[0], 'g')

plt.grid(True)
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('w_0_r.pdf')

<Figure size 720x432 with 1 Axes>

In [260]:
w1 = np.random.normal(loc=0, scale=0.1, size=X_train.shape[1])
w2 = np.random.uniform(-0.1, 0.1, X_train.shape[1])

In [279]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, X_test, y_test, trace=True)
times.append(story['time'])
accs.append(story['acc'])

clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train, X_test, y_test, w_0 = w1,trace=True)
times.append(story['time'])
accs.append(story['acc'])

clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = clf.fit(X_train, y_train,X_test, y_test, w_0 = w2,trace=True)
times.append(story['time'])
accs.append(story['acc'])

plt.plot(np.arange(len(times[0])), accs[0], label='нулевой вектор')
plt.legend(fontsize=10)

plt.plot(np.arange(len(times[1])), accs[1], label='нормальное распределение [0,0.1]')
plt.legend(fontsize=10)

plt.plot(np.arange(len(times[2])), accs[2], label='равномерное распределение [-1,1]')
plt.legend(fontsize=10)
plt.grid(True)
plt.legend()
plt.xlabel('iteration number', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('w_ai.pdf')

<Figure size 720x432 with 1 Axes>

In [159]:
import seaborn as sns
acc = []
for a in range(11):
    clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=round(0.2,1),
            tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    acc.append(clf.fit(X_train, y_train, X_test, y_test, trace=True)['acc'][-1])
DFrame = pd.DataFrame({'acc': acc,
                       'alpha': [a for a in range(11)],
                       'beta': [0 for i in range(11)]
                      })

for b in range(1, 11):
    acc = []
    for a in range(11):
        clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=a, step_beta=round(b * 0.2,1),
            tolerance=1e-8, max_iter=50, l2_coef = 0.1)
        acc.append(clf.fit(X_train, y_train, X_test, y_test, trace=True)['acc'][-1])
    day_frame =  pd.DataFrame({'acc': acc,
                               'alpha': [a for a in range(11)],
                               'beta': [round(b * 0.2, 1) for i in range(11)]
                              })
    DFrame = DFrame.append(day_frame)
DFrame

Unnamed: 0,acc,alpha,beta
0,0.315065,0,0.0
1,0.828094,1,0.0
2,0.708560,2,0.0
3,0.729752,3,0.0
4,0.780716,4,0.0
...,...,...,...
6,0.765286,6,2.0
7,0.766630,7,2.0
8,0.769383,8,2.0
9,0.771496,9,2.0


In [160]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

DFrame = DFrame.pivot_table(index="alpha", columns ="beta", values="acc")
plt.figure(figsize=(10, 9))
plt.title('Зависимость точности от параметров step_alpha и step_beta', fontsize=14)
sns.heatmap(DFrame, annot=True, fmt='.3g');

plt.savefig('corr.pdf')

<Figure size 720x648 with 2 Axes>

### Эксперимент №4

**Условие**: исследование поведения стохастического градиентного спуска

## зависимость от размера батча

In [367]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = [0]
funcs = [0]
for k in range(1,11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=k* 100, step_alpha=10, step_beta=1,
        tolerance=1e-8, max_iter=1000, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(1,11,2):
    plt.plot(np.cumsum(times[i]),funcs[i], label='batch_size='+str(i * 100))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('loss')
plt.savefig('grr2.pdf')

<Figure size 720x432 with 1 Axes>

In [366]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = [0]
funcs = [0]
for k in range(1,11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=k* 1000, step_alpha=10, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(1,11,2):
    plt.plot(np.cumsum(times[i]), funcs[i], label='batch_size='+str(i * 1000))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('loss')
plt.show()

<Figure size 720x432 with 1 Axes>

In [17]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = [0]
funcs = [0]
for k in range(1,11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=k* 1000, step_alpha=10, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, trace=True)
    epochi.append(story['epoch_num'])
    funcs.append(story['func'])

# рисуем график
for i in range(1,11,2):
    plt.plot(epochi[i], funcs[i], label='batch_size='+str(i * 1000))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('loss')
plt.savefig('grr3.pdf')

<Figure size 720x432 with 1 Axes>

In [24]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = [0]
accs = [0]
for k in range(1,11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=k* 1000, step_alpha=10, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append(story['time'])
    accs.append(story['acc'])

# рисуем график
for i in range(1,11,2):
    plt.plot(np.cumsum(times[i]), accs[i], label='batch_size='+str(i * 1000))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('accuracy')
plt.savefig('grr4.pdf')

<Figure size 720x432 with 1 Axes>

In [23]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = [0]
accs = [0]
for k in range(1,11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=k* 1000, step_alpha=10, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    epochi.append(story['epoch_num'])
    accs.append(story['acc'])

# рисуем график
for i in range(1,11,2):
    plt.plot(epochi[i], accs[i], label='batch_size='+str(i * 1000))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('accuracy')
plt.savefig('grr5.pdf')

<Figure size 720x432 with 1 Axes>

## зависимость от alpha

In [37]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for a in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(2, 11, 2):
    plt.plot(np.cumsum(times[i]), funcs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('loss')
plt.savefig('grr6.pdf')

<Figure size 720x432 with 1 Axes>

In [36]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
funcs = []
for a in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    epochi.append((story['epoch_num']))
    funcs.append(story['func'])

# рисуем график
for i in range(2, 11, 2):
    plt.plot(epochi[i], funcs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('loss')
plt.savefig('grr7.pdf')

<Figure size 720x432 with 1 Axes>

In [35]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
accs = []
for a in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    epochi.append((story['epoch_num']))
    accs.append(story['acc'])

# рисуем график
for i in range(2, 11, 2):
    plt.plot(epochi[i], accs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('accuracy')
plt.savefig('grr8.pdf')

<Figure size 720x432 with 1 Axes>

In [34]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for a in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=1,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append((story['time']))
    accs.append(story['acc'])

# рисуем график
for i in range(2, 11, 2):
    plt.plot(np.cumsum(times[i]), accs[i], label='step_alpha='+str(i))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('times')
plt.ylabel('accuracy')
plt.savefig('grr9.pdf')

<Figure size 720x432 with 1 Axes>

## зависимость от beta

In [51]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
for b in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append((story['time']))
    funcs.append(story['func'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(np.cumsum(times[i]), funcs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('loss')
plt.savefig('grr10.pdf')

<Figure size 720x432 with 1 Axes>

In [49]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
funcs = []
for b in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    epochi.append((story['epoch_num']))
    funcs.append(story['func'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(epochi[i], funcs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('loss')
plt.savefig('grr11.pdf')

<Figure size 720x432 with 1 Axes>

In [50]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
for b in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    times.append((story['time']))
    accs.append(story['acc'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(np.cumsum(times[i]), accs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)')
plt.ylabel('accuracy')
plt.savefig('grr12.pdf')

<Figure size 720x432 with 1 Axes>

In [52]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
accs = []
for b in range(11):
    sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=a, step_beta=b*0.2,
        tolerance=1e-8, max_iter=100, l2_coef = 0.1)
    story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
    epochi.append((story['epoch_num']))
    accs.append(story['acc'])

# рисуем график
for i in range(1, 11, 2):
    plt.plot(epochi[i], accs[i], label='step_beta='+str(round(i * 0.2,1)))
    plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num')
plt.ylabel('accuracy')
plt.savefig('grr13.pdf')

<Figure size 720x432 with 1 Axes>

## зависимость от начального приближения

In [86]:
w1 = np.random.normal(loc=0, scale=0.1, size=X_train.shape[1])
w2 = np.random.uniform(-0.1, 0.1, X_train.shape[1])

In [113]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
times.append(story['time'])
funcs.append(story['func'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w1, trace=True)
times.append(story['time'])
funcs.append(story['func'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w2, trace=True)
times.append(story['time'])
funcs.append(story['func'])

plt.plot(np.cumsum(times[2]), funcs[2], label='нулевой вектор')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[0]), funcs[0], label='нормальное распределение [0,0.1]')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[1]), funcs[1], label='равномерное распределение [-1,1]')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('gr14.pdf')

<Figure size 720x432 with 1 Axes>

In [112]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
funcs = []
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
epochi.append(story['epoch_num'])
funcs.append(story['func'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w1, trace=True)
epochi.append(story['epoch_num'])
funcs.append(story['func'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w2, trace=True)
epochi.append(story['epoch_num'])
funcs.append(story['func'])

plt.plot(epochi[2], funcs[2], label='нулевой вектор')
plt.legend(fontsize=10)

plt.plot(epochi[0], funcs[0], label='нормальное распределение [0,0.1]')
plt.legend(fontsize=10)

plt.plot(epochi[1], funcs[1], label='равномерное распределение [-1,1]')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('gr15.pdf')

<Figure size 720x432 with 1 Axes>

In [111]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
times.append(story['time'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w1, trace=True)
times.append(story['time'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w2, trace=True)
times.append(story['time'])
accs.append(story['acc'])

plt.plot(np.cumsum(times[2]), accs[2], label='нулевой вектор')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[0]), accs[0], label='нормальное распределение [0,0.1]')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[1]), accs[1], label='равномерное распределение [-1,1]')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr16.pdf')

<Figure size 720x432 with 1 Axes>

In [110]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
epochi = []
accs = []
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
epochi.append(story['epoch_num'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w1, trace=True)
epochi.append(story['epoch_num'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w2, trace=True)
epochi.append(story['epoch_num'])
accs.append(story['acc'])

plt.plot(epochi[2], accs[2], label='нулевой вектор')
plt.legend(fontsize=10)

plt.plot(epochi[0], accs[0], label='нормальное распределение [0,0.1]')
plt.legend(fontsize=10)

plt.plot(epochi[1], accs[1], label='равномерное распределение [-1,1]')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('gr17.pdf')

<Figure size 720x432 with 1 Axes>

### Эксперимент №5
СГ против СГС

In [141]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []

clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=200, l2_coef = 0.1)
story = clf.fit(X_train, y_train, X_test, y_test,trace=True)
times.append(story['time'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=1000, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
times.append(story['time'])
accs.append(story['acc'])

plt.plot(np.cumsum(times[0]), accs[0], label='GD')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[1]), accs[1], label='SGD')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('vs1.pdf')

<Figure size 720x432 with 1 Axes>

In [155]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
accs = []

clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=200, l2_coef = 0.1)
story = clf.fit(X_train, y_train, X_test, y_test, w_0 = w1,trace=True)
times.append(story['time'])
accs.append(story['acc'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=9000, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, w1, trace=True)
times.append(story['time'])
accs.append(story['acc'])

plt.plot(np.arange(len(accs[0])), accs[0], label='GD')
plt.legend(fontsize=10)

plt.plot(np.arange(len(accs[1])), accs[1], label='SGD')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('epoch_num', fontsize=10)
plt.ylabel('accuracy', fontsize=10)
plt.savefig('vs2.pdf')

<Figure size 720x432 with 1 Axes>

In [143]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'

plt.figure(figsize=(10, 6))
times = []
funcs = []

clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=200, l2_coef = 0.1)
story = clf.fit(X_train, y_train, X_test, y_test,trace=True)
times.append(story['time'])
funcs.append(story['func'])

sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=1000, l2_coef = 0.1)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
times.append(story['time'])
funcs.append(story['func'])

plt.plot(np.cumsum(times[0]), funcs[0], label='GD')
plt.legend(fontsize=10)

plt.plot(np.cumsum(times[1]), funcs[1], label='SGD')
plt.legend(fontsize=10)

plt.grid(True)
plt.legend()
plt.xlabel('time(sec)', fontsize=10)
plt.ylabel('loss', fontsize=10)
plt.savefig('vs3.pdf')

<Figure size 720x432 with 1 Axes>

### Эксперимент №6 Лемматизация

In [169]:
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ivandemyanov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ivandemyanov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [56]:
lemmatizer = WordNetLemmatizer()

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.001)
X = vectorizer.fit_transform(s)#.toarray()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X

<52061x3686 sparse matrix of type '<class 'numpy.int64'>'
	with 1787785 stored elements in Compressed Sparse Row format>

In [75]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords


lemmatizer = WordNetLemmatizer()
lem = []
stop = set(stopwords.words('english'))
for doc in train['comment_text']:
    word_list = nltk.word_tokenize(doc)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list if w not in stop])
    lem.append(lemmatized_output)

In [178]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0.001)
X_lem = vectorizer.fit_transform(lem)#.toarray()
X_train_lem, X_test_lem, y_train_l, y_test_l = train_test_split(X_lem, y, test_size=0.3, random_state=42)
X_lem

<52061x3350 sparse matrix of type '<class 'numpy.int64'>'
	with 1039290 stored elements in Compressed Sparse Row format>

In [232]:
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train_lem, y_train_l, X_test_lem, y_test_l, trace=True)
story['acc'][-1]

0.896600294513093

In [229]:
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
story['acc'][-1]

0.879185607273193

In [227]:
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.001)
story = clf.fit(X_train, y_train, X_test, y_test,trace=True)
story['acc'][-1]

0.8330238811703694

In [228]:
clf = GDClassifier(loss_function = 'binary_logistic', step_alpha=10, step_beta=1,
    tolerance=1e-8, max_iter=100, l2_coef = 0.001)
story = clf.fit(X_train_lem, y_train_l, X_test_lem, y_test_l,trace=True)
story['acc'][-1]

0.8587617645175748

## эксперимент 7

In [283]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.001)
X = vectorizer.fit_transform(s)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=10, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
story['acc'][-1]

0.8135604071963634

In [240]:
X.shape

(52061, 3686)

In [282]:
from sklearn.feature_extraction.text import TfidfVectorizer
s = list(train['comment_text'])
vectorizer = TfidfVectorizer(min_df=0.001)
X_tf = vectorizer.fit_transform(s)
X_train, X_test, y_train, y_test = train_test_split(X_tf, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=10, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
story['acc'][-1]

0.8858441641590371

In [247]:
X_tf.shape

(52061, 3686)

### исследование min_df и max_df

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 88319)
0.887188680453294


In [295]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.001)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 3686)
0.879185607273193


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.001)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 3686)
0.879185607273193


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(max_df=0.001)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 84633)
0.7143863243485499


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(max_df=0.5)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 88316)
0.9008899417376273


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(max_df=0.5, min_df=0.001)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 3683)
0.8953198028042768


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(min_df=0.1)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 55)
0.498495422242141


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer(max_df=0.1)
X = vectorizer.fit_transform(s)
print(X.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
story = sclf.fit(X_train, y_train, X_test, y_test, trace=True)
print(story['acc'][-1])

(52061, 88264)
0.8967283436839747


## эксперимент 8

In [130]:
test

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,thank you for understanding i think very high...,False
1,1,dear god this site is horrible,False
2,2,somebody will invariably try to add relig...,False
3,3,it says it right there that it is a type ...,False
4,4,before adding a new product to the lis...,False
...,...,...,...
20671,63877,nigel is a crazy idiot,True
20672,63935,well now don t i feel stupid,True
20673,63945,fourth baldrick possibly being cleverer than...,True
20674,63962,iran that s right iran it was o...,True


In [25]:
test['comment_text'] = test['comment_text'].apply(lambda x: re.sub('[^a-z1-9]', ' ', str.lower(x)))

In [26]:
test

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,thank you for understanding i think very high...,False
1,1,dear god this site is horrible,False
2,2,somebody will invariably try to add relig...,False
3,3,it says it right there that it is a type ...,False
4,4,before adding a new product to the lis...,False
...,...,...,...
20671,63877,nigel is a crazy idiot,True
20672,63935,well now don t i feel stupid,True
20673,63945,fourth baldrick possibly being cleverer than...,True
20674,63962,iran that s right iran it was o...,True


In [159]:
from sklearn.feature_extraction.text import CountVectorizer
s = list(train['comment_text'])
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(s)#.toarray()

In [164]:
y = -1 * np.ones(X.shape[0])
y[train['is_toxic']] = 1

In [165]:
y_test = -1 * np.ones(X_test.shape[0])
y_test[test['is_toxic']] = 1

In [148]:
iter(['lol', 'al'])

<list_iterator at 0x121626880>

In [167]:
X_test = vectorizer.transform(iter(test['comment_text']))

In [168]:
X_test.shape

(20676, 91839)

In [127]:
y_test.shape

(52061,)

In [162]:
X.shape

(52061, 91839)

In [181]:
sclf = SGDClassifier(loss_function = 'binary_logistic', batch_size=5000, step_alpha=2, step_beta=0,
    tolerance=1e-8, max_iter=1000, l2_coef=0.0001)
sclf.fit(X, y, X_test, y_test, trace=True)
y_pred = sclf.predict(X_test)

In [189]:
sum(y_pred == y_test)/y_test.shape[0]

0.8525343393306248

In [177]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[11871,  2562],
       [  487,  5756]])

In [71]:
np.array(np.where((y_pred==1) & (y_test==-1)))

array([[  111,   117,   146,   161,   174,   193,   226,   285,   300,
          327,   372,   386,   390,   435,   438,   469,   518,   532,
          549,   580,   582,   584,   627,   690,   698,   731,   756,
          781,   799,   853,   882,   908,   909,   998,  1053,  1060,
         1110,  1112,  1159,  1167,  1200,  1257,  1275,  1299,  1307,
         1329,  1372,  1484,  1550,  1553,  1588,  1608,  1613,  1641,
         1729,  1738,  1754,  1772,  1794,  1798,  1818,  1891,  1910,
         1933,  1962,  1988,  2013,  2026,  2073,  2097,  2098,  2128,
         2166,  2167,  2211,  2245,  2254,  2270,  2272,  2278,  2288,
         2328,  2331,  2346,  2351,  2359,  2370,  2416,  2418,  2434,
         2441,  2445,  2460,  2465,  2466,  2491,  2498,  2536,  2552,
         2557,  2571,  2575,  2606,  2624,  2636,  2701,  2735,  2736,
         2737,  2763,  2780,  2802,  2832,  2847,  2865,  2880,  2924,
         2947,  2953,  2982,  2991,  3079,  3105,  3212,  3214,  3221,
      

In [133]:
test['comment_text'][11156]

'" \n\n ::seems to be more than the usual amount of idiocy around - LOL!  \n ::a) At various times, I\'ve made similar observations. i.e. You are by NO means ""an orphan"" in this respect. \n ::b) Actually, I suspect ""the usual amount of idiocy"" is fairly constant. I suspect the variable is your and my perception and/or level-of-tolerance of it ... \n ::Andreas is a difficult person to agree with - Andreas is a difficult person. \n ::I\'ve been a little worried about my comments in various places being taken offence to. - In my biased opinion, I see that as a good thing; I\'d be more worried if (like some) you always thought you were ""right"". \n ::Cheers,   "'

In [76]:
test

Unnamed: 0.1,Unnamed: 0,comment_text,is_toxic
0,0,thank you for understanding i think very high...,False
1,1,dear god this site is horrible,False
2,2,somebody will invariably try to add relig...,False
3,3,it says it right there that it is a type ...,False
4,4,before adding a new product to the lis...,False
...,...,...,...
20671,63877,nigel is a crazy idiot,True
20672,63935,well now don t i feel stupid,True
20673,63945,fourth baldrick possibly being cleverer than...,True
20674,63962,iran that s right iran it was o...,True


In [178]:
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'pdf'
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.winter):

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, fontsize=30)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=20)
    plt.yticks(tick_marks, classes, fontsize=20)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt), horizontalalignment="center", 
                 color="white" if cm[i, j] < thresh else "black", fontsize=20)

    plt.tight_layout()
    plt.xlabel('Predicted label', fontsize=20)

    return plt
cm = confusion_matrix(y_test, y_pred)
fig = plt.figure(figsize=(10, 10))
plot = plot_confusion_matrix(cm, classes=[-1,1], normalize=False, title='Confusion matrix')
plt.savefig('conf.pdf')

<Figure size 720x720 with 2 Axes>