In [1]:
%matplotlib inline
import os
from lxml import etree
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Lars Buitinck
# License: BSD 3 clause

from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from collections import defaultdict


# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


In [2]:
df = pd.read_csv('dataframe.csv',sep='\t')
df = df.drop('Unnamed: 0', axis=1)

In [3]:
bigrams = set()
num = 1
for text in df['text']:
    text_clear = [word.lower().strip('.,!-?»«') for word in text.split()]
    text_2grams = list(ngrams(list(text_clear), 2))
    for i in text_2grams:
        bigrams.add(i)
        
bigrams = list(bigrams)

In [4]:
str_bigrams = [' '.join(bi) for bi in bigrams]

In [5]:
index = range(len(df['text']))

In [6]:
bigrams_per_sent = []
for i, text in enumerate(df['text']):
    text_clear = [word.lower().strip('.,!-?»«') for word in text.split()]  # биграммы из текста
    text_2grams = list(ngrams(list(text_clear), 2)) # биграммы из текста
    str_2grams = set([' '.join(bi) for bi in text_2grams])
    bigrams_per_sent.append(str_2grams)

In [7]:
data = df[:len(df)//5]

In [8]:
data.head()

Unnamed: 0,text,class
0,ты начинаешь злиться а я улыбаюсь я счастлива ...,1
1,но димон же зол на него поэтому стоит увиде...,1
2,вроде на улице зима а до меня это еще не дохо...,1
3,сел короче проверить мозги после нг пробный е...,0
4,место того чтоб учить я слушаю музыку класс,0


In [9]:
all_vectors = []
for i, text in enumerate(data['text']):
    vector = []
    for bi in str_bigrams:
        if bi in bigrams_per_sent[i]:
            vector.append(1)
        else:
            vector.append(0)
    all_vectors.append(vector)

print(len(all_vectors))

3904


In [10]:
X_train, X_test, y_train, y_test = train_test_split(all_vectors, data['class'], test_size=0.3)

In [None]:
X_train_df = pd.DataFrame(X_train)
X_test_df = pd.DataFrame(X_test)

In [11]:
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()


duration = time() - t0


print("Extracting features from the test data using the same vectorizer")
t0 = time()


duration = time() - t0

print("Extracting best features by a chi-squared test")
t0 = time()
ch2 = SelectKBest(chi2)
X_train__tf_ch = ch2.fit_transform(X_train, y_train)
X_test__tf_ch = ch2.transform(X_test)
print("done in %fs" % (time() - t0))

Extracting features from the training data using a sparse vectorizer
Extracting features from the test data using the same vectorizer
Extracting best features by a chi-squared test
done in 51.076381s


In [None]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    print()
    clf_descr = str(clf).split('(')[0]
    if "penalty" in vars(clf):
        return clf_descr+ ' '+ str(clf.penalty), score, train_time, test_time
    else:
        return clf_descr, score, train_time, test_time
        
        

results = []
for clf, name in (
        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        (Perceptron(n_iter=50), "Perceptron"),
        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        (KNeighborsClassifier(n_neighbors=10), "kNN"),
        (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            dual=False, tol=1e-3)))

    # Train SGD model
    results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                           penalty=penalty)))

# Train SGD with Elastic Net penalty
print('=' * 80)
print("Elastic-Net penalty")
results.append(benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                       penalty="elasticnet")))

# Train NearestCentroid without threshold
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train Logistic Regression
print('=' * 80)
print("Logistic Regression")
results.append(benchmark(LogisticRegression()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC( penalty="l1", dual=False, tol=1e-3))),
  ('classification', LinearSVC())
])))


Ridge Classifier
________________________________________________________________________________
Training: 
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='lsqr',
        tol=0.01)
train time: 94.892s
test time:  76.732s
accuracy:   0.625

Perceptron
________________________________________________________________________________
Training: 
Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
      n_iter=50, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False)
train time: 106.781s
test time:  7.862s
accuracy:   0.573

Passive-Aggressive
________________________________________________________________________________
Training: 
PassiveAggressiveClassifier(C=1.0, class_weight=None, fit_intercept=True,
              loss='hinge', n_iter=50, n_jobs=1, random_state=None,
              shuffle=True, verbose=0, warm_start=False)


In [17]:
# plot

indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)


plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
plt.barh(indices + .3, training_time, .2, label="training time",
         color='c')
plt.barh(indices + .6, test_time, .2, label="test time", color='darkorange')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=1)
plt.subplots_adjust(bottom=.05)

for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
plt.show()

NameError: name 'results' is not defined

In [21]:
trigrams = set()
num = 1
for text in df['text']:
    text_clear = [word.lower().strip('.,!-?»«') for word in text.split()]
    text_3grams = list(ngrams(list(text_clear), 3))
    for i in text_3grams:
        trigrams.add(i)
        
trigrams = list(trigrams)

In [22]:
str_trigrams = [' '.join(tri) for tri in trigrams]
index = range(len(df['text']))

In [23]:
trigrams_per_sent = []
for i, text in enumerate(df['text']):
    text_clear = [word.lower().strip('.,!-?»«') for word in text.split()]  # биграммы из текста
    text_3grams = list(ngrams(list(text_clear), 3)) # биграммы из текста
    str_3grams = set([' '.join(tri) for tri in text_3grams])
    trigrams_per_sent.append(str_3grams)