In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from numpy import save as save_array

from matplotlib import pyplot as plt

from contextlib import contextmanager
from time import time
from sys import stdout


@contextmanager
def measure(title):
    """Time the enclosed ``with`` block and report the result on stdout.

    Writes ``start <title>... `` immediately (flushed, so it is visible while
    the block is still running) and, once the block ends, appends the elapsed
    wall-clock time as ``MM:SS.mmm`` followed by a newline.

    The elapsed time is written from a ``finally`` clause, so the progress
    line is completed even when the block raises; the exception itself
    propagates unchanged.

    :param title: label inserted into the ``start ...`` message.
    """
    stdout.write('start %s... ' % title)
    stdout.flush()
    started = time()
    try:
        yield
    finally:
        elapsed = time() - started
        # %02d truncates the fractional minutes; the remainder is printed
        # as seconds with millisecond precision.
        stdout.write('%02d:%06.3f\n' % (elapsed / 60.0, elapsed % 60))

In [None]:
# Path of the input file that the vectorization cell opens and feeds to the
# TF-IDF vectorizer. The commented-out line is an alternative input path;
# swap the two assignments to switch inputs.
#in_filename = 'output/parsed_logs_final'
in_filename = 'output/parsed_logs_final_without_banned_ips'

In [None]:
# Build the TF-IDF matrix. The open file is passed directly to
# fit_transform, so every line of the input file is treated as one document.
with measure('vectorize content'):
    tfidf_options = dict(
        binary=False,
        max_features=5000,
        ngram_range=(1, 1),
        max_df=0.25,
        min_df=2,
    )
    vectorizer = TfidfVectorizer(**tfidf_options)
    with open(in_filename) as log_lines:
        raw_x = vectorizer.fit_transform(log_lines)

sample_count, feature_count = raw_x.shape
print("n_samples: %d, n_features: %d" % (sample_count, feature_count))

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

# Number of components kept by the truncated SVD.
SVD_DIMS = 20
# Fixed seed so the SVD (randomized solver by default in scikit-learn)
# yields the same components on every run of the notebook.
SVD_RANDOM_STATE = 0

# Single-argument print(...) calls behave identically on Python 2 and 3 and
# match the print(...) style already used for the shape reports.
print('')
print('## DIMS: %i ###' % SVD_DIMS)
with measure('dimension reduction'):
    # SVD followed by per-sample normalisation, bundled as one pipeline
    # (`lsa`) so the same two steps can be reused as a unit later.
    svd = TruncatedSVD(n_components=SVD_DIMS, random_state=SVD_RANDOM_STATE)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    x = lsa.fit_transform(raw_x)

explained_variance = svd.explained_variance_ratio_.sum()

print("variance of the SVD step:")
# Iterate over the ratios themselves instead of indexing by SVD_DIMS, so the
# report always matches the number of components actually fitted.
for dim, ratio in enumerate(svd.explained_variance_ratio_):
    print("dimension {}: {:0.2f}%".format(dim, ratio * 100))
print("Explained variance of the SVD step total: {:0.2f}%".format(
    explained_variance * 100
))
print("n_samples: %d, n_features: %d" % x.shape)

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Number of clusters requested from k-means.
CLUSTERS = 20
# Fixed seed: mini-batch k-means uses random initialisation and random
# batches, so without it cluster ids and memberships change between runs.
KMEANS_RANDOM_STATE = 0

with measure('clustering'):
    km = MiniBatchKMeans(n_clusters=CLUSTERS, random_state=KMEANS_RANDOM_STATE)
    km.fit(x)


In [None]:
# How many of the highest-weighted terms to list for each cluster.
TERMS_PER_CLUSTER = 10

centers = km.cluster_centers_
clusters = len(centers)
num_clusters = clusters  # kept as an alias of `clusters`
# Map the centroids back through the SVD step so that each coordinate
# corresponds to one vocabulary term again.
original_space_centroids = svd.inverse_transform(centers)
# Per centroid: indices of the TERMS_PER_CLUSTER largest weights, ordered
# from largest to smallest.
order_centroids = original_space_centroids.argsort()[:, -TERMS_PER_CLUSTER:][:, ::-1]

# get_feature_names() was removed from newer scikit-learn releases in favour
# of get_feature_names_out(); use whichever this installation provides.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
size = float(len(x))
print('total visitors: %s' % size)

for i in range(clusters):
    # Vectorised count of the samples assigned to cluster i (the builtin
    # sum() would iterate over the label array element by element).
    total_in_cluster = (km.labels_ == i).sum()
    print("Cluster {:02d} (size: {:05.2f}%): {}".format(
        i + 1, total_in_cluster * 100.0 / size,
        ', '.join(terms[ind] for ind in order_centroids[i])
    ))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Fixed seed for the train/test split and the randomised estimators, so the
# reported scores are comparable from one run to the next.
EVAL_RANDOM_STATE = 0

# Candidate classifiers, each trained to reproduce the k-means cluster
# assignment from the reduced feature vectors.
classifiers = {
    'tree': DecisionTreeClassifier(random_state=EVAL_RANDOM_STATE),
    'qda': QuadraticDiscriminantAnalysis(),
    'forest': RandomForestClassifier(
        max_depth=5, n_estimators=20, max_features=10,
        random_state=EVAL_RANDOM_STATE,
    ),
    'svc_linear': LinearSVC(dual=False),
}

labels = km.labels_
# Hold out 20% of the samples for scoring.
x_train, x_test, lab_train, lab_test = train_test_split(
    x, labels, test_size=0.2, random_state=EVAL_RANDOM_STATE
)

for name, classifier in classifiers.items():
    with measure('learning with class %s' % name):
        classifier.fit(x_train, lab_train)
    with measure('prediction with class %s' % name):
        score = classifier.score(x_test, lab_test)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print('score: %0.2f%%' % (score * 100.0))

In [None]:
import pickle

# Final model: a decision tree fitted on all samples (no hold-out) to
# predict the k-means cluster label. Seeded so the saved artefact is
# reproducible.
classifier = DecisionTreeClassifier(random_state=0)
with measure('classifing'):
    classifier.fit(x, labels)

with measure('pipeling all and saving...'):
    # Chain the already-fitted steps so the saved object goes from raw text
    # to a cluster label in a single predict() call.
    pipeline = make_pipeline(vectorizer, lsa, classifier)
    # Pickle data is bytes: the file must be opened in binary mode. Text
    # mode fails on Python 3 and can corrupt the stream on Windows.
    with open('webapp/ml.pickle', 'wb') as fp:
        pickle.dump(pipeline, fp)

    # One line per cluster, in cluster-index order, listing its top terms.
    with open('webapp/favorite-words', 'w') as fp:
        for i in range(clusters):
            words = ', '.join(terms[ind] for ind in order_centroids[i])
            fp.write(words + '\n')