In [1]:
# This is the lab to test the classification strategy in an efficient way
from cgnenhancer import extract_urls, Article, Context, read_contexts_from_txt

# Imports relative to learning
import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline


In [2]:
# Construct the dataset (there are unnecessary steps)
# EVALUATE THIS CELL ONLY ONCE! (very slow)
contexts_filename = "./contexts.txt"
url_to_tag, tag_to_url = extract_urls("./new_urls.html")
contexts_to_url, url_to_contexts = read_contexts_from_txt(contexts_filename)
contexts = list(contexts_to_url.keys())
contexts_mapper = {i + 1: contexts[i] for i in range(len(contexts))}
name = contexts_mapper[1]
current_context = Context(name, contexts_to_url[name], url_to_tag)
# All the articles in the following list are valid (Article() returns None if url can't be reached)
articles = [Article(art_url) for art_url in current_context.articles]  

Unable to open url https://www.yanisvaroufakis.eu/2018/05/28/president-mattarella-of-italy-from-moral-drift-to-tactical-blunder/
Error:  Forbidden
Unable to open url https://jamesclear.com/articles
Error:  Forbidden
Unable to open url https://centre.santafe.edu/thermocomp/Santa_Fe_Institute_Collaboration_Platform:Thermodynamics_of_Computation_Wiki
Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)
Unable to open url https://medium.com/@74ff82e532bf/4abfe5a30ba
Error:  Forbidden
Unable to open url https://www.lamiavitainvaligia.org/
Error:  Forbidden
Unable to open url https://www.mooc-list.com/course/understanding-clinical-research-behind-statistics-coursera
Error:  Forbidden
Unable to open url https://www.markdowntutorial.com/lesson/1/
Error:  Forbidden
Unable to open url https://www.r-bloggers.com/arma-models-for-trading/
Error:  Forbidden
Unable to open url https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
Error:  Forbi

In [5]:
tags = []
texts = []
for article in articles:
    for tag in url_to_tag[article.url]:
        if tag not in tags:
            tags.append(tag)
    texts.append(article.content)

# Generate the label vectors
corpus_label_vectors = numpy.array([[1 if t in url_to_tag[article.url] else 0 for t in tags] for article in articles])
print(tags)
print(len(tags))

['cgnenhancer', 'machine learning', 'pdf', 'news', 'alt-right', 'capitalismo digitale', 'egemonia', 'ironia', 'même', 'postmodern', 'cyber security', 'facebook', 'privacy', 'signal', 'whatsapp', 'marxism', 'politica', 'usa', 'musica', 'trap', 'vaporwave', 'fascismo', 'liberismo', 'pasolini', 'esaltati', 'python', 'software engineering', 'documentation', 'data mining', 'neuroscienze', 'visualization', 'aggregator', 'podcast', 'governo conte', 'italia', 'lega', 'debunking', 'bulgaria', 'china', 'surveillance', 'analisi economica', 'politiche culturali', 'scuola', 'governo', 'tav', 'linear regression', 'statistics', 'bologna', 'gentrificazione', 'londra', 'sharing economy', 'catalogna', 'via rasella', 'film', 'ungheria', 'lecture', 'blog', 'information management', 'communication', 'blogs', 'technology', 'razzismo', 'storia', 'filosofia politica', 'patria', 'risorgimento', 'tricolore', 'diritto', 'economia', 'europa', 'letture', 'politica interna', 'gtd', 'psycology', 'imprenditori', 'eco

In [17]:
pattern = '(?u)\\b[A-Za-z]{3,}'
cv = CountVectorizer(max_df=0.95,
                     min_df=0.01,
                     stop_words='english',
                     token_pattern=pattern,
                     ngram_range=(1, 7))
# complete corpus 
cv_corpus = cv.fit_transform(texts)
tfidf = TfidfTransformer(sublinear_tf=True)
# tf idf matrix: (articles x words)
tfidf_corpus_matrix = tfidf.fit_transform(cv_corpus)

print("The complete tf-idf matrix has shape:{}".format(tfidf_corpus_matrix.shape))

# Exclude the current article from the training set
test_articles = articles[ : 50]
tsize = len(test_articles)
tfidf_train_matrix = tfidf_corpus_matrix[tsize:, :]
label_vectors = corpus_label_vectors[tsize:]

print("The training set has shape",tfidf_train_matrix.shape)

The complete tf-idf matrix has shape:(497, 13989)
The training set has shape (447, 13989)


In [None]:
# Model comparison pipeline
pipeline_steps = [
    # Count words which are between 3 & 30 characters, appear at least 4 times, appear in less than 50% of documents, and are not known English topic-neutral words.
    ('vectorize', TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=4, stop_words='english', token_pattern='[a-zA-Z]{3,30}')),

    # This should not have an effect at training sample sizes <= 10000 or so, which is fine; it's a safety check to restrict the number of features to something we can train & test on.
    ('reduce_dim', SelectKBest(chi2, k=200000))
]


def train(clf):
    print('=' * 80)
    print("Training: ")
    print(clf)

    steps = pipeline_steps + [('classify', OneVsRestClassifier(clf))]
    pipeline = Pipeline(steps=steps)

    t0 = time()
    pipeline.fit(train_texts, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

return pipeline

models = [
    # add other models here
    Perceptron(class_weight={ 0: 1, 1: 4 }, n_iter=50),
    LinearSVC(C=1e5,loss='squared_hinge', penalty='l1', dual=False, tol=1e-3)
]

multiclass_clf = OneVsRestClassifier(classifier)
multiclass_clf.fit(tfidf_train_matrix, label_vectors)
# create the matrix for the current article
cv_test = cv.transform([t.content for t in test_articles])
tfidf_test = tfidf.transform(cv_test)
#
suggested_label_vectors = multiclass_clf.predict(tfidf_test)
    

suggested_tags = [[] for i in test_articles]
print()
print(suggested_label_vectors)
for i in range(len(suggested_label_vectors)):
    print("Article:", test_articles[i].title)
    print("Tags:")
    for j in range(len(suggested_label_vectors[i])):
        if suggested_label_vectors[i][j] == 1:
            suggested_tags[i].append(tags[j])
    print("  Argmax, max:", )
    print("  Suggested:", suggested_tags[i])
    print("  True:", url_to_tag[articles[i].url])

In [14]:
# Single model visualization
classifier = LinearSVC(C=3e4,loss='squared_hinge', penalty='l1', dual=False, tol=1e-3)
multiclass_clf = OneVsRestClassifier(classifier)
multiclass_clf.fit(tfidf_train_matrix, label_vectors)
# create the matrix for the current article
cv_test = cv.transform([t.content for t in test_articles])
tfidf_test = tfidf.transform(cv_test)
#
suggested_label_vectors = multiclass_clf.predict(tfidf_test)
    

suggested_tags = [[] for i in test_articles]
print()
print(suggested_label_vectors)
for i in range(len(suggested_label_vectors)):
    print("Article:", test_articles[i].title)
    print("Tags:")
    for j in range(len(suggested_label_vectors[i])):
        if suggested_label_vectors[i][j] == 1:
            suggested_tags[i].append(tags[j])
    print("  Argmax, max:", )
    print("  Suggested:", suggested_tags[i])
    print("  True:", url_to_tag[articles[i].url])

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))



[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Article: research_interest_tagger/README.md at master · academia-edu/research_interest_tagger · GitHub
Tags:
  Argmax, max:
  Suggested: ['hack', 'habits']
  True: ['cgnenhancer', 'machine learning', 'pdf']
Article: KNN with TF-IDF based Framework for Text Categorization - ScienceDirect
Tags:
  Argmax, max:
  Suggested: ['statistics', 'blog', 'thermodynamics', 'hack']
  True: ['cgnenhancer', 'machine learning']
Article: machine learning - Unsupervised automatic tagging algorithms? - Stack Overflow
Tags:
  Argmax, max:
  Suggested: ['computer science', 'phd']
  True: ['cgnenhancer', 'machine learning', 'news']
Article: Nella tempesta di merda. Bifo legge Kill All Normies di Angela Nagle e la cultura dei meme di destra
Tags:
  Argmax, max:
  Suggested: ['filosofia politica', 'intersezionalismo', 'gender study']
  True: ['alt-right', 'capitalismo digitale', 'egemonia', 

In [1]:
# Different pipeline using gensim
from gensim import corpora, models, similarities

from cgnenhancer import extract_urls, Article, Context, read_contexts_from_txt

import numpy

In [7]:
# EVALUATE THIS CELL ONLY ONCE! (very slow)
contexts_filename = "./contexts.txt"
url_to_tag, tag_to_url = extract_urls("./new_urls.html")
# Only valid (readable) articles included
articles = [Article(art_url) for art_url in url_to_tag.keys()]  
contexts_to_url, url_to_contexts = read_contexts_from_txt(contexts_filename)
contexts = [Context(c, articles, url_to_tag) for c in contexts_to_url.keys()]
contexts_mapper = {i + 1: contexts[i].name for i in range(len(contexts))}
name = contexts_mapper[1]
current_context = Context(name, contexts_to_url[name], url_to_tag)

Unable to open url https://www.yanisvaroufakis.eu/2018/05/28/president-mattarella-of-italy-from-moral-drift-to-tactical-blunder/
Error:  Forbidden
Unable to open url https://jamesclear.com/articles
Error:  Forbidden
Unable to open url https://centre.santafe.edu/thermocomp/Santa_Fe_Institute_Collaboration_Platform:Thermodynamics_of_Computation_Wiki
Error:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)
Unable to open url https://medium.com/@74ff82e532bf/4abfe5a30ba
Error:  Forbidden
Unable to open url https://www.lamiavitainvaligia.org/
Error:  Forbidden
Unable to open url https://www.mooc-list.com/course/understanding-clinical-research-behind-statistics-coursera
Error:  Forbidden
Unable to open url https://www.markdowntutorial.com/lesson/1/
Error:  Forbidden
Unable to open url https://www.r-bloggers.com/arma-models-for-trading/
Error:  Forbidden
Unable to open url https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/
Error:  Forbi

KeyError: <cgnenhancer.cgnenhancer.Article object at 0x7f50312f0400>