In [38]:
# This is the lab to test the classification strategy in an efficient way
from cgnenhancer import extract_urls, Article, Context, read_contexts_from_txt

# Imports relative to learning
import numpy
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [39]:
# Construct the dataset (there are unnecessary steps)
# EVALUATE THIS CELL ONLY ONCE! (very slow)
contexts_filename = "./contexts.txt"
url_to_tag, tag_to_url = extract_urls("./urls.html")
contexts_dict = read_contexts_from_txt(contexts_filename)
contexts = list(contexts_dict.keys())
contexts_mapper = {i + 1: contexts[i] for i in range(len(contexts))}
name = contexts_mapper[1]
current_context = Context(name, contexts_dict[name], url_to_tag)
articles = [Article(art_url) for art_url in current_context.articles]

Unable to open url https://www.marxists.org/archive/lafargue/1883/lazy/
Error:  Forbidden
Unable to open url http://www.bawdseyradar.org.uk/wp-content/uploads/2012/12/Wilkins-Calculations.pdf
Error:  Not Found
Unable to open url https://learn.adafruit.com/digital-free-library/preparation
Error:  Forbidden
Unable to open url https://www.digitalocean.com/community/tutorials/how-to-create-an-off-site-backup-of-your-site-with-rsync-on-centos-6
Error:  Forbidden
Unable to open url https://www.digitalocean.com/community/tutorials/how-to-use-rsync-to-sync-local-and-remote-directories-on-a-vps
Error:  Forbidden
Unable to open url https://twitter.com/nonquella/status/818298067854196736?s=09
Error:  Not Found
Unable to open url http://www.kossboss.com/linux---move-running-to-process-nohup
Error:  Not Found
Unable to open url https://t.co/vksT7fpkY0
Error:  Not Found
Unable to open url https://codingsec.net/2016/05/brute-forcing-directories-file-locations/
Error:  Forbidden
Unable to open url htt

In [79]:
tags = []
texts = []
for article in articles:
    for tag in url_to_tag[article.url]:
        if tag not in tags:
            tags.append(tag)
    texts.append(article.content)

# Generate one feature vector for each text
corpus_label_vectors = numpy.array([[1 if t in url_to_tag[article.url] else 0 for t in tags] for article in articles])
print(tags)
print(len(tags))

['', 'cyber security', 'system administration', 'film', 'fisica', 'politica interna', 'technology', 'musica', 'libri', 'giugno2017', 'filosofia politica', 'isica', 'raspberry', 'machine learning', 'arduino', 'latex', 'università', 'gender study', 'neuroscienze', 'canzoni', 'storia', 'monte paschi', 'python', 'fdo nuovo', 'tav', 'serie tv', 'medio oriente', 'terremoto centro italia', 'bologna', 'rss', 'ex-soviet union']
31


In [100]:
pattern = '(?u)\\b[A-Za-z]{3,}'
cv = CountVectorizer(max_df=0.95,
                                 min_df=0.01,
                                 stop_words='english',
                                 token_pattern=pattern,
                                 ngram_range=(1, 3))
# complete corpus 
cv_corpus = cv.fit_transform(texts)
tfidf = TfidfTransformer(sublinear_tf=True)
# tf idf matrix: (articles x words)
tfidf_corpus_matrix = tfidf.fit_transform(cv_corpus)

print("The complete tf-idf matrix has shape:{}".format(tfidf_corpus_matrix.shape))

# Exclude the current article from the training set
test_articles = articles[ : 15]
for t in test_articles:
    print("[+] ", t.title)
    print("   Loaded correctly: ", t.loaded)
# 
tsize = len(test_articles)
tfidf_train_matrix = tfidf_corpus_matrix[tsize:, :]
label_vectors = corpus_label_vectors[tsize:]
print("The training set has shape",tfidf_train_matrix.shape)

# lists for learning
# NOTE: learning is performed assigning a binary vector (a category)
# to each text. A 1 in the i-th position means that the article was
# tagged using the i-th tag
    
# very basic linear search for the best parameter
for C_i in numpy.arange(1, 10000,100):
    classifier = LinearSVC(C=C_i,loss='squared_hinge', penalty='l2', dual=False, tol=1e-3)
    multiclass_clf = OneVsRestClassifier(classifier)
    multiclass_clf.fit(tfidf_train_matrix, label_vectors)
    # create the matrix for the current article
    cv_test = cv.transform([t.content for t in test_articles])
    tfidf_test = tfidf.transform(cv_test)
    #
    score = multiclass_clf.score(tfidf_test, corpus_label_vectors[:tsize])
    print("C_i:{:f} Score:{:f}".format(C_i, score))


The complete tf-idf matrix has shape:(121, 9644)
[+]  BAHFest West 2014 - Matt Inman: Jibbers Crabst - YouTube
   Loaded correctly:  True
[+]  Sarah Jamie Lewis su Twitter: "Honestly, the entire dark web intelligence industry and their relationships with LE frequently appear to cross ethical and legal lines.… https://t.co/k525LnxUd9"
   Loaded correctly:  True
[+]  4 easy Linux projects for newbies and intermediate users | PCWorld
   Loaded correctly:  True
[+]  Il lato positivo (2012) | FilmTV.it
   Loaded correctly:  True
[+]  PhD in theoretical physics
   Loaded correctly:  True
[+]  Statistical Physics and Condensed Matter
   Loaded correctly:  True
[+]  Wind and Physics
   Loaded correctly:  True
[+]  variational principle - Why are there only derivatives to the first order in the Lagrangian? - Physics Stack Exchange
   Loaded correctly:  True
[+]  The F-35 Is a $1.4-Trillion National Disaster | War Is Boring
   Loaded correctly:  True
[+]  Every Noise at Once - industrial
   Load

  str(classes[c]))


C_i:1.000000 Score:0.000000


  str(classes[c]))


C_i:101.000000 Score:0.000000


  str(classes[c]))


C_i:201.000000 Score:0.000000


  str(classes[c]))


C_i:301.000000 Score:0.000000


  str(classes[c]))


C_i:401.000000 Score:0.000000


  str(classes[c]))


C_i:501.000000 Score:0.000000


  str(classes[c]))


C_i:601.000000 Score:0.000000


  str(classes[c]))


C_i:701.000000 Score:0.000000


  str(classes[c]))


C_i:801.000000 Score:0.000000


  str(classes[c]))


C_i:901.000000 Score:0.000000


  str(classes[c]))


C_i:1001.000000 Score:0.000000


  str(classes[c]))


C_i:1101.000000 Score:0.000000


  str(classes[c]))


C_i:1201.000000 Score:0.000000


  str(classes[c]))


C_i:1301.000000 Score:0.000000


  str(classes[c]))


C_i:1401.000000 Score:0.000000


  str(classes[c]))


C_i:1501.000000 Score:0.000000


  str(classes[c]))


C_i:1601.000000 Score:0.000000


  str(classes[c]))


C_i:1701.000000 Score:0.000000


  str(classes[c]))


C_i:1801.000000 Score:0.000000


  str(classes[c]))


C_i:1901.000000 Score:0.000000


  str(classes[c]))


C_i:2001.000000 Score:0.000000


  str(classes[c]))


C_i:2101.000000 Score:0.000000


  str(classes[c]))


C_i:2201.000000 Score:0.000000


  str(classes[c]))


C_i:2301.000000 Score:0.000000


  str(classes[c]))


C_i:2401.000000 Score:0.000000


  str(classes[c]))


C_i:2501.000000 Score:0.000000


  str(classes[c]))


C_i:2601.000000 Score:0.000000


  str(classes[c]))


C_i:2701.000000 Score:0.000000


  str(classes[c]))


C_i:2801.000000 Score:0.000000


  str(classes[c]))


C_i:2901.000000 Score:0.000000


  str(classes[c]))


C_i:3001.000000 Score:0.000000


  str(classes[c]))


C_i:3101.000000 Score:0.000000


  str(classes[c]))


C_i:3201.000000 Score:0.000000


  str(classes[c]))


C_i:3301.000000 Score:0.000000


  str(classes[c]))


C_i:3401.000000 Score:0.000000


  str(classes[c]))


C_i:3501.000000 Score:0.000000


  str(classes[c]))


C_i:3601.000000 Score:0.000000


  str(classes[c]))


C_i:3701.000000 Score:0.000000


  str(classes[c]))


C_i:3801.000000 Score:0.000000


  str(classes[c]))


C_i:3901.000000 Score:0.000000


  str(classes[c]))


C_i:4001.000000 Score:0.000000


  str(classes[c]))


C_i:4101.000000 Score:0.000000


  str(classes[c]))


C_i:4201.000000 Score:0.000000


  str(classes[c]))


C_i:4301.000000 Score:0.000000


  str(classes[c]))


C_i:4401.000000 Score:0.000000


  str(classes[c]))


C_i:4501.000000 Score:0.000000


  str(classes[c]))


C_i:4601.000000 Score:0.000000


  str(classes[c]))


C_i:4701.000000 Score:0.000000


  str(classes[c]))


C_i:4801.000000 Score:0.000000


  str(classes[c]))


C_i:4901.000000 Score:0.000000


  str(classes[c]))


C_i:5001.000000 Score:0.000000


  str(classes[c]))


C_i:5101.000000 Score:0.000000


  str(classes[c]))


C_i:5201.000000 Score:0.000000


  str(classes[c]))


C_i:5301.000000 Score:0.000000


  str(classes[c]))


C_i:5401.000000 Score:0.000000


  str(classes[c]))


C_i:5501.000000 Score:0.000000


  str(classes[c]))


C_i:5601.000000 Score:0.000000


  str(classes[c]))


C_i:5701.000000 Score:0.000000


  str(classes[c]))


C_i:5801.000000 Score:0.000000


  str(classes[c]))


C_i:5901.000000 Score:0.000000


  str(classes[c]))


C_i:6001.000000 Score:0.000000


  str(classes[c]))


C_i:6101.000000 Score:0.000000


  str(classes[c]))


C_i:6201.000000 Score:0.000000


  str(classes[c]))


C_i:6301.000000 Score:0.000000


  str(classes[c]))


C_i:6401.000000 Score:0.000000


  str(classes[c]))


C_i:6501.000000 Score:0.000000


  str(classes[c]))


C_i:6601.000000 Score:0.000000


  str(classes[c]))


C_i:6701.000000 Score:0.000000


  str(classes[c]))


C_i:6801.000000 Score:0.000000


  str(classes[c]))


C_i:6901.000000 Score:0.000000


  str(classes[c]))


C_i:7001.000000 Score:0.000000


  str(classes[c]))


C_i:7101.000000 Score:0.000000


  str(classes[c]))


C_i:7201.000000 Score:0.000000


  str(classes[c]))


C_i:7301.000000 Score:0.000000


  str(classes[c]))


C_i:7401.000000 Score:0.000000


  str(classes[c]))


C_i:7501.000000 Score:0.000000


  str(classes[c]))


C_i:7601.000000 Score:0.000000


  str(classes[c]))


C_i:7701.000000 Score:0.000000


  str(classes[c]))


C_i:7801.000000 Score:0.000000


  str(classes[c]))


C_i:7901.000000 Score:0.000000


  str(classes[c]))


C_i:8001.000000 Score:0.000000


  str(classes[c]))


C_i:8101.000000 Score:0.000000


  str(classes[c]))


C_i:8201.000000 Score:0.000000


  str(classes[c]))


C_i:8301.000000 Score:0.000000


  str(classes[c]))


C_i:8401.000000 Score:0.000000


  str(classes[c]))


C_i:8501.000000 Score:0.000000


  str(classes[c]))


C_i:8601.000000 Score:0.000000


  str(classes[c]))


C_i:8701.000000 Score:0.000000


  str(classes[c]))


C_i:8801.000000 Score:0.000000


  str(classes[c]))


C_i:8901.000000 Score:0.000000


  str(classes[c]))


C_i:9001.000000 Score:0.000000


  str(classes[c]))


C_i:9101.000000 Score:0.000000


  str(classes[c]))


C_i:9201.000000 Score:0.000000


  str(classes[c]))


C_i:9301.000000 Score:0.000000


  str(classes[c]))


C_i:9401.000000 Score:0.000000


  str(classes[c]))


C_i:9501.000000 Score:0.000000


  str(classes[c]))


C_i:9601.000000 Score:0.000000


  str(classes[c]))


C_i:9701.000000 Score:0.000000


  str(classes[c]))


C_i:9801.000000 Score:0.000000


  str(classes[c]))


C_i:9901.000000 Score:0.000000


In [101]:
classifier = LinearSVC(C=2e4,loss='squared_hinge', penalty='l1', dual=False, tol=1e-3)
multiclass_clf = OneVsRestClassifier(classifier)
multiclass_clf.fit(tfidf_train_matrix, label_vectors)
# create the matrix for the current article
cv_test = cv.transform([t.content for t in test_articles])
tfidf_test = tfidf.transform(cv_test)
#
suggested_label_vectors = multiclass_clf.predict(tfidf_test)
    

suggested_tags = [[] for i in test_articles]
print()
print(suggested_label_vectors)
for i in range(len(suggested_label_vectors)):
    print("Article:", test_articles[i].title)
    print("Tags:")
    for j in range(len(suggested_label_vectors[i])):
        if suggested_label_vectors[i][j] == 1:
            suggested_tags[i].append(tags[j])
    print("  Suggested:", suggested_tags[i])
    print("  True:", url_to_tag[articles[i].url])

  str(classes[c]))



[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Article: BAHFest West 2