# Introduction
State notebook purpose here

### Imports
Import libraries and write settings here.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30
pd.options.display.float_format = '{:,.4f}'.format

# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from IPython import get_ipython
ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
import seaborn as sns
#import plotly.plotly as py
#import plotly.graph_objs as go
#from plotly.offline import iplot, init_notebook_mode
#init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')


plotly.graph_objs.YAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.YAxis
  - plotly.graph_objs.layout.scene.YAxis



plotly.graph_objs.XAxis is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.layout.XAxis
  - plotly.graph_objs.layout.scene.XAxis




## Custom imports

In [2]:
import json

# Analysis/Modeling
Do work here

## Read data

In [3]:
with open("/home/luca/Desktop/QT/rucio-opint.web.cern.ch.json", 'r') as f:
    raw_data = json.load(f)

tot_errors = raw_data["count"]

In [4]:
print("Total number of errors:", tot_errors)

Total number of errors: 1819


In [5]:
errors = pd.DataFrame(raw_data["results"]).set_index("id")

errors.head()

Unnamed: 0_level_0,amount,category,dst_site,last_modified,message,src_site,status,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,178,9,BNL-ATLAS,2019-08-06T14:07:04.412401+02:00,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,New,transfer-failure
2,3288,10,BU_ATLAS_Tier2,2019-08-06T14:07:04.595574+02:00,TRANSFER [70] TRANSFER globus_xio: Unable to ...,BNL-ATLAS,New,transfer-failure
3,3288,10,BU_ATLAS_Tier2,2019-08-06T14:07:04.692554+02:00,TRANSFER [70] TRANSFER globus_xio: Unable to ...,INFN-NAPOLI-ATLAS,New,transfer-failure
4,227,11,CERN-PROD,2019-08-06T14:07:04.803866+02:00,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,New,transfer-failure
5,130,11,TOKYO-LCG2,2019-08-06T14:07:04.891438+02:00,SOURCE [70] globus_xio: Unable to connect to a...,BU_ATLAS_Tier2,New,transfer-failure


## Extract information
Once we read the data, we can focus on the variable *message* and try to extract meaningful information from it.

### Clearing and tokenizing messages

In [6]:
# Reduce to lowercase and split in tokens
tokens_per_message = [x.lower().split() for x in errors.message]

In [7]:
# Retrieve the set of all tokens used in the error messages
word_set = set()
for mess in tokens_per_message:
    word_set = word_set.union(set(mess))
    
word_set

{'():',
 '0',
 '1',
 '403',
 '451',
 '500',
 '530',
 '530-globus_gsi_callback_module:',
 '530-globus_xio:',
 '900',
 ':',
 '[110]',
 '[5]',
 '[70]',
 '[95]',
 '[gsiftp]',
 '[is_stagein=false]:',
 '[net=2001:1458:201:e3:0:0:100:21,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].',
 '[net=2001:1458:301:67:0:0:100:ac,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].',
 '[net=2001:1458:301:67:0:0:100:ac,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]',
 '[net=2001:1458:301:bc:0:0:100:19,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]',
 '[rc=10025,msg=no',
 '[se][ls][]',
 '[se][preparetoget][srm_failure]',
 '[se][preparetoget][srm_not_supported]',
 '[se][preparetoput][srm_not_supported]',
 '[se][srmrm][]',
 'actions',
 'after',
 'an',
 'api',
 'are',
 'at',
 'atlas-gridftp.bu.edu:2811',
 'attached',
 'attempts',
 'authentication',
 'available',
 'because',
 'been',
 'blacklisted',
 'blackli

In [8]:
print("We have {} error messages, for a total of {} unique tokens adopted.".format(
    len(tokens_per_message), len(word_set)))

We have 100 error messages, for a total of 205 unique tokens adopted.


In [9]:
# Setup a dictionary with frequency of all tokens per each message (initialized to 0)
word_dict = [dict.fromkeys(word_set, 0) for i in range(len(tokens_per_message))]
print("Number of tokens:", len(word_dict[0]))

word_dict[0]

Number of tokens: 205


{'could': 0,
 'failed.': 0,
 'write': 0,
 '[70]': 0,
 'or': 0,
 'atlas-gridftp.bu.edu:2811': 0,
 'httpg://head01.aglt2.org:8443/srm/managerv2:': 0,
 'globus_common:': 0,
 'file': 0,
 'exception': 0,
 'fts813.cern.ch': 0,
 '500': 0,
 'cgsi-gsoap': 0,
 'number': 0,
 '():': 0,
 'output': 0,
 'to': 0,
 'but': 0,
 'ssl3_get_server_certificate:': 0,
 'reports': 0,
 'name': 0,
 'verify': 0,
 'moment.': 0,
 'maximum': 0,
 'lcgdpmse.dnp.fmph.uniba.sk:8446': 0,
 'request': 0,
 'running': 0,
 '1': 0,
 'pools': 0,
 '0': 0,
 '530': 0,
 '451': 0,
 'failed:': 0,
 'after': 0,
 'been': 0,
 'stayed': 0,
 'excluded': 0,
 'out': 0,
 'an': 0,
 'supported,': 0,
 'connection': 0,
 'queue,': 0,
 'range': 0,
 'on': 0,
 'only': 0,
 'logs': 0,
 'ddmendpoint': 0,
 'httpg://storm-fe.cr.cnaf.infn.it:8444/srm/managerv2:': 0,
 'result': 0,
 'long': 0,
 'status': 0,
 'davposix::unlink': 0,
 'fts': 0,
 'refused': 0,
 'online': 0,
 'send,': 0,
 'actions': 0,
 'canceled': 0,
 'index': 0,
 'lcgfts02.gridpp.rl.ac.uk': 0,
 

In [11]:
# Compute raw frequencies of each token per each message
for i in range(len(errors.message)):
    for word in tokens_per_message[i]:
        word_dict[i][word] += 1
        
word_dict[0]

{'could': 0,
 'failed.': 1,
 'write': 0,
 '[70]': 1,
 'or': 0,
 'atlas-gridftp.bu.edu:2811': 1,
 'httpg://head01.aglt2.org:8443/srm/managerv2:': 0,
 'globus_common:': 1,
 'file': 0,
 'exception': 0,
 'fts813.cern.ch': 0,
 '500': 0,
 'cgsi-gsoap': 0,
 'number': 0,
 '():': 0,
 'output': 0,
 'to': 2,
 'but': 0,
 'ssl3_get_server_certificate:': 0,
 'reports': 0,
 'name': 1,
 'verify': 0,
 'moment.': 0,
 'maximum': 0,
 'lcgdpmse.dnp.fmph.uniba.sk:8446': 0,
 'request': 0,
 'running': 0,
 '1': 0,
 'pools': 0,
 '0': 0,
 '530': 0,
 '451': 0,
 'failed:': 0,
 'after': 0,
 'been': 0,
 'stayed': 0,
 'excluded': 0,
 'out': 0,
 'an': 0,
 'supported,': 0,
 'connection': 0,
 'queue,': 0,
 'range': 0,
 'on': 0,
 'only': 0,
 'logs': 0,
 'ddmendpoint': 0,
 'httpg://storm-fe.cr.cnaf.infn.it:8444/srm/managerv2:': 0,
 'result': 0,
 'long': 0,
 'status': 0,
 'davposix::unlink': 0,
 'fts': 0,
 'refused': 0,
 'online': 0,
 'send,': 0,
 'actions': 0,
 'canceled': 0,
 'index': 0,
 'lcgfts02.gridpp.rl.ac.uk': 0,
 

In [12]:
# Visualization
pd.DataFrame(word_dict)

Unnamed: 0,():,0,1,403,451,500,530,530-globus_gsi_callback_module:,530-globus_xio:,900,:,[110],[5],[70],[95],[gsiftp],[is_stagein=false]:,"[net=2001:1458:201:e3:0:0:100:21,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].","[net=2001:1458:301:67:0:0:100:ac,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].","[net=2001:1458:301:67:0:0:100:ac,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]","[net=2001:1458:301:bc:0:0:100:19,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]","[rc=10025,msg=no",[se][ls][],[se][preparetoget][srm_failure],[se][preparetoget][srm_not_supported],...,stalled,status,stayed,storm-fe.cr.cnaf.infn.it:8444,"supported,",supported:,taken,temporary,the,timed,to,too,transfer,turl,unable,unexpected,unknown,updated,uploaded.,user,verify,were,with,write,writing.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,2,0,2,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
9,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0


### Compute TF-IDF scores
Once we have messages divided in words and we have computed the raw frequencies of each token in each sentence, then we can proceed and compute the **tf-idf** score for each message.

In [13]:
def compute_tf(word_dict, l):
    tf = {}
    sum_nk = len(l)
    for word, count in word_dict.items():
        tf[word] = count/sum_nk
    return tf

tf = [compute_tf(word_dict[i], tokens_per_message[i]) for i in range(len(tokens_per_message))]
# tf_A = compute_tf(word_dict_A, l_A)
# tf_B = compute_tf(word_dict_B, l_B)
# tf_C = compute_tf(word_dict_C, l_C)

In [14]:
def compute_idf(strings_list):
    n = len(strings_list)
    idf = dict.fromkeys(strings_list[0].keys(), 0)
    for l in strings_list:
        for word, count in l.items():
            if count > 0:
                idf[word] += 1
    
    for word, v in idf.items():
        idf[word] = np.log(n / float(v))
    return idf

idf = compute_idf(word_dict)
# idf = compute_idf([word_dict_A, word_dict_B, word_dict_C])

In [16]:
len(word_dict), len(tf), len(tf[0]), len(idf)

(100, 100, 205, 205)

In [30]:
tf_idf

[{'could': 0.0,
  'failed.': 0.17621954550317592,
  'write': 0.0,
  '[70]': 0.04317465735765885,
  'or': 0.0,
  'atlas-gridftp.bu.edu:2811': 0.17621954550317592,
  'httpg://head01.aglt2.org:8443/srm/managerv2:': 0.0,
  'globus_common:': 0.17621954550317592,
  'file': 0.0,
  'exception': 0.0,
  'fts813.cern.ch': 0.0,
  '500': 0.0,
  'cgsi-gsoap': 0.0,
  'number': 0.0,
  '():': 0.0,
  'output': 0.0,
  'to': 0.09394208190797312,
  'but': 0.0,
  'ssl3_get_server_certificate:': 0.0,
  'reports': 0.0,
  'name': 0.17621954550317592,
  'verify': 0.0,
  'moment.': 0.0,
  'maximum': 0.0,
  'lcgdpmse.dnp.fmph.uniba.sk:8446': 0.0,
  'request': 0.0,
  'running': 0.0,
  '1': 0.0,
  'pools': 0.0,
  '0': 0.0,
  '530': 0.0,
  '451': 0.0,
  'failed:': 0.0,
  'after': 0.0,
  'been': 0.0,
  'stayed': 0.0,
  'excluded': 0.0,
  'out': 0.0,
  'an': 0.0,
  'supported,': 0.0,
  'connection': 0.0,
  'queue,': 0.0,
  'range': 0.0,
  'on': 0.0,
  'only': 0.0,
  'logs': 0.0,
  'ddmendpoint': 0.0,
  'httpg://storm-

In [29]:
def compute_tf_idf(tf, idf):
    tf_idf = dict.fromkeys(tf.keys(), 0)
    for word, v in tf.items():
        tf_idf[word] = v * idf[word]
    return tf_idf
    
tf_idf =  [compute_tf_idf(tf[i], idf) for i in range(len(tf))]
# tf_idf_A = compute_tf_idf(tf_A, idf)
# tf_idf_B = compute_tf_idf(tf_B, idf)
# tf_idf_C = compute_tf_idf(tf_C, idf)

In [32]:
pd.DataFrame(tf_idf)

Unnamed: 0,():,0,1,403,451,500,530,530-globus_gsi_callback_module:,530-globus_xio:,900,:,[110],[5],[70],[95],[gsiftp],[is_stagein=false]:,"[net=2001:1458:201:e3:0:0:100:21,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].","[net=2001:1458:301:67:0:0:100:ac,protocol=dcap/3,store=atlas:atlasdatadisk@osm,cache=,linkgroup=]].","[net=2001:1458:301:67:0:0:100:ac,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]","[net=2001:1458:301:bc:0:0:100:19,protocol=gftp/2,store=atlas:atlasdatadisk@osm,cache=,linkgroup=atlasdatadisk_lg]","[rc=10025,msg=no",[se][ls][],[se][preparetoget][srm_failure],[se][preparetoget][srm_not_supported],...,stalled,status,stayed,storm-fe.cr.cnaf.infn.it:8444,"supported,",supported:,taken,temporary,the,timed,to,too,transfer,turl,unable,unexpected,unknown,updated,uploaded.,user,verify,were,with,write,writing.
0,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0432,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.2063,0.0000,0.0000,0.0939,0.0000,0.0000,0.0000,0.1762,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0408,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1948,0.0000,0.0000,0.0887,0.0000,0.0991,0.0000,0.1664,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0408,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1948,0.0000,0.0000,0.0887,0.0000,0.0991,0.0000,0.1664,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
3,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0432,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0939,0.0000,0.0000,0.0000,0.1762,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
4,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0432,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0939,0.0000,0.0000,0.0000,0.1762,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
5,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0657,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0887,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
6,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0539,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0729,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
7,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.1486,0.1486,0.1486,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0731,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0658,0.0000,0.0000,0.0000,0.1486,0.0000,0.0591,0.0000,0.0000
8,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0539,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0729,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
9,0.0000,0.0000,0.1125,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0604,0.0000,0.0319,0.0000,0.0000,0.0000,0.0000,0.0000,0.0816,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000


## Try clustering

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd


print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.02, stop_words='english',
                             use_idf=True)
# vectorizer = TfidfVectorizer(stop_words='english',
#                              use_idf=True)
X = vectorizer.fit_transform(errors.message)


In [72]:
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [83]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()
vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.02, stop_words='english',
                             use_idf=True)
# vectorizer = TfidfVectorizer(stop_words='english',
#                              use_idf=True)
X = vectorizer.fit_transform(errors.message)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()

print("Performing dimensionality reduction using LSA")
t0 = time()

# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.
svd = TruncatedSVD(25)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))

print()

Extracting features from the training dataset using a sparse vectorizer
done in 0.004272s
n_samples: 100, n_features: 163

Performing dimensionality reduction using LSA
done in 0.019443s
Explained variance of the SVD step: 98%



In [90]:
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1,
                verbose=1)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))
print()

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=6, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=1)
Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 43.3311292266567
start iteration
done sorting
end inner loop
Iteration 1, inertia 42.59942570928203
start iteration
done sorting
end inner loop
Iteration 2, inertia 42.59942570928203
center shift 0.000000e+00 within tolerance 3.418905e-06


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=6, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=1)

done in 0.008s



NameError: name 'labels' is not defined

# Results
Show graphs and stats here

# Conclusions and Next Steps
Summarize findings here