In [4]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline

# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

from bs4 import BeautifulSoup
from collections import OrderedDict # provides the ordered dictionary
import re # for regular expressions used below
import urllib # to read from URLs
import json
import networkx as nx # network analysis
from networkx.readwrite import json_graph
import itertools
import os.path
from datetime import datetime # for time measurement
import sys
import os
import pickle
import subprocess as subp
import gzip

from jellyfish import jaro_distance, jaro_winkler, hamming_distance, levenshtein_distance
import scipy.cluster.hierarchy as scipycluster

from skimage import io, exposure
from scipy.spatial import distance
# import the k-means algorithm
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import pairwise_distances_argmin,pairwise_distances_argmin_min, pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer

# OAI
from sickle import Sickle

import googlemaps

def printLog(text):
    now=str(datetime.now())
    print "["+now+"]\t"+text
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()
    
def pickleCompress(fileName,pickledObject):
    printLog("Pickling to '%s'" %fileName)
    f = gzip.open(fileName,'wb')
    pickle.dump(pickledObject,f)
    f.close()
    printLog("Pickling done.")
    
def pickleDecompress(fileName):
    #restore the object
    printLog("Depickling from '%s'" %fileName)
    f = gzip.open(fileName,'rb')
    pickledObject = pickle.load(f)
    f.close()
    printLog("Depickling done.")
    return pickledObject

In [2]:
df=pickleDecompress('clean_dataframe_with_century.picklez')

[2016-03-03 21:29:07.970434]	Depickling from 'clean_dataframe_with_century.picklez'
[2016-03-03 21:29:43.428282]	Depickling done.


In [14]:
tfidfvectorizer = TfidfVectorizer(min_df=1)
df2=df[df.title.notnull()]
corpus=df2.title.astype(str)
Xtfidf=tfidfvectorizer.fit_transform(corpus)
wordstfidf=tfidfvectorizer.get_feature_names()

In [24]:
true_k=100
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
# fit the k-means algorithm on the data created above
km.fit(Xtfidf)
# add the detected clusters as a new column to the original data frame
df2['cluster']=km.labels_
# group the data by the cluster and describe it
df2.groupby('cluster').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,century,latitude,longitude
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,179.000000,106.000000,106.000000
0,mean,16.810056,48.239131,13.924442
0,std,0.568568,11.457401,20.169983
0,min,15.000000,-30.766253,-74.072092
0,25%,17.000000,48.370545,10.649284
0,50%,17.000000,51.107885,13.137871
0,75%,17.000000,52.489032,14.550567
0,max,19.000000,59.934280,106.629664
1,count,482.000000,374.000000,374.000000
1,mean,16.946058,50.686737,11.225477


In [27]:
df2=df2.sort("cluster")
df2[['title','cluster']].tail(100)

  if __name__ == '__main__':


Unnamed: 0,title,cluster
7226,Umschlag mit der Aufschrift von Humboldts Hand...,99
91259,Glückwünschungs-Ode an den Hochwohlgebohrnen H...,99
16264,Bums - schon wieder ein Brummer!,99
115923,Theses oder Grundsätze von besondern Kauffmann...,99
72408,Brief von Adelbert von Chamisso an Antonie von...,99
98357,Die billige Trauer Bey dem schmertzlichen Hint...,99
51322,"Wo soll ich mich hinwenden, Bey der betrübten ...",99
38548,Copey und Abschrifft Des Schreibens/ Der Drey ...,99
84340,Iudicium Mundi Et Spiritus Sancti De Obitu Ius...,99
110257,Kriegs-Rath Vom Belial Und Seinen lieben getre...,99
