In [3]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import pandas as pd
import numpy as np
pd.options.display.max_rows = 20

# Data read and prep

The first step we take is to read in our data.

In [53]:
alltex = pd.read_csv("alltexts copy.csv")
alltex = alltex.drop(alltex.columns[0], axis=1) #drop the first column, which is just the row numbers from the csv
alltex

Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),
3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),


Next, we filter out languages we don't care about. To do this, we look at the 'lang' column, and select only those rows where the 'lang' value starts with 'sux'

In [54]:
alltex = alltex.loc[[str(i)[:3] == 'sux' for i in alltex.lang], :]
alltex

Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),
3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),


Next, we'd like to filter out the rows that have nulls in the cf, gw, or pos columns. In other words, we keep columns that *don't* have NA in gw AND *don't* have NA in form AND *don't* have NA in pos. 

In [55]:
alltex = alltex.loc[np.logical_and(~pd.isnull(alltex.gw),
                         np.logical_and(~pd.isnull(alltex.form),
                                       ~pd.isnull(alltex.pos))
                         )]
alltex

Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),
3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),


Next, I filter out "X" and "(X)". We want rows where gw is *not* X or (X) AND form is *not* X or (X) AND pos is *not* X or (X).

In [56]:
xstrings = ["X", "(X)"]

In [57]:
alltex = alltex.loc[np.logical_and(~np.in1d(alltex.gw, xstrings) ,
                         np.logical_and(~np.in1d(alltex.form, xstrings),
                                       ~np.in1d(alltex.pos, xstrings))
                         )]
alltex

Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),
3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),


At this point, we're ready to start lemmatizing, or creating the "terms". Terms look like this for each word: 

cf + '[' + gw + ']' + pos 

To do this, we'll paste together the appropriate column entries for each row, and create a new column called "Term".

In [58]:
alltex['Term'] = [str(alltex.cf[i]) + "[" + str(alltex.gw[i]) + "]" + str(alltex.pos[i]) for i in alltex.index]
alltex

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version,Term
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),,dubsaŋ[first]AJ
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),,Enki[1]DN
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),,unu[dwelling]N
3,gal,,gal,big,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,gal[big]V/i
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,ed[ascend]V/i
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),,anzag[horizon]N
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,anŋi[eclipse]N
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),,zu[know]V/t
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,ama[mother]N
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,tu[incantation]N


Now, with these terms, we can filter out stop words and "stop docs".
* Stop word: A term to common in the language to be useful for classification; examples in English include the articles (a, an, the), as they are used in nearly every document and thus are not useful for distinguishing between documents.
* "Stop doc": A document that we would not like to include for whatever reason, such as being too short

The things we want to filter out could change, so I'll write a function to do this filtering.

In [59]:
def filter(table, stopwords = [], stopdocs = []):
    """
    Parameters:
        table: Pandas dataframe
            The table to modify/filter.
        stopwords: Arraylike, Strings
            A collection of strings of terms to filter out
        stopdocs: Arraylike, Strings
    Returns a table without the terms and documents specified
    """
    return table.loc[np.logical_and(~np.in1d(table.Term, stopwords), ~np.in1d(table.id_text, stopdocs))]

In [60]:
stopwords = np.array(pd.read_csv("../stopwords_top21.txt", header=None)[0])
stopdocs = np.array(pd.read_csv("../stopdocuments_less50.txt", header = None)[0])
stopwords,stopdocs

(array(['dug[speak]V/t', 'ki[place]N', 'šu[hand]N', 'gal[big]V/i',
        'lu[person]N', 'e[house]N', 'ŋar[place]V/t', 'šag[heart]N',
        'kur[mountain]N', 'lugal[king]N', 'ud[sun]N', 'Enlil[1]DN',
        'igi[eye]N', 'e[leave]V/i', 'kug[pure]V/i', 'an[sky]N',
        'saŋ[head]N', 'ak[do]V/t', 'gub[stand]V/i', 'en[lord]N',
        'ŋal[be]V/i'], dtype=object),
 array(['c.0.2.05', 'c.0.2.12', 'c.2.4.1.8', 'c.2.4.2.12', 'c.2.4.2.23',
        'c.2.4.4.5', 'c.2.4.4.9', 'c.2.5.2.3', 'c.2.5.4.16', 'c.2.5.5.8',
        'c.2.6.9.a', 'c.2.8.2.5', 'c.2.8.3.4', 'c.2.8.3.7', 'c.2.8.3.8',
        'c.2.99.b', 'c.2.99.c', 'c.2.99.d', 'c.3.1.10', 'c.3.1.13.1',
        'c.3.2.01', 'c.3.3.06', 'c.3.3.07', 'c.3.3.12', 'c.3.3.20',
        'c.3.3.27', 'c.4.08.12', 'c.4.08.13', 'c.4.08.22', 'c.4.13.b',
        'c.4.13.d', 'c.4.19.4', 'c.4.22.3', 'c.4.27.a', 'c.4.30.1',
        'c.5.7.3', 'c.5.7.a', 'c.6.2.4'], dtype=object))

In [62]:
alltex = filter(alltex, stopwords=stopwords, stopdocs = stopdocs)
alltex

Unnamed: 0,cf,extent,form,gw,id_text,lang,line_no,line_ref,pos,status,text_name,version,Term
0,dubsaŋ,,dub-saŋ-ta,first,c.0.1.1,sux,1,1,AJ,,Ur III catalogue from Nibru (N1),,dubsaŋ[first]AJ
1,Enki,,{d}en-ki,1,c.0.1.1,sux,2,2,DN,,Ur III catalogue from Nibru (N1),,Enki[1]DN
2,unu,,unu₂,dwelling,c.0.1.1,sux,2,2,N,,Ur III catalogue from Nibru (N1),,unu[dwelling]N
4,ed,,im-ed₃,ascend,c.0.1.1,sux,2,2,V/i,,Ur III catalogue from Nibru (N1),,ed[ascend]V/i
5,anzag,,an-zag-še₃,horizon,c.0.1.1,sux,3,3,N,,Ur III catalogue from Nibru (N1),,anzag[horizon]N
6,anŋi,,an-ŋi₆,eclipse,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,anŋi[eclipse]N
7,zu,,zu,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),,zu[know]V/t
8,ama,,ama,mother,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,ama[mother]N
9,tu,,tu₆,incantation,c.0.1.1,sux,4,4,N,,Ur III catalogue from Nibru (N1),,tu[incantation]N
10,zu,,zu-ke₄,know,c.0.1.1,sux,4,4,V/t,,Ur III catalogue from Nibru (N1),,zu[know]V/t


At this point, we are ready to create a tfidf matrix.

In [143]:
np.unique(alltex.Term).shape #this gives us an idea of how many columns we should see in the tfidf 

(4302,)

# TFIDF

We'd like to numerically represent our data in a useful way. For this purpose, we create a document-term matrix: a matrix with terms as the column labels, and documents as the rows. The value in the cell at position (document, term) will have a number corresponding to how frequently that word appears in the document. However, if the word is very common across all our documents, it won't be as helpful in classification. So we'd like to "penalize" words that appear too frequently, where "too frequently" is based on how many documents it appears in. Wanting a measure these two things
* How often some term t appears in a particular document d
* How often this term t appears in all documents (and penalize for this)
leads us to the TFIDF (term frequency inverse document frequency measure).

We calculate a TFIDF by the formula
$$
tfidf(t, d, D) = tf(t,d)\cdot idf(t,D)
$$
where $t$ is the term, $d$ is the document, and $D$ is the corpus.
Here, $tf(t,d)$ is just going to be a function returning the raw count of term t in document d. The idf will be  
$$
log\frac{1 + N}{1 + n_t} + 1
$$
where N is the number of documents in the corpus and n_t is the number of docs where the term appears (formally $|{d \in D: t \in d}|$). The +1s in the log are to safeguard against log(0) or division by 0 errors, while the +1 outside ensures a positive value.

This means the overall expression for tfidf is
$$
f_{t,d}\cdot ln(\frac{N}{n_t})
$$
The full workings of the following procedure are documented here: http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

To create this tfidf, we use sklearn's vectorizer. But before doing that, we need to represent our corpus as a list of documents, which it currently is not (it's currently split up in our dataframe). To do this, we aggregate:

In [70]:
grouped = alltex.groupby("id_text").aggregate(" ".join) #this line of code lumps together all documents' terms together, and joins them as a single space separated string
grouped

Unnamed: 0_level_0,cf,form,gw,lang,line_no,pos,text_name,Term
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
c.0.1.1,dubsaŋ Enki unu ed anzag anŋi zu ama tu zu gi ...,dub-saŋ-ta {d}en-ki unu₂ im-ed₃ an-zag-še₃ an-...,first 1 dwelling ascend horizon eclipse know m...,sux sux sux sux sux sux sux sux sux sux sux su...,1 2 2 2 3 4 4 4 4 4 5 5 6 7 7 7 8 8 8 8 9 10 1...,AJ DN N V/i N N V/t N N V/t N V/t N N N V/i N ...,Ur III catalogue from Nibru (N1) Ur III catalo...,dubsaŋ[first]AJ Enki[1]DN unu[dwelling]N ed[as...
c.0.1.2,diŋir šembizida dar kur mete gud banda šerzid ...,diŋir šembi₂-zid-da dar-a nu-kur₂ me-te-bi gud...,deity kohl split different appropriate-thing o...,sux sux sux sux sux sux sux sux sux sux sux su...,1 1 1 2 3 5 5 6 7 7 7 8 9 9 10 10 10 10 11 11 ...,N N V/t V/i N N V/i N N V/i N N N V/i N N NU V...,Ur III catalogue at Yale (Y1) Ur III catalogue...,diŋir[deity]N šembizida[kohl]N dar[split]V/t k...
c.0.2.01,mi niŋdu nin me šar sud nam nun re innin me hu...,mi₂ niŋ₂-du₇-e nin me šar₂-ra su₃-ra₂-še₃ nam₂...,cvne appropriate-thing lady Being 3600 distant...,sux sux sux sux sux sux sux sux sux sux sux su...,2 3 4 4 4 5 6 6 7 8 8 8 9 9 10 11 11 12 12 13 ...,N N N N NU V/i N N DP N N V/i N V/i V/i V/i N ...,OB catalogue from Nibru (N2) OB catalogue from...,mi[cvne]N niŋdu[appropriate-thing]N nin[lady]N...
c.0.2.02,sud nam nun re innin me huš ud huš til hursaŋ ...,su₃-ra₂-še₃ nam₂ nun-e re-a in-nin me huš-a ud...,distant lord prince that lady Being reddish st...,sux sux sux sux sux sux sux sux sux sux sux su...,5 6 6 7 8 8 8 9 9 10 11 12 12 13 13 14 15 15 1...,V/i N N DP N N V/i N V/i V/i N N V/i N N DP DP...,OB catalogue in the Louvre (L) OB catalogue in...,sud[distant]V/i nam[lord]N nun[prince]N re[tha...
c.0.2.03,mu niŋul Waradsin Inab me ŋuruš šir rah enbar ...,mu-ni niŋ₂-ul-še₃ IR₃-{d}suen i₃-na-ab i-me-a ...,name everlasting 1 1 be male song beat reed fr...,sux sux sux sux sux sux sux sux sux sux sux su...,1 1 2 3 3 4 4 4 5 5 6 7 8 9 10 10 10 11 11 12 ...,N N RN SN V/i N N V/t N N N RN V/i N N N V/i Q...,OB catalogue from Urim (U1) OB catalogue from ...,mu[name]N niŋul[everlasting]N Waradsin[1]RN In...
c.0.2.04,nin mulan buršuma niŋziŋal ursaŋ mi meta di me...,nin mul-an-gin₇ bur-šum₂-ma zi-ŋal₂ ur-saŋ-me-...,lady star dowager living-creature hero cvne wh...,sux sux sux sux sux sux sux sux-x-emesal sux s...,1 1 2 3 4 5 6 6 7 7 8 8 8 9 10 10 11 11 12 13 ...,N N N N N N QP V/i QP V/i N N NU V/i J V/i V/i...,OB catalogue from Urim (U2) OB catalogue from ...,nin[lady]N mulan[star]N buršuma[dowager]N niŋz...
c.0.2.06,ursaŋ alim mah mea lu meʾam RI-RI iri re me ga...,ur-saŋ alim mah me-a lu me-a-am nu-mu-e-de₃-RI...,hero bison great where? abundant dear RI-RI ci...,sux sux sux sux sux sux sux sux sux sux sux su...,1 1 1 2 2 3 4 5 6 7 7 8 8 9 9 10 10 11 12 12 1...,N N V/i QP V/i J V N DP N V/i QP N DN DN NU V/...,OB catalogue from Nibru (N3) OB catalogue from...,ursaŋ[hero]N alim[bison]N mah[great]V/i mea[wh...
c.0.2.07,uru2 er balaŋ Nin.X illalum aya illalum balaŋ ...,uru₂ am₃-ma-er₂-ra-bi balaŋ {d}nin-X il-la-lum...,town tears instrument 1 expression cry express...,sux-x-emesal sux sux sux sux sux sux sux sux s...,A1 A1 A2 A2 A4 A4 A4 A5 A5 A6 A6 A6 A7 A7 A7 A...,N N N DN J N J N DN N V/i N NU N DN NU N NU N ...,OB catalogue possibly from Zimbir (B1) OB cata...,uru2[town]N er[tears]N balaŋ[instrument]N Nin....
c.0.2.08,a-ce-er il nir atuku gel-le-ej3 mah uma gig um...,a-še-er il₂ nir a₂-tuku gel-le-eŋ₃-ŋa₂-ŋu₁₀ ma...,lament raise trust powerful to-be-lost great t...,sux-x-emesal sux sux sux sux-x-emesal sux sux ...,A4 A6 B3 B3 B4 B6 B7 B7 B9 B9 B10 B10 B11 B11 ...,N V/t N N V V/i N V/i N V/t N V/i DN N V/i DN ...,OB catalogue from Nibru (N4) OB catalogue from...,a-ce-er[lament]N il[raise]V/t nir[trust]N atuk...
c.0.2.11,innin gur innin me huš nin me šar An Uraš inni...,in-nin gur₄-ra in-nin me huš-a nin me šar₂-ra ...,lady thick lady Being reddish lady Being 3600 ...,sux sux sux sux sux sux sux sux sux sux sux su...,1 1 2 2 2 3 3 3 4 4 5 5 6 8 9 10 11 11 12 19 1...,N V/i N N V/i N N NU DN DN N DN DN N N V/i N N...,OB catalogue at Andrews University (B4) OB cat...,innin[lady]N gur[thick]V/i innin[lady]N me[Bei...


In [142]:
cv = CountVectorizer(token_pattern='[^ ]+', lowercase = False) #initialize a count vectorizer object with correct tokenization for this term format
dtm = cv.fit_transform(list(grouped.Term))
tt = TfidfTransformer(use_idf=True)
dtm_tf = tt.fit_transform(dtm)
dtm_tf.shape #356 docs, 4301 unique terms, as expected

(356, 4302)

In [150]:
tfidf = pd.DataFrame(dtm_tf.toarray(), columns=cv.get_feature_names(), index=grouped.index)
tfidf

Unnamed: 0_level_0,1-kam-ma[1st]NU,1/2[1/2]NU,1/3[1/3]NU,10-kam-ma[10th]NU,1000[1000]NU,100[100]NU,108000[108000]NU,1080[1080]NU,108[108]NU,10[10]NU,...,šusi[finger]N,šuteŋ[accept]V/t,šutubur[mixture]N,šutug[reed-hut]N,šutum[storehouse]N,šutur[garment]N,šuš[cover]V/t,šuʾi[barber]N,šuʾu[stone]N,šuʾura[goose]N
id_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c.0.1.1,0.22871,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.1.2,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.091941,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.01,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.02,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.03,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.04,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.06,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.07,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.08,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
c.0.2.11,0.00000,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.136534,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


# Differences between documents

Now, with a TFIDF matrix, we can begin assessing similarity of documents by treating each row of this matrix as a respresentation of that document (specifically, as a vector) in N dimensional Euclidian space, where N is the number of features (words). Similarity of documents can be interpreted as being the distance between documents in this N space. Here, we will be interested in two measures of distance: cosine distance and Euclidian (L2) distance. 

## Cosine Distance

A cosine similarity (https://en.wikipedia.org/wiki/Cosine_similarity) is defined as the $cos\theta$ of two vectors, and can be calculated from the dot product of two vectors. Thus, to determine cosine distance between two documents, we must calculate the dot products of their vector representations. 

Suppose we have a vector space with n dimensions $\mathbb{V_n}$ (think of the regular 2d Euclidian plane as  $\mathbb{V_2}$), and vectors $\boldsymbol{X},\boldsymbol{Y} \in \mathbb{V_n}$. We can represent $\boldsymbol{X}$ and $\boldsymbol{Y}$ as column vectors of size n:
$$
\boldsymbol{X} = \begin{bmatrix}
           x_{1} \\
           x_{2} \\
           \vdots \\
           x_{n}
         \end{bmatrix}, 
\boldsymbol{Y} = \begin{bmatrix}
           y_{1} \\
           y_{2} \\
           \vdots \\
           y_{n}
         \end{bmatrix}
$$

Then, the dot product of these two vectors is defined as 

$$
\boldsymbol{X}\cdot \boldsymbol{Y} = 
           x_{1}y_{1} +
           x_{2}y_{2} +
           \dots +
           x_{n}y_{n}
$$

The geometric intepretation of the dot product is defined as follows: 
$$
\boldsymbol{X}\cdot\boldsymbol{Y} = |\boldsymbol{X}||\boldsymbol{Y}|cos\theta 
$$
where $|\boldsymbol{X}|$ and $|\boldsymbol{Y}|$ are the so-called "norms" of the vectors (essentially, the lengths of the vectors). 
These norms are defined as the square root of the dot product of a vector with itself, so 
$$|\boldsymbol{X}| = \sqrt{\boldsymbol{X}\cdot \boldsymbol{X} }
           =\sqrt{x_{1}x_{1} +
           x_{2}x_{2} +
           \dots +
           x_{n}x_{n}}$$
Then, we can solve for $cos \theta$
$$
cos \theta = \frac{\boldsymbol{X}\cdot\boldsymbol{Y}}{\sqrt{|\boldsymbol{X}|}\sqrt{|\boldsymbol{Y}|}}  
$$
In other words, cosine similarity is 
$$
\frac{\boldsymbol{X}\cdot\boldsymbol{Y}}{(\boldsymbol{X}\cdot\boldsymbol{X})(\boldsymbol{Y}\cdot\boldsymbol{Y})} 
=    \frac{x_{1}y_{1} +
           x_{2}y_{2} +
           \dots +
           x_{n}y_{n}}
           {
           \sqrt{(x_{1}x_{1} +
           x_{2}x_{2} +
           \dots +
           x_{n}x_{n})}
           \sqrt{(y_{1}y_{1} +
           y_{2}y_{2} +
           \dots +
           y_{n}y_{n})}
           }
$$
Cosine distance is simply 1 - the cosine similarity.

To actually use this, we can use scipy.spacial.disance's cosine function. 

In [188]:
from scipy.spatial.distance import cosine
#some demos
test1 = np.arange(10)
test2 = 3 * test1
test3 = -1 * test1
test4 = np.zeros(10)
test4[0] = 1
test5 = np.zeros(10)
test5[1] = 1
print(cosine(test1, test2)) #should be 1 - 1 = 0, 1 - cosine of 0
print(cosine(test1, test3)) #should be 1 - (-1) = 2, 1 - cosine of pi
print(cosine(test4, test5)) #should be 1 , 1 - cosine of pi/2

1.11022302463e-16
2.0
1.0


But anyway, let's test it out on some stuff:
We expect the distance to be high between clusters 6 and 1, and close between documents within 6. Let's see if this reflects that.

In [189]:
np.mean([cosine(tfidf.iloc[-1], tfidf.iloc[-i]) for i in np.arange(15)]),np.mean([cosine(tfidf.iloc[-1], tfidf.iloc[i]) for i in np.arange(15)])

(0.69187535425551472, 0.88791964785245947)

So looking at the average distance between the last doc (from cluster 6) and 15 other docs from cluster 6, the average distance is lower than the difference between that document and the documents from the other end, 15 docs from clusters 0 and 1. Makes sense.

## Euclidean distance

Another metric we might like to use is the Euclidian distance, which can be more natural to think about. That is, given two vectors (like the ones described above), we can compute the distance between the endpoints of those vectors as such:
$$
d = \sqrt{
(x_1 - y_1)^2 +
(x_2 - y_2)^2 + 
\dots + 
(x_n - y_n)^2 
}
$$
You might recognize this as a generalized version of the Pythagorean theorem.
Also note that this can be related to the above picture of vectors, because this exactly corresponds to the norm of the vector $\boldsymbol{X} - \boldsymbol{Y}$:
$$
|\boldsymbol{X} - \boldsymbol{Y}| = 
\Huge\lvert\small\begin{bmatrix}
x_1 - y_1 \\
x_2 - y_2 \\
\vdots\\
x_n - y_n
\end{bmatrix}\Huge\rvert
\normalsize= \sqrt{
(x_1 - y_1)^2 +
(x_2 - y_2)^2 + 
\dots + 
(x_n - y_n)^2 
}
$$
Using this definition, we can easily define a function to calculate a Euclidian distance. 

However, one caveat is that distance can be largely affected by lengths of vectors; it is apparent that vectors of distinctly different lengths will be determined to be far apart, even in the body of their text is very similar (eg, compare ['Hello World'] vs ['Hello World Hello World Hello World Hello World Hello World']). To correct for this, we can normalize the vectors (ie, divide the vector by its length, which is equivalent to dividing every single element by the vector's length/norm). All normalized vectors then have length 1. After performing this operation, the Euclidian distance can then be calculated for a more accurate idea of the distance between vectors. 

In [200]:
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import norm
print(euclidean(test1, test2))
print(euclidean(test1, test3))
print(euclidean(test4, test5))

33.7638860323
33.7638860323
1.4142135623730951


Notice that these values are too large to make sense; the furthest distance two points can be from one another on a unit sphere is 2. Normalizing will help:

In [201]:
print(euclidean(test1/norm(test1), test2/norm(test2)))
print(euclidean(test1/norm(test1), test3/norm(test3)))
print(euclidean(test4/norm(test4), test5/norm(test5)))

1.2412670766236366e-16
1.9999999999999998
1.4142135623730951
