## Offline part

check to see if the compressed tdm is available, then jump over the calculation
otherwise read from the URL with the zip file and build it

In [1]:
import pandas as pd
import numpy as np

# Read files from zip
arc_path = "/Users/dima/Google Drive/CUNY_MSDA/Data_602_Python/Final_Project/data/bbc-fulltext.zip"

def read_corpus_from_zip(archive_path):
    '''
    WRITE DOCSTRING
    '''
    
    import zipfile
    import os
    
    with zipfile.ZipFile(archive_path) as z:

        docs = []
        terms = []
        # need to read only .txt files but not those hidden ones created by the MacOS

        for filename in [f for f in z.namelist() if (not f.startswith('_'))& f.endswith('.txt')]:
            with z.open(filename) as f:
                docs.append(filename)
                words = [word for line in f for word in line.split()]
                terms.append(words)
    
    print 'Created a corpus with {} documents'.format(len(docs))
    return {'docs' : docs,'terms' : terms}

corpus = read_corpus_from_zip(arc_path)



Created a corpus with 2225 documents


[['Ad',
  'sales',
  'boost',
  'Time',
  'Warner',
  'profit',
  'Quarterly',
  'profits',
  'at',
  'US',
  'media',
  'giant',
  'TimeWarner',
  'jumped',
  '76%',
  'to',
  '$1.13bn',
  '(\xc2\xa3600m)',
  'for',
  'the',
  'three',
  'months',
  'to',
  'December,',
  'from',
  '$639m',
  'year-earlier.',
  'The',
  'firm,',
  'which',
  'is',
  'now',
  'one',
  'of',
  'the',
  'biggest',
  'investors',
  'in',
  'Google,',
  'benefited',
  'from',
  'sales',
  'of',
  'high-speed',
  'internet',
  'connections',
  'and',
  'higher',
  'advert',
  'sales.',
  'TimeWarner',
  'said',
  'fourth',
  'quarter',
  'sales',
  'rose',
  '2%',
  'to',
  '$11.1bn',
  'from',
  '$10.9bn.',
  'Its',
  'profits',
  'were',
  'buoyed',
  'by',
  'one-off',
  'gains',
  'which',
  'offset',
  'a',
  'profit',
  'dip',
  'at',
  'Warner',
  'Bros,',
  'and',
  'less',
  'users',
  'for',
  'AOL.',
  'Time',
  'Warner',
  'said',
  'on',
  'Friday',
  'that',
  'it',
  'now',
  'owns',
  '8%',


In [2]:
# Process terms: Remove punctuation & numbers, lowercase, remove articles and auxillary verbs


def clean_strings(string_list, 
                  stopwords = ['is','are','was','were','would','has','have','shall',
                               'a','an','the','in','on','at','for','of','with','by','to','as',
                              'and','or','s','th']):
    '''
    WRITE DOCSTRING
    '''
    
    import string
    string_list = [word.lower() for word in string_list] # lowercase
    string_list = [''.join(c for c in word if c in string.ascii_lowercase) for word in string_list] # keep letters only
    string_list = [word for word in string_list if word] # drop empty records
    string_list = [word for word in string_list if word not in stopwords] # remove stopwords
    return string_list


corpus['terms'] = [clean_strings(item) for item in corpus['terms']]


In [8]:
# Count each term per document and create a pandas dataframe with the term frequencies

def count_tf(term_list,doc_name, cutoff = 1):
    '''
    WRITE DOCSTRING
    Counts a simple term frequency value for each unique term in a document
    '''
    
    from scipy.stats import itemfreq
    freqs = itemfreq(term_list)
    df = pd.DataFrame(freqs)
    df.columns = ['term','freq']
    #df.index = df['term']
    #df.drop('term', axis=1, inplace=True)
    df['freq'] = df['freq'].astype(int)
    df = df[df['freq']>cutoff]
    df['freq'] = df['freq']/df['freq'].sum()
    
    return {'doc_name':doc_name, 'freqs':df}

def create_tdm(corpus,count_cutoff = 1):
    '''
    WRITE DOCSTRING
    '''
    
    import progressbar
    
    tf_list = []
    docs_list = []
    print "Counting distinct words per document"
    
    bar = progressbar.ProgressBar(maxval=len(corpus['docs']), 
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                           progressbar.Percentage()])
    
    bar.start()
    for i in range(0,len(corpus['docs'])):
        tf_dict = count_tf(corpus['terms'][i],corpus['docs'][i],cutoff = count_cutoff)
        tf = tf_dict['freqs']
        #tf = tf[tf['freq']>cutoff]
        tf_list.append(tf)
        docs_list.append(tf_dict['doc_name'])
        bar.update(i+1)
    bar.finish()
    
    print "Merging into tdm, this may take a while"
    #tdm = pd.concat(tf_list)
    tdm = reduce(lambda left,right: pd.merge(left,right,on='term',how='outer'), tf_list)
    tdm = tdm.fillna(0)
    tdm.columns = ['term'] + docs_list
    print 'Constructed tdm, {} unique words from {} documents'.format(tdm.shape[0],tdm.shape[1])
    return tdm
    
tdm = create_tdm(corpus,2)

Counting distinct words per document




Merging into tdm, this may take a while
Constructed tdm, 5997 unique words from 2226 documents


In [9]:
tdm.head(10)

Unnamed: 0,term,bbc/business/001.txt,bbc/business/002.txt,bbc/business/003.txt,bbc/business/004.txt,bbc/business/005.txt,bbc/business/006.txt,bbc/business/007.txt,bbc/business/008.txt,bbc/business/009.txt,...,bbc/tech/392.txt,bbc/tech/393.txt,bbc/tech/394.txt,bbc/tech/395.txt,bbc/tech/396.txt,bbc/tech/397.txt,bbc/tech/398.txt,bbc/tech/399.txt,bbc/tech/400.txt,bbc/tech/401.txt
0,aol,0.05102,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bn,0.05102,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,fourth,0.030612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,from,0.040816,0.0,0.0,0.045977,0.0,0.0,0.0,0.0,0.0,...,0.050633,0.0,0.0,0.015873,0.0,0.037736,0.054545,0.009868,0.0,0.006574
4,internet,0.040816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,it,0.112245,0.0,0.058824,0.08046,0.083333,0.0,0.065217,0.0,0.0,...,0.050633,0.0,0.079167,0.05291,0.010526,0.028302,0.0,0.055921,0.0,0.024836
6,its,0.061224,0.0,0.098039,0.034483,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0125,0.042328,0.017544,0.0,0.0,0.0,0.0,0.002922
7,m,0.05102,0.0,0.058824,0.08046,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,now,0.030612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.028302,0.0,0.0,0.0,0.002191
9,our,0.030612,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002191


In [7]:
# Store the TDM
#tdm.to_csv('/Users/dima/Google Drive/CUNY_MSDA/Data_602_Python/Final_Project/tdm.gz', compression='gzip')


In [10]:
def tf_idf(tdm):
    '''
    WRITE DOCSTRING
    ADD ERRORS
    '''
    n_docs = tdm.shape[1]-1
    non_zero_count = n_docs - (tdm == 0).sum(axis=1)
    idf = np.log(n_docs/non_zero_count)
    
    tdm.iloc[:,1:(n_docs+1)] = tdm.iloc[:,1:(n_docs+1)].mul(idf, axis=0)
    
    return tdm

tdm = tf_idf(tdm)
tdm.head()

Unnamed: 0,term,bbc/business/001.txt,bbc/business/002.txt,bbc/business/003.txt,bbc/business/004.txt,bbc/business/005.txt,bbc/business/006.txt,bbc/business/007.txt,bbc/business/008.txt,bbc/business/009.txt,...,bbc/tech/392.txt,bbc/tech/393.txt,bbc/tech/394.txt,bbc/tech/395.txt,bbc/tech/396.txt,bbc/tech/397.txt,bbc/tech/398.txt,bbc/tech/399.txt,bbc/tech/400.txt,bbc/tech/401.txt
0,aol,0.357876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bn,0.139679,0.0,0.0,0.0,0.342212,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,fourth,0.165457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,from,0.062347,0.0,0.0,0.07023,0.0,0.0,0.0,0.0,0.0,...,0.077342,0.0,0.0,0.024246,0.0,0.057641,0.083318,0.015074,0.0,0.010042
4,internet,0.167208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.051207,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the matrix of term correlations A*A^t

In [12]:
# Calculate the matrix of correlations A * AT

def cor_matrix(data_frame) :
    '''
    Calculates a correlation matrix by multiplying the input matrix with the transposed version
    ADD ERRORS
    ADD check to drop string columns
    '''
    tdm.drop('term',axis=1,inplace=True)
    inp_mat = data_frame.as_matrix()
    out_mat = np.dot(inp_mat, inp_mat.transpose())
    return out_mat

def 

TypeError: can't multiply sequence by non-int of type 'str'

In [21]:
#tdm.index = tdm['term']
tdm.drop('term',axis=1,inplace=True)
inp_mat = tdm.as_matrix()

#cm = cor_matrix(tdm)

In [23]:
out_mat = np.dot(inp_mat, inp_mat.transpose())

In [33]:
# Calculate SVD of the resulting matrix

from scipy import linalg

U, s, Vt = linalg.svd(out_mat, full_matrices = True)

In [34]:
U.shape, s.shape, Vt.shape

((5996, 5996), (5996,), (5996, 5996))

In [35]:
s

array([  5.94057442e+01,   5.94057442e+01,   5.94057442e+01, ...,
         1.52821743e-16,   1.26306115e-16,   1.16014433e-16])

In [46]:
# Subset top k eigenvalues to get a reduced SVD form

def trunc_svd(matrix,k):
    
    from scipy import linalg
    # Calculate SVD
    u, sigma, vt = linalg.svd(matrix, full_matrices = True)
    
    # Reduce to k largest eigenvalues
    u_k = u[:, :k]
    sigma_k = sigma[:k]
    vt_k = vt[:k,:]
    
    return u_k,sigma_k,vt_k

In [47]:
Uk,Sk,Vtk = trunc_svd(out_mat, 100)

In [49]:
Sk.shape

(100,)

In [54]:
# Calculate a matrix of terms as columns in the semantic space of k topics
Yk = np.dot(np.diag(Sk),Uk.transpose())

In [66]:
def cosine_dist(v1,v2):
    '''
    Calculates the cosine distance between two vectors
    '''
    out = np.dot(v1,v2)/(linalg.norm(v1)*linalg.norm(v2))
    return out

cosine_dist(Yk[:,1], Yk[:,4])

0.15257610055111454

KeyError: 'term'

NameError: name 'DataFrame' is not defined

In [None]:
b = pd.DataFrame(Yk)
b.columns = tdm['term']



In [None]:
import plotly.plotly as py
py.iplot(, filename='jupyter/table1')