# Document Term Matrix preparation

In [24]:
import pandas as pd
import numpy as np

In [25]:
# Base directory: the input pickle is read from beneath it, and the gen*Matrix
# helpers use it as their default output directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
save_load_path = '/home/eolus/Dropbox/MA755 Public/pynotes/Danny-Eole-Yuchen/Pickles'

## Functions

### Tf weighting

In [26]:
def genTfMatrix(dtm_matrix, filename, save_load_path=save_load_path):
    """Save a term-frequency document-term matrix to disk as a NumPy array.

    The values are written unchanged (raw term-frequency weighting).

    Parameters
    ----------
    dtm_matrix : pandas.DataFrame or array-like
        Document-term matrix to save.
    filename : str
        Name of the output file inside ``save_load_path``; ``np.save``
        appends the ``.npy`` extension when it is missing.
    save_load_path : str, optional
        Directory the file is written to. Defaults to the notebook-level
        ``save_load_path``.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    # DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0.
    # np.asarray() returns the same underlying values on old and new pandas
    # alike, and also accepts plain arrays.
    dtm_matrix_tf_np = np.asarray(dtm_matrix)

    fullpath = "{save_load_path}/{filename}".format(save_load_path=save_load_path, filename=filename)
    np.save(fullpath, dtm_matrix_tf_np)

### Binary weighting

In [34]:
def genBinaryMatrix(dtm_matrix, filename, save_load_path=save_load_path):
    """Save a binary-weighted copy of a document-term matrix as a NumPy array.

    Every entry greater than 1 is replaced by 1; all other entries are kept
    as they are. The input object itself is left untouched.

    Parameters
    ----------
    dtm_matrix : pandas.DataFrame or array-like
        Document-term matrix of term counts.
    filename : str
        Name of the output file inside ``save_load_path``; ``np.save``
        appends the ``.npy`` extension when it is missing.
    save_load_path : str, optional
        Directory the file is written to. Defaults to the notebook-level
        ``save_load_path``.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    # np.array() always makes a copy, so the clipping below cannot leak back
    # into the caller's DataFrame (the previous in-place assignment binarized
    # the shared frame, corrupting any weighting computed from it afterwards).
    # It also replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    dtm_matrix_binary_np = np.array(dtm_matrix)
    dtm_matrix_binary_np[dtm_matrix_binary_np > 1] = 1

    fullpath = "{save_load_path}/{filename}".format(save_load_path=save_load_path, filename=filename)
    np.save(fullpath, dtm_matrix_binary_np)

### Tf-Idf weighting

In [35]:
from sklearn.feature_extraction.text import TfidfTransformer

In [36]:
def genTfidfMatrix(dtm_matrix, filename, save_load_path=save_load_path):
    """Apply tf-idf weighting to a document-term matrix and save the result.

    Parameters
    ----------
    dtm_matrix : pandas.DataFrame or array-like
        Document-term matrix of term counts.
    filename : str
        Name of the output file inside ``save_load_path``.
    save_load_path : str, optional
        Directory the file is written to. Defaults to the notebook-level
        ``save_load_path``.

    Returns
    -------
    None
        The function is called for its side effect of writing the file.
    """
    # norm="l2" requests L2 (Euclidean) normalisation of the weighted rows.
    weighter = TfidfTransformer(norm="l2")

    # fit() learns the idf weight of each term; transform() then rescales the
    # term-frequency matrix into a tf-idf matrix using those weights.
    sparse_tfidf = weighter.fit(dtm_matrix).transform(dtm_matrix)

    # transform() hands back a sparse matrix; expand it to a dense one so it
    # can be stored with np.save.
    dense_tfidf = sparse_tfidf.todense()

    target = "{save_load_path}/{filename}".format(filename=filename, save_load_path=save_load_path)
    np.save(target, dense_tfidf)

# Conversion factory

In [54]:
# Directory that receives the weighted matrices. Built from save_load_path so
# the base directory is defined in one place only; the resulting string is the
# same as the previously hard-coded path.
output_path = save_load_path + '/Clustering/Data source'

In [55]:
# Input data: the pickled document-term DataFrame that each weighting below
# starts from. read_pickle should only be used on files from a trusted source.
dtm_matrix_df = pd.read_pickle(
    "".join((save_load_path, '/DTM/lyrics_english_thre3_df.pkl'))
)

### Tf weighting

In [56]:
# Write the raw term-frequency matrix into output_path.
OutputName = 'dtm_tf_thre3'
genTfMatrix(dtm_matrix=dtm_matrix_df, filename=OutputName, save_load_path=output_path)

### Binary weighting

In [57]:
# Write the binary-weighted matrix (entries above 1 set to 1) into output_path.
OutputName = 'dtm_binary_thre3'
genBinaryMatrix(dtm_matrix=dtm_matrix_df, filename=OutputName, save_load_path=output_path)

### Tf-Idf weighting

In [58]:
# Write the tf-idf weighted matrix into output_path.
# NOTE(review): this expects raw term counts in dtm_matrix_df - confirm that no
# earlier cell has modified the frame in place before this call runs.
OutputName = 'dtm_tfidf_thre3'
genTfidfMatrix(dtm_matrix=dtm_matrix_df, filename=OutputName, save_load_path=output_path)

# QA

In [53]:
# QA: reload the binary-weighted matrix written by the conversion section.
# The file name matches the 'dtm_binary_thre3' output produced above (the
# previous 'thre2' name pointed at a file this notebook does not generate).
test_binary_output = np.load(output_path+'/dtm_binary_thre3.npy')
test_binary_output

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [51]:
# QA: reload the term-frequency matrix written by the conversion section.
# The file name matches the 'dtm_tf_thre3' output produced above (the
# previous 'thre2' name pointed at a file this notebook does not generate).
test_tf_output = np.load(output_path+'/dtm_tf_thre3.npy')
test_tf_output

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [52]:
# QA: reload the tf-idf matrix written by the conversion section.
# The file name matches the 'dtm_tfidf_thre3' output produced above (the
# previous 'thre2' name pointed at a file this notebook does not generate).
test_tfidf_output = np.load(output_path+'/dtm_tfidf_thre3.npy')
test_tfidf_output

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])