In [1]:
__author__ = 'Congrui_Li'

In [2]:
import numpy as np
import scipy as sp
import pandas as pd

In [3]:
# Display the SciPy version this notebook was run with.
sp.__version__

'1.0.0'

In [4]:
# Load the comma-separated file into a dense float array, skipping the
# first (header) line.
data = np.genfromtxt(
    'Min_USA.csv',
    delimiter=',',
    skip_header=1,
    dtype=float,
)

In [5]:
# Dimensions of the loaded array as (rows, columns).
data.shape

(82866, 2473)

In [6]:
import csv
# Take the column names from the first line of the CSV.
# NOTE(review): binary mode ('rb') is the Python 2 convention for the csv
# module; under Python 3 this would need text mode with newline=''.
with open('Min_USA.csv', 'rb') as header_file:
    header_reader = csv.reader(header_file)
    col_names = next(header_reader)

In [7]:
# Number of header fields read from the first line of the CSV.
len(col_names)

2473

In [9]:
from scipy import sparse

In [10]:
# Compressed-sparse-column version of the dense array, used as input to the
# similarity computations.
data_sparse = sparse.csc_matrix(data)

In [11]:
# efficient Jaccard similarity calculation
def jaccard_similarities(mat):
    """Compute pairwise Jaccard similarities between the columns of ``mat``.

    Each column is treated as a set: row ``r`` belongs to column ``c`` when
    ``mat[r, c]`` is non-zero.  For two columns ``a`` and ``b`` the similarity
    is ``|a & b| / (|a| + |b| - |a & b|)``.

    Parameters
    ----------
    mat : scipy.sparse matrix (or anything ``sparse.csc_matrix`` accepts)
        Rows are observations, columns are the items to compare.  The
        magnitudes of the entries are ignored; only zero / non-zero matters.
        The input is not modified.

    Returns
    -------
    scipy.sparse.csr_matrix
        Square float matrix with one row and one column per input column.
        Only pairs with a non-empty intersection are stored, so an absent
        entry means a similarity of 0 (this includes the diagonal entry of a
        column that is entirely zero).
    """
    # Work on a private float copy holding exactly 1.0 per non-zero cell:
    #  * integer input would make the in-place division below fail or truncate;
    #  * values other than 0/1 would turn the product into a weighted dot
    #    product instead of an intersection count;
    #  * explicitly stored zeros would inflate the column set sizes.
    binary = sparse.csc_matrix(mat, dtype=np.float64, copy=True)
    binary.eliminate_zeros()
    binary.data[:] = 1.0

    # |a|: number of members of every column set.
    cols_sum = binary.getnnz(axis=0)
    # |a & b|: intersection sizes, one stored entry per overlapping pair.
    ab = binary.T.dot(binary).tocsr()

    # Set size of the row item of every stored entry (CSR stores row by row,
    # and consecutive indptr differences are the per-row entry counts).
    aa = np.repeat(cols_sum, np.diff(ab.indptr))
    # Set size of the column item of every stored entry.
    bb = cols_sum[ab.indices]

    similarities = ab.copy()
    # intersection / union, computed in place on the stored entries only.
    similarities.data /= (aa + bb - ab.data)

    return similarities

In [12]:
# Jaccard similarity between every pair of columns of the sparse data.
jaccard_mat = jaccard_similarities(data_sparse)

In [13]:
# Densify the sparse similarity matrix into a plain NumPy array.
jaccard_arr = jaccard_mat.toarray()

In [14]:
# Wrap the array in a pandas DataFrame, labelling both the rows and the
# columns with the names taken from the CSV header.
jaccard_df = pd.DataFrame(jaccard_arr, index=col_names, columns=col_names)

In [15]:
# Dimensions of the labelled similarity table as (rows, columns).
jaccard_df.shape

(2473, 2473)

In [16]:
# Save the full labelled Jaccard similarity table to disk.
jaccard_df.to_csv("jaccard_similarities.csv")

In [17]:
# Header of the top-10 / bottom-10 result files: the mineral itself followed
# by ten (mineral_k, similarity_k) column pairs.
rst_col_names = ["mineral"]

# Ranks are 1-based, so iterate over 1..10 directly instead of shifting the
# loop variable inside the loop body.
for rank in range(1, 11):
    rst_col_names.append("mineral_%d" % rank)
    rst_col_names.append("similarity_%d" % rank)

# Function-call form matches the other print(...) calls in this notebook; it
# prints the same text under Python 2 and also runs under Python 3.
print(rst_col_names)

['mineral', 'mineral_1', 'similarity_1', 'mineral_2', 'similarity_2', 'mineral_3', 'similarity_3', 'mineral_4', 'similarity_4', 'mineral_5', 'similarity_5', 'mineral_6', 'similarity_6', 'mineral_7', 'similarity_7', 'mineral_8', 'similarity_8', 'mineral_9', 'similarity_9', 'mineral_10', 'similarity_10']


In [18]:
# For every mineral, record the 10 minerals with the lowest Jaccard similarity.
with open("jaccard_bottom10.csv", "w") as jb10_csv:
    # csv.writer quotes any name containing a comma or a quote so the columns
    # stay aligned; "\n" keeps the line ending the plain writes used to emit.
    jb10_writer = csv.writer(jb10_csv, lineterminator="\n")
    jb10_writer.writerow(rst_col_names)
    for i in range(jaccard_arr.shape[0]):
        # Column indices ordered from least to most similar to mineral i.
        sorted_row = np.argsort(jaccard_arr[i, :])
        cur_row = [col_names[i]]
        # Slicing (rather than indexing 0..9) also copes with fewer than 10 columns.
        for cur_ind in sorted_row[:10]:
            if cur_ind == i:
                print("warning!!! same mineral in bottom 10!!!")
            cur_row.append(col_names[cur_ind])
            # str() keeps the same number formatting the file had before.
            cur_row.append(str(jaccard_arr[i, cur_ind]))

        jb10_writer.writerow(cur_row)
# No explicit close(): the with-statement has already closed the file.

In [None]:
# For every mineral, record the 10 most similar other minerals (Jaccard).
with open("jaccard_top10.csv", "w") as jt10_csv:
    # csv.writer quotes any name containing a comma or a quote so the columns
    # stay aligned; "\n" keeps the line ending the plain writes used to emit.
    jt10_writer = csv.writer(jt10_csv, lineterminator="\n")
    jt10_writer.writerow(rst_col_names)
    for i in range(jaccard_arr.shape[0]):
        # Column indices ordered from most to least similar to mineral i.
        sorted_row = np.argsort(jaccard_arr[i, :])[::-1]
        # Look at 11 candidates so that 10 remain once the mineral itself
        # (normally ranked first with similarity 1.0) is dropped; if ties keep
        # it out of the first 11, the trailing slice trims the extra candidate.
        # Slicing also copes with fewer than 11 columns.
        neighbours = [cur_ind for cur_ind in sorted_row[:11] if cur_ind != i][:10]
        cur_row = [col_names[i]]
        for cur_ind in neighbours:
            cur_row.append(col_names[cur_ind])
            # str() keeps the same number formatting the file had before.
            cur_row.append(str(jaccard_arr[i, cur_ind]))

        jt10_writer.writerow(cur_row)
# No explicit close(): the with-statement has already closed the file.

In [None]:
import sklearn.preprocessing as pp

In [None]:
# efficient cosine similarity calculation
def cosine_similarities(mat):
    """Return the pairwise cosine similarities between the columns of ``mat``.

    The matrix is converted to CSC, its columns are rescaled with
    ``pp.normalize(..., axis=0)``, and the result is the product of the
    transposed normalised matrix with the normalised matrix itself.
    """
    unit_cols = pp.normalize(mat.tocsc(), axis=0)
    gram = unit_cols.T * unit_cols
    return gram

In [None]:
# Cosine similarity between every pair of columns of the sparse data.
cosine_mat = cosine_similarities(data_sparse)

In [None]:
# Densify the sparse similarity matrix into a plain NumPy array.
cosine_arr = cosine_mat.toarray()

In [None]:
# Wrap the array in a pandas DataFrame, labelling both the rows and the
# columns with the names taken from the CSV header.
cosine_df = pd.DataFrame(cosine_arr, index=col_names, columns=col_names)

In [None]:
# Dimensions of the labelled similarity table as (rows, columns).
cosine_df.shape

In [None]:
# Save the full labelled cosine similarity table to disk.
cosine_df.to_csv("cosine_similarities.csv")

In [None]:
# For every mineral, record the 10 minerals with the lowest cosine similarity.
with open("cosine_bottom10.csv", "w") as cb10_csv:
    # csv.writer quotes any name containing a comma or a quote so the columns
    # stay aligned; "\n" keeps the line ending the plain writes used to emit.
    cb10_writer = csv.writer(cb10_csv, lineterminator="\n")
    cb10_writer.writerow(rst_col_names)
    for i in range(cosine_arr.shape[0]):
        # Column indices ordered from least to most similar to mineral i.
        sorted_row = np.argsort(cosine_arr[i, :])
        cur_row = [col_names[i]]
        # Slicing (rather than indexing 0..9) also copes with fewer than 10 columns.
        for cur_ind in sorted_row[:10]:
            if cur_ind == i:
                print("warning!!! same mineral in bottom 10!!!")
            cur_row.append(col_names[cur_ind])
            # str() keeps the same number formatting the file had before.
            cur_row.append(str(cosine_arr[i, cur_ind]))

        cb10_writer.writerow(cur_row)
# No explicit close(): the with-statement has already closed the file.

In [None]:
# For every mineral, record the 10 most similar other minerals (cosine).
with open("cosine_top10.csv", "w") as ct10_csv:
    # csv.writer quotes any name containing a comma or a quote so the columns
    # stay aligned; "\n" keeps the line ending the plain writes used to emit.
    ct10_writer = csv.writer(ct10_csv, lineterminator="\n")
    ct10_writer.writerow(rst_col_names)
    for i in range(cosine_arr.shape[0]):
        # Column indices ordered from most to least similar to mineral i.
        sorted_row = np.argsort(cosine_arr[i, :])[::-1]
        # Look at 11 candidates so that 10 remain once the mineral itself
        # (normally ranked first with similarity 1.0) is dropped; if ties keep
        # it out of the first 11, the trailing slice trims the extra candidate.
        # Slicing also copes with fewer than 11 columns.
        neighbours = [cur_ind for cur_ind in sorted_row[:11] if cur_ind != i][:10]
        cur_row = [col_names[i]]
        for cur_ind in neighbours:
            cur_row.append(col_names[cur_ind])
            # str() keeps the same number formatting the file had before.
            cur_row.append(str(cosine_arr[i, cur_ind]))

        ct10_writer.writerow(cur_row)
# No explicit close(): the with-statement has already closed the file.