In [105]:
from postgres import PostGresClient
import pandas as pd
import pandas.io.sql as psql
import numpy as np
import time
from sklearn.metrics.pairwise import cosine_similarity


In [106]:
#loading in data
df = PostGresClient().load_table('flavor_freq')
    



In [107]:
df.head()
print(df.shape[0])

1663243


In [80]:
#create a term freq matrix 

def create_tf(df):

    strt = time.time()
    #create the df where rows are wine_id and columns are keywords
    words = df.keyword_name.unique()
    wines = df.wine_id.unique()

    #creating total keywords per wine
    num_words_df = df[['wine_id', 'keyword_count']].groupby('wine_id').sum('keyword_count')

    tf = pd.DataFrame(0, columns=words, index = wines)
    for i in range(df.shape[0]):
        #finding index of wine_id  getting keyword and count
        idx_num = tf.index.get_loc(df.wine_id[i])
        kywrd = df.keyword_name[i]
        cnt = df.keyword_count[i]
        num_wrds = num_words_df[num_words_df.index == df.wine_id[i]].keyword_count #getting the total keywords

        #updating record in table
        tf[kywrd].iloc[idx_num] = cnt / num_wrds

    end = time.time()
    print(f"----- Processing took {end - strt} seconds to complete -----")
    return tf





In [81]:
tf = create_tf(df)
tf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tf[kywrd].iloc[idx_num] = cnt / num_wrds


----- Processing took 514.4875190258026 seconds to complete -----


Unnamed: 0,oak,tobacco,vanilla,chocolate,coffee,clove,butter,nutmeg,cigar,dark chocolate,...,incense smoke,dried chili pepper,mushroom broth,wood ear,dried red chili,gingersnap,chayote squash,peppercress,lavage,true cinnamon
51644,0.169082,0.086957,0.057971,0.024155,0.014493,0.009662,0.004831,0.004831,0.004831,0.004831,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128181,0.059925,0.018727,0.026217,0.007491,0.003745,0.007491,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6417379,0.069767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73249,0.064257,0.012048,0.032129,0.006693,0.005355,0.006693,0.005355,0.002677,0.002677,0.002677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128885,0.080808,0.0,0.070707,0.020202,0.0,0.0,0.0,0.010101,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
#saving tf matrix to csv as it takes roughly 10 minutes to create
tf.to_csv('tf.csv')

In [109]:
tf = pd.read_csv('tf.csv', index_col= 0)
tf.head()


Unnamed: 0,oak,tobacco,vanilla,chocolate,coffee,clove,butter,nutmeg,cigar,dark chocolate,...,incense smoke,dried chili pepper,mushroom broth,wood ear,dried red chili,gingersnap,chayote squash,peppercress,lavage,true cinnamon
51644,0.169082,0.086957,0.057971,0.024155,0.014493,0.009662,0.004831,0.004831,0.004831,0.004831,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128181,0.059925,0.018727,0.026217,0.007491,0.003745,0.007491,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6417379,0.069767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73249,0.064257,0.012048,0.032129,0.006693,0.005355,0.006693,0.005355,0.002677,0.002677,0.002677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128885,0.080808,0.0,0.070707,0.020202,0.0,0.0,0.0,0.010101,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
#create idf matrix

def create_idf(df):
    #getting number of wines
    n = len(df.wine_id.unique()) + 1
    idf =  np.log(n / (df.keyword_name.value_counts() + 1))

    return idf


In [111]:
idf = create_idf(df)
idf.head()


oak         0.158418
vanilla     0.214867
minerals    0.281567
earthy      0.296888
cherry      0.344724
Name: keyword_name, dtype: float64

In [112]:
def create_tfidf(tf, idf):
    #TODO: make this a wrapper for tf and idf
    tfidf = pd.DataFrame(0, columns = tf.columns, index = tf.index)
    for kywrd in tf.columns:
        tfidf[kywrd] = tf[kywrd] * idf[idf.index.get_loc(kywrd)]       

    return tfidf
   
    

In [113]:
tfidf = create_tfidf(tf, idf)
tfidf

Unnamed: 0,oak,tobacco,vanilla,chocolate,coffee,clove,butter,nutmeg,cigar,dark chocolate,...,incense smoke,dried chili pepper,mushroom broth,wood ear,dried red chili,gingersnap,chayote squash,peppercress,lavage,true cinnamon
51644,0.026786,0.050504,0.012456,0.013195,0.013007,0.008036,0.002953,0.005663,0.006772,0.004839,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128181,0.009493,0.010876,0.005633,0.004092,0.003361,0.006230,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6417379,0.011052,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
73249,0.010179,0.006998,0.006903,0.003656,0.004806,0.005567,0.003273,0.003138,0.003753,0.002682,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1128885,0.012801,0.000000,0.015193,0.011036,0.000000,0.000000,0.000000,0.011840,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8786790,0.000000,0.024715,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8025656,0.013482,0.000000,0.000000,0.011623,0.000000,0.017696,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2302584,0.010237,0.018766,0.006248,0.005295,0.004350,0.001344,0.000000,0.000000,0.002265,0.006473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15262,0.015331,0.006245,0.003466,0.011748,0.009650,0.000000,0.003286,0.000000,0.000000,0.005386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
cs = pd.DataFrame(cosine_similarity(tfidf, dense_output=True))

In [115]:
cs.index = tfidf.index
cs.columns = tfidf.index
cs

Unnamed: 0,51644,1128181,6417379,73249,1128885,3563883,1084371,779176,779175,4835041,...,2385130,1134453,2562687,1224965,2542577,8786790,8025656,2302584,15262,1506079
51644,1.000000,0.453931,0.253777,0.376446,0.335096,0.071400,0.384933,0.267226,0.241231,0.035974,...,0.282608,0.516596,0.330097,0.343006,0.011260,0.201648,0.186145,0.533851,0.337019,0.274093
1128181,0.453931,1.000000,0.431530,0.756576,0.640563,0.058748,0.674398,0.504231,0.537224,0.054005,...,0.166251,0.628765,0.533982,0.446822,0.038131,0.449636,0.371913,0.653716,0.555082,0.382344
6417379,0.253777,0.431530,1.000000,0.575664,0.437202,0.052397,0.401979,0.404022,0.470401,0.076793,...,0.222870,0.496199,0.493268,0.312466,0.141049,0.336451,0.294164,0.485971,0.426813,0.458872
73249,0.376446,0.756576,0.575664,1.000000,0.622998,0.091967,0.679018,0.606668,0.700385,0.051559,...,0.126742,0.683121,0.649440,0.420975,0.044486,0.461714,0.450559,0.724395,0.636603,0.378262
1128885,0.335096,0.640563,0.437202,0.622998,1.000000,0.079906,0.564368,0.439210,0.524535,0.027731,...,0.192742,0.561335,0.498998,0.342563,0.028323,0.295882,0.338015,0.610212,0.519960,0.363667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8786790,0.201648,0.449636,0.336451,0.461714,0.295882,0.038543,0.366205,0.330587,0.310306,0.023996,...,0.105660,0.365931,0.296940,0.261502,0.055122,1.000000,0.172964,0.345889,0.256663,0.138210
8025656,0.186145,0.371913,0.294164,0.450559,0.338015,0.041739,0.329125,0.293833,0.376994,0.071109,...,0.042518,0.363892,0.387735,0.179351,0.030051,0.172964,1.000000,0.397053,0.427260,0.298875
2302584,0.533851,0.653716,0.485971,0.724395,0.610212,0.049613,0.660956,0.487533,0.621882,0.040474,...,0.174547,0.814423,0.603236,0.496691,0.036078,0.345889,0.397053,1.000000,0.683631,0.535505
15262,0.337019,0.555082,0.426813,0.636603,0.519960,0.079046,0.479265,0.404538,0.659427,0.041342,...,0.121800,0.671526,0.569322,0.340979,0.017924,0.256663,0.427260,0.683631,1.000000,0.487468


In [120]:
cs[1513319].sort_values(ascending = False)

1513319    1.000000
1297856    0.634110
3984599    0.624566
17530      0.621993
1316060    0.616782
             ...   
6477445    0.000000
3355487    0.000000
5902918    0.000000
2122236    0.000000
9977724    0.000000
Name: 1513319, Length: 27281, dtype: float64