In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy

### Use only the first 10,000 teas in the database

In [2]:
# Load the cleaned tea table and keep only its first 10,000 rows.
tea_df = pd.read_csv("scraper/clean_data.csv").iloc[:10000]

In [3]:
# Load the scraped reviews and preview the first five rows.
reviews = pd.read_csv(filepath_or_buffer="./scraper/reviews.csv")
reviews.head(n=5)

Unnamed: 0,id,review_id,note_id,author,author_url,ratingValue,description,likes
0,41171,144009,200638,JustJames,JustJames,100,"<span itemprop=""description""> <p>i love this t...",29
1,41171,145216,204669,Cavocorax,cavocorax,97,"<span itemprop=""description""> <p>Sigh. Another...",21
2,41171,146055,235012,VariaTEA,rachel12610,100,"<span itemprop=""description""> <p>Sipdown!! (17...",22
3,41171,146628,204422,Sil,Silaena,87,"<span itemprop=""description""> <p><strong>justj...",23
4,74834,267504,366415,Frankdang7,Frankdang7,100,"<span itemprop=""description""> <p>First oolong ...",0


In [4]:
# Keep only the reviews whose tea id appears in the truncated tea table.
reviews = reviews.loc[
    reviews["id"].isin(tea_df["id"].unique())
]
reviews.head()

Unnamed: 0,id,review_id,note_id,author,author_url,ratingValue,description,likes
0,41171,144009,200638,JustJames,JustJames,100,"<span itemprop=""description""> <p>i love this t...",29
1,41171,145216,204669,Cavocorax,cavocorax,97,"<span itemprop=""description""> <p>Sigh. Another...",21
2,41171,146055,235012,VariaTEA,rachel12610,100,"<span itemprop=""description""> <p>Sipdown!! (17...",22
3,41171,146628,204422,Sil,Silaena,87,"<span itemprop=""description""> <p><strong>justj...",23
5,41171,146906,243707,Stephanie,paradigmamnesia,96,"<span itemprop=""description""> <p>I should be s...",24


In [5]:
# Only the (tea id, author) pair is needed from here on; drop every other column.
reviews = reviews.loc[:, ["id", "author_url"]]
reviews.shape

(26967, 2)

In [14]:
# Persist the (id, author_url) pairs of the reviews kept for the first 10,000 teas.
# NOTE(review): to_csv writes the row index by default, so the file gets an extra
# unnamed leading column (compare the "Unnamed: 0" column that shows up when
# reviews.csv is read back) — consider index=False if no consumer relies on it.
reviews.to_csv("./reviews_top_10k.csv")

### Given a list of tea ids

In [6]:
# Hard-coded tea ids to rank; they are matched against the `id` column of `reviews`.
# NOTE(review): the name suggests these are teas selected as citrus/floral by an
# upstream step — confirm where this list comes from and how to regenerate it.
citrus_floral_ids = [8, 37, 38, 52, 80, 84, 126, 332, 356, 363, 463, 519, 572, 599, 640, 675, 723, 745, 788, 806, 818,
                     847, 891, 1023, 1076, 1155, 1158, 1173, 1178, 1198, 1314, 1319, 1416, 1438, 1593, 1644, 1706, 1721,
                     1944, 2036, 2315, 2330, 2388, 2398, 2506, 2559, 2570, 2586, 2597, 2606, 2672, 2684, 2780, 2811, 2813,
                     3032, 3075, 3148, 3214, 3266, 3351, 3379, 3562, 3695, 3892, 3969, 4133, 4151, 4217, 4286, 4357, 4384,
                     4539, 4604, 4627, 4647, 4667, 4775, 4789, 4814, 4843, 4851, 4955, 5161, 5195, 5224, 5239, 5242, 5303,
                     5368, 5495, 5514, 5614, 5751, 5843, 5971, 5994, 6072, 6156, 6236, 6257, 6361, 6517, 6585, 6600, 6731,
                     6778, 6822, 6898, 6930, 6964, 6981, 7111, 7183, 7240, 7372, 7529, 7679, 7683, 7702, 7758, 7880, 8015,
                     8143, 8238, 8241, 8318, 8454, 8541, 8548, 8604, 8627, 8637, 8664, 8829, 9108, 9296, 9338, 9360, 9392,
                     9416, 9444, 9537, 9736, 9818, 9853, 9862, 9907, 9934, 9935, 10028, 10055, 10062, 10160, 10170, 10215,
                     10303, 10423, 10424, 10443, 10552, 10615, 10798, 10857, 10863, 11003, 11029, 11257, 11325, 11430,
                     11434, 11489, 11575, 11667, 11824, 11837, 11892, 11899, 11953, 11954, 12102, 12105, 12140, 12192,
                     12193, 12276, 12389, 12561, 12597, 12817, 12839, 12925, 12954, 12966, 13073, 13222, 13326, 13407,
                     13424, 13506, 13518, 13544, 13661, 13683, 13751, 13830, 13845, 13862, 13959, 13961, 13964, 14194,
                     14207, 14306, 14408, 14429, 14431, 14564, 14665, 14683, 14711, 14732, 14761, 15055, 15068, 15107,
                     15180, 15343, 15392, 15408, 15414, 15481, 15503]

### Get reviews for these teas

In [7]:
# Restrict the review pairs to the teas named in citrus_floral_ids.
results = reviews.loc[
    reviews["id"].isin(citrus_floral_ids)
]
results.shape

(125, 2)

### Create graph for these teas and their authors

In [8]:
# Build a directed graph with one edge per review row, pointing from the
# review's author (author_url) to the reviewed tea (id).
# networkx is imported in the notebook's top import cell rather than here, so
# all dependencies are visible in one place.
G = nx.from_pandas_edgelist(
    results,
    source="author_url",
    target="id",
    create_using=nx.DiGraph(),
)

In [9]:
# Draw the author -> tea graph.
# Without an explicit `pos`, draw_networkx computes a new random spring layout
# on every call, so the picture changes between runs. Seeding the layout keeps
# the figure reproducible under Restart & Run All.
fig, ax = plt.subplots(figsize=(12, 5))
ax.set_title("Authors and the teas they reviewed")
pos = nx.spring_layout(G, seed=42)  # fixed seed: any constant works, it only pins the layout
nx.draw_networkx(G, pos=pos, ax=ax, node_size=10, font_size=9, with_labels=True)

### Run the HITS algorithm to rank the teas

In [10]:
# Run HITS on the graph; only `a` (keyed by node) is used further down.
# The generous iteration cap gives the power iteration room to converge.
h, a = nx.hits(G, max_iter=10_000)

In [11]:
# Map each tea id that has at least one selected review to its score in `a`.
top_teas = dict(
    (teaid, a[teaid])
    for teaid in results["id"].unique()
)

In [12]:
# Show the tea id -> score mapping built in the previous cell.
top_teas

{37: 0.252594032977153,
 3351: 0.0005905276545525636,
 3379: 0.34190864021535794,
 4627: 0.15668495015373735,
 4814: 0.08038839525476649,
 9862: 0.13298184489241885,
 11489: 0.025418583299467053,
 12102: 0.00943302555254677,
 13661: 0.0}

In [13]:
# Tea ids ordered from highest to lowest score (ties keep insertion order).
[
    teaid
    for teaid, _score in sorted(
        top_teas.items(), key=lambda pair: pair[1], reverse=True
    )
]

[3379, 37, 4627, 9862, 4814, 11489, 12102, 3351, 13661]