In [1]:
import numpy as np
np.set_printoptions(linewidth=200)
import pandas as pd
pd.set_option('display.width', 250)

from sklearn.feature_extraction.text import CountVectorizer
import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from tqdm import tqdm
import itertools
from sklearn.feature_extraction.text import TfidfTransformer

import pickle

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davenovelli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# read json into a dataframe
df_idf = pd.read_csv('./data/text_df.csv')

# print schema
print("Schema:\n", df_idf.dtypes)
print("\nNumber of articles, columns=", df_idf.shape)

Schema:
 article_id       int64
publisher       object
url             object
createdAt       object
title           object
subtitle        object
text_content    object
dtype: object

Number of articles, columns= (58971, 7)


In [3]:
df_idf.publisher.unique()

array(['surfline.com', 'theinertia', 'surfer.com', 'magicseaweed.com', 'surfd.com', 'John John Florence', 'Kai Lenny', 'The Surfers Journal', 'Nathan Florence', 'Koa Rothman', 'FatSand Films',
       'SURFER', 'World Surf League', 'Los Angeles Times', 'itsnicethat.com', 'DAN PALERMO', 'Nitya Nair', 'swellinfo', 'stabmag'], dtype=object)

In [45]:
# ARTICLE_COUNT = 500

def pre_process(text):    
#     print(text)
    # Manual replacements to avoid wiping out certain keywords
    text = text.replace('J.O.B', 'JOB')
    
    # lowercase
    text = text.lower()
    
    #remove tags
    text = re.sub("", "", text)
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    
    return text

df_idf = (
    df_idf
    .query('text_content.notna() & text_content.str.len() > 1000')
    .assign(text=lambda df: df.title.fillna('') + ' ' + df.subtitle.fillna('') + ' ' + df.text_content)
    .assign(text=lambda df: df.text.str.strip().apply(pre_process))
)

df_idf

Unnamed: 0,article_id,publisher,url,createdAt,title,subtitle,text_content,text
0,63577,surfline.com,https://www.surfline.com/surf-news/j-o-b-soft-...,2021-05-11 23:23:43,J.O.B. Has a Soft Side,"It’s called the Red Bull Foam Wreckers, and it...",Few people have so deftly navigated every surf...,job has a soft side it s called the red bull f...
1,63575,surfline.com,https://www.surfline.com/surf-news/wave-day-ma...,2021-05-11 23:23:42,"Wave of the Day: Matahi Drollett, Teahupoo, Ma...",Tahitian local sticks late drop and gets spit ...,Vid: Tim McKenna. “Although Tahiti scored a de...,wave of the day matahi drollett teahupoo may t...
2,63576,surfline.com,https://www.surfline.com/surf-news/two-new-edi...,2021-05-11 23:23:42,"Two New Edits: Morocco is a Dredging, Right Tu...",Ramzi Boukhiam and Natxo Gonzalez just dropped...,Vid: Mustaphacola and Nadir LM. From a surfing...,two new edits morocco is a dredging right tube...
3,63574,theinertia,https://www.theinertia.com/surf/14-year-old-su...,2021-05-11 22:14:38,14-Year-Old Surfer Bitten by Shark Has Attitud...,,Parker Blanchette was bitten by a shark at Nor...,year old surfer bitten by shark has attitude ...
4,63573,theinertia,https://www.theinertia.com/surf/pro-surfer-los...,2021-05-11 22:14:31,Pro Surfer Loses Sponsor After Session Without...,,"Without his videographer, how would timeless l...",pro surfer loses sponsor after session without...
...,...,...,...,...,...,...,...,...
58966,23560,theinertia,https://www.theinertia.com/travel/surfing-cent...,2010-12-13 07:23:41,In Search of Libertad,,Two surfers enjoy (and contribute to) Nicaragu...,in search of libertad two surfers enjoy and co...
58967,23561,theinertia,https://www.theinertia.com/travel/scotland-isl...,2010-12-09 07:23:41,Whisky Wins,,By the time I reached the thin beach trail the...,whisky wins by the time i reached the thin bea...
58968,23562,theinertia,https://www.theinertia.com/travel/inhotim-a-mu...,2010-11-17 07:23:41,Inhotim: A Museum Unlike Any Other,,The granite stone that looks like a perfect pe...,inhotim a museum unlike any other the granite ...
58969,23563,theinertia,https://www.theinertia.com/travel/under-a-fata...,2010-11-13 07:24:03,"Under a Fatal Sun, Pt. 6: New South Wales",,"The Morning of The Earth farmhouse, circa 2000...",under a fatal sun pt new south wales the morni...


In [5]:
def get_stop_words():
    """ Load stop words from NLTK and include any manually assigned words to skip """
    additional_stopwords = ['surfing']
    
    return frozenset(stopwords.words('english') + additional_stopwords)

# load a set of stop words
stopword_set = get_stop_words()

# get the text column 
docs = df_idf['text'].tolist()

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    
    return results

def process_results(df_idf, cv, tfidf_transformer, max_df, min_df, max_features):
    # you only needs to do this once, this is a mapping of index to 
    feature_names = cv.get_feature_names()
    
    frames = []
    for i in range(df_idf.shape[0]):
        # get the article_id
        article_id = df_idf.article_id.iloc[-i]
        
        # get the url
        url = df_idf.url.iloc[-i]

        # get the document that we want to extract keywords from
        doc = df_idf.text.iloc[-i]

        #generate tf-idf for the given document
        tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

        #sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())

        #extract only the top n; n here is 20
        keywords = extract_topn_from_vector(feature_names, sorted_items, 20)
#         print(keywords)
        
        keywords_df = (
            pd.DataFrame(keywords.items(), columns=['keyword','tfidf_weight'])
            .assign(article_id=article_id)
            .assign(url=url)
            .assign(max_df=max_df)
            .assign(min_df=min_df)
            .assign(max_features=max_features)
            .assign(rank=lambda df: df.groupby(['url']).cumcount() + 1)
        )
#         print(keywords_df)
        
        frames += [keywords_df]
    
    df = pd.concat(frames, ignore_index=True)
#     print(df)
    
    return df

In [6]:
# max_values = [.7, .8, .9]  #[.7, .75, .8, .85, .9]
# min_values = [.1, .05, .01, .005, .001]
# max_features = [8000, 13000, 21000]
# test_articles = df_idf.groupby('publisher').head(3).reset_index()

# params = list(itertools.product(*[max_values, min_values, max_features]))
# print(f"len(params): {len(params)}")

# params_frames = []
# for max_df, min_df, features in tqdm(params):
# #     print(f"processing options: ({max_df}{min_df}{features})")
    
#     # create a vocabulary of words, 
#     # ignore words that appear in X% of documents, 
#     # eliminate stop words
#     cv = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stopword_set, ngram_range=(1, 3), max_features=features)
#     word_count_vector = cv.fit_transform(docs)
    
#     tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
#     tfidf_transformer.fit(word_count_vector)

#     results = process_results(test_articles, cv, tfidf_transformer, max_df, min_df, features)
#     params_frames += [results]

# params_df = (
#     pd.concat(params_frames, ignore_index=True)
#     .assign(params=lambda df: df.max_features.astype(str) + '_' + df.min_df.astype(str) + '_' + df.max_df.astype(str))
# )
# params_df.to_csv('paramsdf.csv', index=False)
# print(params_df)

  0%|          | 0/45 [00:00<?, ?it/s]

len(params): 45


100%|██████████| 45/45 [1:48:34<00:00, 144.78s/it]


                keyword  tfidf_weight  article_id                                                url  max_df  min_df  max_features  rank           params
0                   fun         0.420       63577  https://www.surfline.com/surf-news/j-o-b-soft-...     0.7   0.100          8000     1     8000_0.1_0.7
1                people         0.255       63577  https://www.surfline.com/surf-news/j-o-b-soft-...     0.7   0.100          8000     2     8000_0.1_0.7
2                  surf         0.237       63577  https://www.surfline.com/surf-news/j-o-b-soft-...     0.7   0.100          8000     3     8000_0.1_0.7
3                 event         0.227       63577  https://www.surfline.com/surf-news/j-o-b-soft-...     0.7   0.100          8000     4     8000_0.1_0.7
4               contest         0.225       63577  https://www.surfline.com/surf-news/j-o-b-soft-...     0.7   0.100          8000     5     8000_0.1_0.7
...                 ...           ...         ...                           

In [7]:
params_df = pd.read_csv('paramsdf.csv')

# Convert to a format amenable to manual scoring of keywords

### Each block of 20 rows is from the same article - 10 total articles

In [8]:
from functools import reduce

partials = []
for params in params_df.sort_values(['max_features', 'max_df', 'min_df']).params.unique():
    currparams = (
        params_df
        .query('params == @params')
        .filter(items=['url', 'rank', 'params', 'keyword', 'tfidf_weight'])
        .rename(columns={'keyword': f'keyword__{params}', 'tfidf_weight': f'tfidf_weight__{params}'})
        .drop('params', axis=1)
    )
#     print(currparams)
    
    partials += [currparams]

results = (
    reduce(lambda x, y: pd.merge(x, y, how='outer', on=['url', 'rank']), partials)
    .sort_values(['url', 'rank'])
    .set_index(['url', 'rank'])
)
results.columns = pd.MultiIndex.from_tuples([tuple(reversed(c.split("__"))) for c in results.columns])
results.to_csv('results.csv')
results

Unnamed: 0_level_0,Unnamed: 1_level_0,8000_0.001_0.7,8000_0.001_0.7,8000_0.005_0.7,8000_0.005_0.7,8000_0.01_0.7,8000_0.01_0.7,8000_0.05_0.7,8000_0.05_0.7,8000_0.1_0.7,8000_0.1_0.7,...,21000_0.001_0.9,21000_0.001_0.9,21000_0.005_0.9,21000_0.005_0.9,21000_0.01_0.9,21000_0.01_0.9,21000_0.05_0.9,21000_0.05_0.9,21000_0.1_0.9,21000_0.1_0.9
Unnamed: 0_level_1,Unnamed: 1_level_1,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight,...,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight,keyword,tfidf_weight
url,rank,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,1,country,0.373,country,0.367,country,0.409,country,0.529,waves,0.343,...,cuba,0.654,country,0.367,country,0.409,country,0.529,waves,0.343
https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,2,caribbean,0.202,caribbean,0.199,caribbean,0.221,waves,0.231,surf,0.313,...,country,0.250,caribbean,0.199,caribbean,0.221,waves,0.231,surf,0.313
https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,3,reefs,0.183,local surfer,0.189,reefs,0.201,surf,0.211,little,0.271,...,caribbean,0.135,local surfer,0.189,reefs,0.201,surf,0.211,little,0.271
https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,4,waves,0.163,reefs,0.180,waves,0.179,area,0.189,local,0.270,...,local surfer,0.128,reefs,0.180,waves,0.179,area,0.189,local,0.270
https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,5,surf,0.149,waves,0.160,surf,0.163,little,0.183,old,0.221,...,reefs,0.123,waves,0.160,surf,0.163,little,0.183,old,0.221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://www.youtube.com/watch?v=xzLowIj-DuA,16,champ,0.178,champ,0.178,series,0.177,,,,,...,wins,0.173,champ,0.178,series,0.177,,,,
https://www.youtube.com/watch?v=xzLowIj-DuA,17,massive,0.170,massive,0.170,six,0.174,,,,,...,champ,0.169,massive,0.170,six,0.174,,,,
https://www.youtube.com/watch?v=xzLowIj-DuA,18,sport,0.157,sport,0.157,north,0.145,,,,,...,massive,0.161,sport,0.157,north,0.145,,,,
https://www.youtube.com/watch?v=xzLowIj-DuA,19,series,0.157,series,0.157,part,0.143,,,,,...,sport,0.149,series,0.157,part,0.143,,,,


In [9]:
adf = (
    reduce(lambda x, y: pd.merge(x, y, how='outer', on=['url', 'rank']), partials)
    .sort_values(['url', 'rank'])
)
cols = [c for c in adf.columns.values if 'keyword__' in c]

urls = adf.url.unique()
seriess = []
for url in urls:
    urldf = (
        adf.query('url == @url')
        .filter(items=cols)
    )
#     print(urldf)
    
    url_keywords = pd.Series(pd.unique(urldf[cols].values.ravel('K'))).dropna().sort_values()
    print(f"{url}:\n{url_keywords.tolist()}\n")
    seriess += [url_keywords]
    
keywords_df = pd.concat(seriess, axis=1, keys=urls)
keywords_df.to_csv('urlkeywords.csv', index=False)
keywords_df

https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/:
['area', 'average', 'boards', 'boulders', 'caribbean', 'city', 'coastline', 'country', 'cuba', 'different', 'difficult', 'expedition', 'find', 'frank', 'fun', 'get', 'go', 'gonzalez', 'good', 'head', 'la', 'little', 'local', 'local surfer', 'locals', 'many', 'mentawais', 'near', 'old', 'one best surfers', 'people', 'quality', 'quality waves', 'realise', 'really', 'reefs', 'something different', 'spot', 'spots', 'surf', 'surfer', 'take', 'think', 'trip', 'waves']

https://magicseaweed.com/news/surfing-the-alps-how-the-landlocked-swiss-surf-team-are-raising-their-game/12377/:
['air', 'air game', 'air section', 'also', 'bay', 'coach', 'country', 'culture', 'element', 'everything', 'facility', 'game', 'games', 'improvement', 'isa', 'isa world', 'isa world games', 'landlocked', 'like', 'love', 'might', 'much', 'national', 'people', 'placed', 'pool', 'rachel', 'rd', 'right', 'river', 'said', 'section', 'se

Unnamed: 0,https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,https://magicseaweed.com/news/surfing-the-alps-how-the-landlocked-swiss-surf-team-are-raising-their-game/12377/,https://magicseaweed.com/news/the-miraculous-circumstances-that-makes-mundaka-pump/12382/,https://stabmag.com/news/finally-a-sensical-and-supportive-womens-big-wave-event,https://stabmag.com/news/leaked-wsl-memo-we-have-confidence-in-the-resumption-of-the-pipe-masters,https://stabmag.com/news/wsl-suspends-pipe-masters-after-staffers-contract-coronavirus,https://surfd.com/2021/03/10-of-the-best-surf-hats/,https://surfd.com/2021/03/a-collection-of-the-best-surf-documentaries-ever-made/,https://surfd.com/2021/04/lip-sunglasses-review/,https://www.itsnicethat.com/articles/afrosurf-publication-180321,...,https://www.youtube.com/watch?v=JrrdFeGCjmE,https://www.youtube.com/watch?v=NAjHmCnS3-o,https://www.youtube.com/watch?v=RI3xeLLxHiw,https://www.youtube.com/watch?v=V8UYsPRzjSA,https://www.youtube.com/watch?v=VoMEDcXBzik,https://www.youtube.com/watch?v=WWBSP__lYjkhttps://www.youtube.com/watch?v=WWBSP__lYjk,https://www.youtube.com/watch?v=XTq1gufrm18,https://www.youtube.com/watch?v=a3djTtlH2OM,https://www.youtube.com/watch?v=n3NS9bPa5qY,https://www.youtube.com/watch?v=xzLowIj-DuA
0,country,team,mundaka,big wave,wsl,wsl,hat,film,sunglasses,stories,...,youtube,point,spine,soul,rocky,gopro,billy,instagram,john,big wave
1,caribbean,training,bar,big,pipe masters,logan,surf,frank,lip,continent,...,welcome,righthand,surgery,india,pipeline,foot wave,billy kemper,subscribe,john john,morocco
2,reefs,facility,swells,wave event,masters,safety,protection,surf,typhoon,sees,...,channel,thing ever,dealing,stacked,john,puerto escondido,kemper,follow instagram,short film,adrenaline
3,waves,bay,sandbar,xl,internal,covid,cap,jordy,lenses,african,...,due,describes,neck,surf team,subscribe,escondido,jaws,legends,feat,north atlantic
4,surf,world games,waves,wave world tour,pipe,ceo,features,one,lens,produced,...,learn,fickle,recovery,youtube,point,puerto,goal,insane,north shore oahu,chapter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,,,,wave conditions,,,,,,,...,,,,,,,,,,
68,,,,event even,,,,,,,...,,,,,,,,,,
69,,,,every winter,,,,,,,...,,,,,,,,,,
70,,,,professional career,,,,,,,...,,,,,,,,,,


## Read in the term scores to identify best hyperparams

In [10]:
keywords = (
    pd.read_csv('urlkeywords_scored_20210604.csv')
    .dropna(axis='columns', how='all').head()
)
keywords.head()

Unnamed: 0,https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/,Unnamed: 1,https://magicseaweed.com/news/surfing-the-alps-how-the-landlocked-swiss-surf-team-are-raising-their-game/12377/,Unnamed: 4,https://magicseaweed.com/news/the-miraculous-circumstances-that-makes-mundaka-pump/12382/,.2,https://stabmag.com/news/finally-a-sensical-and-supportive-womens-big-wave-event,.3,https://stabmag.com/news/leaked-wsl-memo-we-have-confidence-in-the-resumption-of-the-pipe-masters,.4,...,https://www.youtube.com/watch?v=WWBSP__lYjkhttps://www.youtube.com/watch?v=WWBSP__lYjk,Unnamed: 70,https://www.youtube.com/watch?v=XTq1gufrm18,Unnamed: 72,https://www.youtube.com/watch?v=a3djTtlH2OM,Unnamed: 74,https://www.youtube.com/watch?v=n3NS9bPa5qY,Unnamed: 76,https://www.youtube.com/watch?v=xzLowIj-DuA,Unnamed: 78
0,cuba,1.0,isa world games,1.0,mundaka,1.0,big wave,1.0,wsl,1.0,...,caught inside,1.0,billy kemper,1.0,first light,1.0,john john florence,1.0,morocco,1.0
1,caribbean,1.0,team,1.0,swells,1.0,big,1.0,pipe masters,1.0,...,puerto escondido,1.0,billy,1.0,mexico,1.0,north shore oahu,1.0,billy,1.0
2,gonzalez,1.0,training,1.0,sandbar,1.0,wave event,1.0,masters,1.0,...,wave,1.0,kemper,1.0,session,1.0,john john,1.0,big wave,1.0
3,frank,1.0,facility,1.0,waves,1.0,xl,1.0,pipe,1.0,...,mexico,1.0,jaws,1.0,,,florence,1.0,wave,1.0
4,local surfer,1.0,world games,1.0,shape,1.0,wave world tour,1.0,officials,1.0,...,escondido,1.0,ultimate goal,1.0,insane,0.0,north shore,1.0,swell,1.0


In [11]:
frames = []
for i in range(int(keywords.shape[1] / 2)):
    article = keywords.iloc[:, 2*i:2*i+2]
    url = article.columns.values[0]
    print(f"Article #{i}: {url}")
    
    article.columns = ['keyword', 'weight']
    article = article.assign(url=url)
    frames += [article]
    
keywords = pd.concat(frames).set_index('url').reset_index()
keywords

Article #0: https://magicseaweed.com/news/gallery-a-surf-expedition-through-untouched-cuba/12380/
Article #1: https://magicseaweed.com/news/surfing-the-alps-how-the-landlocked-swiss-surf-team-are-raising-their-game/12377/
Article #2: https://magicseaweed.com/news/the-miraculous-circumstances-that-makes-mundaka-pump/12382/
Article #3: https://stabmag.com/news/finally-a-sensical-and-supportive-womens-big-wave-event
Article #4: https://stabmag.com/news/leaked-wsl-memo-we-have-confidence-in-the-resumption-of-the-pipe-masters
Article #5: https://stabmag.com/news/wsl-suspends-pipe-masters-after-staffers-contract-coronavirus
Article #6: https://surfd.com/2021/03/10-of-the-best-surf-hats/
Article #7: https://surfd.com/2021/03/a-collection-of-the-best-surf-documentaries-ever-made/
Article #8: https://surfd.com/2021/04/lip-sunglasses-review/
Article #9: https://www.itsnicethat.com/articles/afrosurf-publication-180321
Article #10: https://www.latimes.com/lifestyle/story/2021-03-26/manhattan-beach

Unnamed: 0,url,keyword,weight
0,https://magicseaweed.com/news/gallery-a-surf-e...,cuba,1.0
1,https://magicseaweed.com/news/gallery-a-surf-e...,caribbean,1.0
2,https://magicseaweed.com/news/gallery-a-surf-e...,gonzalez,1.0
3,https://magicseaweed.com/news/gallery-a-surf-e...,frank,1.0
4,https://magicseaweed.com/news/gallery-a-surf-e...,local surfer,1.0
...,...,...,...
165,https://www.youtube.com/watch?v=xzLowIj-DuA,morocco,1.0
166,https://www.youtube.com/watch?v=xzLowIj-DuA,billy,1.0
167,https://www.youtube.com/watch?v=xzLowIj-DuA,big wave,1.0
168,https://www.youtube.com/watch?v=xzLowIj-DuA,wave,1.0


In [12]:
scored = params_df.merge(keywords, how='left', on=['url', 'keyword']).rename(columns={'combo': 'params'})
scored['weight'] = scored['weight'].fillna(0)
scored

Unnamed: 0,keyword,tfidf_weight,article_id,url,max_df,min_df,max_features,rank,params,weight
0,fun,0.420,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.100,8000,1,8000_0.1_0.7,0.0
1,people,0.255,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.100,8000,2,8000_0.1_0.7,0.0
2,surf,0.237,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.100,8000,3,8000_0.1_0.7,0.0
3,event,0.227,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.100,8000,4,8000_0.1_0.7,0.0
4,contest,0.225,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.100,8000,5,8000_0.1_0.7,0.0
...,...,...,...,...,...,...,...,...,...,...
24664,late drop,0.118,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.9,0.001,21000,16,21000_0.001_0.9,0.0
24665,view live,0.117,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.9,0.001,21000,17,21000_0.001_0.9,0.0
24666,clean conditions,0.116,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.9,0.001,21000,18,21000_0.001_0.9,0.0
24667,usual suspects,0.115,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.9,0.001,21000,19,21000_0.001_0.9,0.0


In [13]:
results = (
    scored
    .groupby(['url', 'params'])
    .weight.sum()
    .reset_index()
    .rename(columns={'weight': 'score'})
    .groupby('params')
    .sum()
    .sort_values(['score', 'params'], ascending=[False, True])
    .reset_index()
)
results

Unnamed: 0,params,score
0,21000_0.001_0.7,157.0
1,21000_0.001_0.8,157.0
2,21000_0.001_0.9,157.0
3,13000_0.001_0.7,151.0
4,13000_0.001_0.8,151.0
5,13000_0.001_0.9,151.0
6,8000_0.001_0.7,141.0
7,8000_0.001_0.8,141.0
8,8000_0.001_0.9,141.0
9,13000_0.005_0.7,128.0


## Found the best params, now let's use them...

In [26]:
params_df.tfidf_weight.sort_values(ascending=False).head(10)

params_df.query('max_df == .9 & min_df == 0.001 & max_features == 21000 & tfidf_weight > 0.5')

params_df.url.unique().shape

(34,)

In [14]:
top_params = results.iloc[0].params
print(f"Best hyperparams: {top_params}")

Best hyperparams: 21000_0.001_0.7


In [15]:
params_df[params_df.params == top_params].filter(items=['article_id', 'url', 'params', 'keyword', 'tfidf_weight', 'rank'])

Unnamed: 0,article_id,url,params,keyword,tfidf_weight,rank
7580,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,21000_0.001_0.7,red bull,0.320,1
7581,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,21000_0.001_0.7,bull,0.302,2
7582,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,21000_0.001_0.7,red,0.248,3
7583,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,21000_0.001_0.7,foam,0.225,4
7584,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,21000_0.001_0.7,jamie,0.185,5
...,...,...,...,...,...,...
8218,63575,https://www.surfline.com/surf-news/wave-day-ma...,21000_0.001_0.7,late drop,0.118,16
8219,63575,https://www.surfline.com/surf-news/wave-day-ma...,21000_0.001_0.7,view live,0.117,17
8220,63575,https://www.surfline.com/surf-news/wave-day-ma...,21000_0.001_0.7,clean conditions,0.116,18
8221,63575,https://www.surfline.com/surf-news/wave-day-ma...,21000_0.001_0.7,usual suspects,0.115,19


In [30]:
max_features, min_df, max_df = map(float, top_params.split('_'))

In [43]:
%%time

cv = CountVectorizer(stop_words=stopword_set, ngram_range=(1, 3), max_df=max_df, min_df=min_df, max_features=features)
word_count_vector = cv.fit_transform(docs)

tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# save the CountVectorizer as a pickle
pickle.dump((cv, tfidf_transformer), open("count_vectorizer.pickle", "wb"))

CPU times: user 2min 43s, sys: 10.9 s, total: 2min 54s
Wall time: 2min 58s


In [39]:
results = process_results(df_idf, cv, tfidf_transformer, max_df, min_df, features)
results

Unnamed: 0,keyword,tfidf_weight,article_id,url,max_df,min_df,max_features,rank
0,red bull,0.320,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.001,21000,1
1,bull,0.302,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.001,21000,2
2,red,0.248,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.001,21000,3
3,foam,0.225,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.001,21000,4
4,jamie,0.185,63577,https://www.surfline.com/surf-news/j-o-b-soft-...,0.7,0.001,21000,5
...,...,...,...,...,...,...,...,...
1161585,late drop,0.118,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.7,0.001,21000,16
1161586,view live,0.117,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.7,0.001,21000,17
1161587,clean conditions,0.116,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.7,0.001,21000,18
1161588,usual suspects,0.115,63575,https://www.surfline.com/surf-news/wave-day-ma...,0.7,0.001,21000,19


In [40]:
results.tfidf_weight.max()

1.0

### Test the pickled vectorizer...

In [35]:
# load pickle
vectorizer = pickle.load(open("count_vectorizer.pickel", "rb"))

test_article_id = '1234567'
test_url = 'https://yewreview.com/blog/this-is-a-test-url'
test_text = "Laird Hamilton is a badass dude. When he's not surfing the biggest waves on his foil he's making dope powder for my coffee. I love that it tastes like coconut and it's so good"
row = [(test_article_id, test_url, test_text)]

test_df = pd.DataFrame(row, columns=['article_id', 'url', 'text'])
test_df

test_vector = vectorizer.transform(test_text)
test_vector

transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(test_vector)

test_results = process_results(test_df, vectorizer, tfidf_transformer, max_df, min_df, features)
test_results

ValueError: Iterable over raw text documents expected, string object received.