In [109]:
import pandas as pd
import requests
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [110]:
metadata = pd.read_csv('metadata.tsv', sep = '\t')

In [111]:
metadf = metadata.loc[:,['episode_uri','show_name', 'show_description', 'episode_name', 'episode_description']]

In [112]:
# generate docno column and convert all to strings for PyTerrier compatibility
metadf['docno'] = metadf.index
metadf = metadf.astype(str)
metadf.head()

Unnamed: 0,episode_uri,show_name,show_description,episode_name,episode_description,docno
0,spotify:episode:000A9sRBYdVh66csG2qEdj,Kream in your Koffee,A 20-something blunt female takes on the world...,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,0
1,spotify:episode:000HP8n3hNIfglT2wSI2cA,Morning Cup Of Murder,Ever wonder what murder took place on today in...,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma...",1
2,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,2
3,spotify:episode:001i89SvIQgDuuyC53hfBm,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,3
4,spotify:episode:0025RWNwe2lnp6HcnfzwzG,FBoL,"The comedy podcast about toxic characters, wri...","The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,4


In [113]:
# example metadata for presentation
metadf[['show_name', 'episode_name', 'episode_description']]

Unnamed: 0,show_name,episode_name,episode_description
0,Kream in your Koffee,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...
1,Morning Cup Of Murder,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma..."
2,Inside The 18 : A Podcast for Goalkeepers by G...,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...
3,Arrowhead Live!,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...
4,FBoL,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...
...,...,...,...
105355,The Top 10,The Top 10 - Re-List - Steve Martin Moves,Thanks to our patreon members for their suppor...
105356,Let's Grab Coffee Podcast,Let's Grab Coffee E45 with Ross Paquette | Gro...,Ross founded Maropost in 2011 as a customer-ce...
105357,Coach Corey Wayne,Maybe She Is Just Testing Me?,How to know if your woman is maybe just testin...
105358,The Cricket Podcast,Ep 16: England In Danger,"In Episode 16, the boys evaluate England's per..."


<div style="background-color: #090e60; padding: 10px; border: 8px solid  white;">
    <p style="text-align: center; font-weight: bold; color: white; font-size: 30px; margin: 0; position: relative; top: 50%; transform: translateY(-50%); font-family: 'Italianno', cursive;">&ndash;&ndash;&ndash;&nbsp;Queries&nbsp;&ndash;&ndash;&ndash;
    </p> 
</div>

In [114]:
urls = [
    "https://trecpodcasts.github.io/resources/podcasts_2020_topics_train.xml",
    "https://trecpodcasts.github.io/resources/podcasts_2020_topics_test.xml"
]

tables = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "xml")
    rows = soup.find_all("topic")

    # Create dict with column headings and empty lists
    table_dict = {"num": [], "query": [], "type": [], "description": []}

    # For every row in the table, find each cell element and add it to the list
    for row in rows:
        table_dict = {
            col: table_dict[col] + [row.find(col).text] for col in table_dict
        }

    table = pd.DataFrame(table_dict)
    table = table.applymap(lambda x: x[0] if isinstance(x, list) else x)
    tables.append(table)

query = pd.concat(tables, ignore_index=True)
query.head()



Unnamed: 0,num,query,type,description
0,1,coronavirus spread,topical,What were people saying about the spread of th...
1,2,greta thunberg cross atlantic,topical,What were people saying about Greta Thunberg’s...
2,3,black hole image,topical,In May 2019 astronomers released the first-eve...
3,4,story about riding a bird,refinding,I remember hearing a podcast that had a story ...
4,5,daniel ek interview,known item,Someone told me about a podcast interview with...


In [115]:
query = query[~query['num'].isin(['47', '50'])].reset_index(drop=True)
query.head()


Unnamed: 0,num,query,type,description
0,1,coronavirus spread,topical,What were people saying about the spread of th...
1,2,greta thunberg cross atlantic,topical,What were people saying about Greta Thunberg’s...
2,3,black hole image,topical,In May 2019 astronomers released the first-eve...
3,4,story about riding a bird,refinding,I remember hearing a podcast that had a story ...
4,5,daniel ek interview,known item,Someone told me about a podcast interview with...


In [116]:
pd.options.display.max_colwidth = 200
short_query = query.loc[:, ['num', 'query']].rename(columns={'num': 'qid','query': 'query'})
full_query = query.loc[:, ['num', 'description']].rename(columns={'num': 'qid','description': 'query'})

In [117]:
def remove_non_alphanumeric(row):
  return re.sub(r'[^a-zA-Z0-9 ]+', '', row.query)

full_query['query'] = full_query.apply(remove_non_alphanumeric, axis=1)
full_query.head()

Unnamed: 0,qid,query
0,1,What were people saying about the spread of the novel coronavirus NCOV19 in Wuhan at the end of 2019
1,2,What were people saying about Greta Thunbergs sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change
2,3,In May 2019 astronomers released the firstever picture of a black hole I would like to hear some conversations and educational discussion about the science of astronomy black holes and of the pict...
3,4,I remember hearing a podcast that had a story about a kid riding some kind of bird I want to find it again
4,5,Someone told me about a podcast interview with Daniel Ek CEO of Spotify about the founding and early days of Spotify I would like to find the show and episode that contains that interview Other in...


<div style="background-color: #090e60; padding: 10px; border: 8px solid  white;">
    <p style="text-align: center; font-weight: bold; color: white; font-size: 30px; margin: 0; position: relative; top: 50%; transform: translateY(-50%); font-family: 'Italianno', cursive;">&ndash;&ndash;&ndash;&nbsp;Qrels&nbsp;&ndash;&ndash;&ndash;
    </p> 
</div>




In [118]:
url_train = "https://trecpodcasts.github.io/resources/2020_train_qrels.list"
url_test ="https://trecpodcasts.github.io/resources/2020_test_qrels.list"

In [119]:
response = requests.get(url_train)
lines = response.text.split("\n")

data = []
for line in lines:
    if line:
        query, _, episode_uri, relevance = line.split("\t")
        data.append({"query": query, "episode_uri": episode_uri, "relevance": relevance})

qrel = pd.DataFrame(data)
pd.options.display.max_colwidth = 50
print(qrel.shape)
qrel


(609, 3)


Unnamed: 0,query,episode_uri,relevance
0,1,spotify:episode:0E2nqCXMkS218SE72APmNr_240.0,2
1,1,spotify:episode:0E2nqCXMkS218SE72APmNr_300.0,2
2,1,spotify:episode:0E2nqCXMkS218SE72APmNr_360.0,2
3,1,spotify:episode:0Th494DvnO5dU8vTi3QHm2_120.0,1
4,1,spotify:episode:199bOiXL0l4YsRaSNDNXvP_1200.0,2
...,...,...,...
604,8,spotify:episode:7MnlmF9PV3WUOPI8IjGd0y_720.0,0
605,8,spotify:episode:7MnlmF9PV3WUOPI8IjGd0y_780.0,0
606,8,spotify:episode:7wJUUYWcHm7YwEoCmTGFoD_240.0,1
607,8,spotify:episode:7wJUUYWcHm7YwEoCmTGFoD_300.0,1


In [120]:
# add Podcast Track 2020 test set QRels
url_test = "https://trecpodcasts.github.io/resources/2020_test_qrels.list"
response_test = requests.get(url_test)

lines_test = response_test.text.split("\n")
data_test = []
for line in lines_test:
    if line:
        query, _, episode_uri, relevance = line.split()
        data_test.append({"query": query, "episode_uri": episode_uri, "relevance": relevance})

qrel_test = pd.DataFrame(data_test)
qrel = qrel.append(qrel_test).reset_index(drop=True)
print(qrel.shape)
qrel

(10035, 3)


  qrel = qrel.append(qrel_test).reset_index(drop=True)


Unnamed: 0,query,episode_uri,relevance
0,1,spotify:episode:0E2nqCXMkS218SE72APmNr_240.0,2
1,1,spotify:episode:0E2nqCXMkS218SE72APmNr_300.0,2
2,1,spotify:episode:0E2nqCXMkS218SE72APmNr_360.0,2
3,1,spotify:episode:0Th494DvnO5dU8vTi3QHm2_120.0,1
4,1,spotify:episode:199bOiXL0l4YsRaSNDNXvP_1200.0,2
...,...,...,...
10030,32,spotify:episode:7lrWzeepJQwhr0jyuVEk9m_300.0,0
10031,32,spotify:episode:7lrWzeepJQwhr0jyuVEk9m_600.0,0
10032,32,spotify:episode:7ok6wB1WviyXvxba0wdb5q_720.0,0
10033,32,spotify:episode:7ok6wB1WviyXvxba0wdb5q_780.0,0


In [121]:
qrel['episode_uri'] = qrel['episode_uri'].apply(lambda uri: uri.split('_')[0])
qrel = qrel.groupby(['query', 'episode_uri']).max().reset_index()
qrel

Unnamed: 0,query,episode_uri,relevance
0,1,spotify:episode:0E2nqCXMkS218SE72APmNr,2
1,1,spotify:episode:0Th494DvnO5dU8vTi3QHm2,1
2,1,spotify:episode:199bOiXL0l4YsRaSNDNXvP,2
3,1,spotify:episode:1ZA1QTtylexrVt75xiprNH,2
4,1,spotify:episode:1o0HCHjNtRWdEWRTWJhzyC,0
...,...,...,...
5825,9,spotify:episode:6v0auT8BbeXzMTmg9FjyDB,1
5826,9,spotify:episode:7EKJswck8jP4tXWHXVJCdO,0
5827,9,spotify:episode:7N0z3FpvsWWTCSgARuCd8C,0
5828,9,spotify:episode:7mIga7ujiMVbbEN5bBQPUX,0


In [122]:
# Find all episodes present in qrels
in_qrels = metadf.loc[metadf['episode_uri'].isin(qrel['episode_uri'].unique())]

# Check for NaN and empty string entries
null_or_empty = in_qrels[in_qrels['episode_description'].isna() | (in_qrels['episode_description'] == '')]

# Check for one-word descriptions
def one_word(row):
    return re.search(r'.*\s', row.episode_description)

one_word_descr = list(in_qrels.index[in_qrels.apply(one_word, axis=1).isna()])
one_word_descr

[3972, 101227]

In [123]:
# Find all episodes present in qrels
qrel_uris = set(qrel['episode_uri'].unique())
in_qrels = metadf[metadf['episode_uri'].isin(qrel_uris)]

# Check for NaN and empty string entries
null_or_empty = in_qrels[in_qrels['episode_description'].isna() | (in_qrels['episode_description'] == '')]

# Check for one-word descriptions
one_word_descr = in_qrels[in_qrels['episode_description'].str.count(' ') == 0]

# Remove QRels with one-word episode descriptions
qrel = qrel[~qrel['episode_uri'].isin(one_word_descr['episode_uri'])].reset_index(drop=True)

# Find indices of NaN and empty string entries
null_idxs = in_qrels[in_qrels['episode_description'].isna()].index
empty_idxs = in_qrels[in_qrels['episode_description'] == ''].index

# Check for one-word descriptions using a lambda function
one_word_descr = in_qrels[in_qrels.apply(lambda x: len(x['episode_description'].split()) == 1, axis=1)]
one_word_descr

Unnamed: 0,episode_uri,show_name,show_description,episode_name,episode_description,docno
3972,spotify:episode:0HxLcyXXoVhVYUD0aIFYQj,The Rewind with Guy Raz,Guy Raz digs into the stories behind some of t...,Jessie J,,3972
12798,spotify:episode:0wFCxK3L96rqgN2Pn0E6i6,Bar Prep For Sufferers In UBE States,Death by UBE,Evidence,proof,12798
16014,spotify:episode:1B9lIDYV3ZBKdt4lgHRK7J,Levels podcast,No ones doin it up like is at this age Hosts ...,LEVEL 1 - Pilot,Pilot,16014
19495,spotify:episode:1REBtFG27yBSdkt5KSdjla,Revolution Ramblings,Stream of consciousness podcast by Youtube Ast...,X,🖤,19495
20508,spotify:episode:1VrQT8rB9KA0fsHCAo5kdJ,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,"Innovation, Technology & Team Work - Crucial P...",.,20508
30823,spotify:episode:2Ggbm0HKW7HJRMmAP24YSl,Zendaya Podcast,Leadership Podcast,Leadership Podcast,Zendaya,30823
33285,spotify:episode:2RWrlYKhYqSAIDM4JcC0Is,Bar Prep For Sufferers In UBE States,Death by UBE,Family Law,fam,33285
33767,spotify:episode:2Tf6Cs5DIfIYgnDp1tsOAJ,UPSC Podcasts,Podcasts useful for UPSC aspirants! Mainly dis...,Bilateral Relations between India and China,.,33767
35559,spotify:episode:2bjO0LTkno3kTSa2nFR3dm,Excuse My French,A French Woman from Paris living in LA sharing...,"France, the holidays and wholefood",.,35559
44766,spotify:episode:3HrYAqEyk2gZHMa8CRA2Fp,Eating Disorders Anonymous (EDA) Speaker Feed,"Hear stories of experience, strength, and hope...",Steps Workshop - Step 12 - Trish,12.14.19,44766


In [124]:
# Remove QRels with one-word episode descriptions using boolean indexing
qrel = qrel[~qrel['episode_uri'].isin(one_word_descr['episode_uri'])].reset_index(drop=True)
qrel.shape

(5738, 3)

In [125]:
# create dict {episode_uri: doc}
docno_dict = dict(metadf[['episode_uri', 'docno']].values)

# insert docno for each episode_uri in qrels using dict
qrel['docno'] = qrel['episode_uri'].map(docno_dict)

# rename columns and change data types to make qrels compatible with PyTerrier
qrel = qrel.rename(columns={'query': 'qid', 'relevance': 'label'})
qrel['qid'] = qrel['qid'].astype(str)
qrel['label'] = qrel['label'].astype(int)  # Convert the 'label' column to integers
qrel = qrel[['qid', 'docno', 'label']]

# display qrels
qrel


Unnamed: 0,qid,docno,label
0,1,3096,2
1,1,6496,1
2,1,15551,2
3,1,21291,2
4,1,24523,0
...,...,...,...
5733,9,93648,1
5734,9,97871,0
5735,9,99768,0
5736,9,102459,0


<div style="background-color: #090e60; padding: 10px; border: 8px solid  white;">
    <p style="text-align: center; font-weight: bold; color: white; font-size: 30px; margin: 0; position: relative; top: 50%; transform: translateY(-50%); font-family: 'Italianno', cursive;">&ndash;&ndash;&ndash;&nbsp;Demo&nbsp;&ndash;&ndash;&ndash;
    </p> 
</div>




In [126]:
metadf.head()

Unnamed: 0,episode_uri,show_name,show_description,episode_name,episode_description,docno
0,spotify:episode:000A9sRBYdVh66csG2qEdj,Kream in your Koffee,A 20-something blunt female takes on the world...,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,0
1,spotify:episode:000HP8n3hNIfglT2wSI2cA,Morning Cup Of Murder,Ever wonder what murder took place on today in...,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma...",1
2,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,2
3,spotify:episode:001i89SvIQgDuuyC53hfBm,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,3
4,spotify:episode:0025RWNwe2lnp6HcnfzwzG,FBoL,"The comedy podcast about toxic characters, wri...","The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,4


In [127]:
short_query.head()

Unnamed: 0,qid,query
0,1,coronavirus spread
1,2,greta thunberg cross atlantic
2,3,black hole image
3,4,story about riding a bird
4,5,daniel ek interview


In [128]:
full_query.head()

Unnamed: 0,qid,query
0,1,What were people saying about the spread of th...
1,2,What were people saying about Greta Thunbergs ...
2,3,In May 2019 astronomers released the firstever...
3,4,I remember hearing a podcast that had a story ...
4,5,Someone told me about a podcast interview with...


In [129]:
qrel.head()

Unnamed: 0,qid,docno,label
0,1,3096,2
1,1,6496,1
2,1,15551,2
3,1,21291,2
4,1,24523,0


In [130]:
# Installing PyTerrier
!pip install python-terrier

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [131]:
import pyterrier as pt
import numpy as np
import pandas as pd

def initialize_pt():
    if not pt.started():
        pt.init()

def remove_index_folder():
    !rm -rf ./pd_index

def create_index(metadf):
    pd_indexer = pt.DFIndexer("./pd_index")
    index = pd_indexer.index(metadf["episode_description"], metadf["docno"])
    return index

def get_index_file(index):
    return pt.IndexFactory.of(index)

def print_collection_stats(index_file):
    print(index_file.getCollectionStatistics())

initialize_pt()
remove_index_folder()
index = create_index(metadf)
index_file = get_index_file(index)
print_collection_stats(index_file)




  for column, value in meta_column[1].iteritems():


16:42:42.356 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 255 empty documents
Number of documents: 105360
Number of terms: 131456
Number of postings: 4124038
Number of fields: 0
Number of tokens: 5453411
Field names: []
Positions:   false



In [132]:
def get_example_index(docid, index_file):
    di = index_file.getDirectIndex()
    doi = index_file.getDocumentIndex()
    lex = index_file.getLexicon()
    example_index = pd.DataFrame(columns=['Term', 'Frequency'])
    
    for posting in di.getPostings(doi.getDocumentEntry(docid)):
        termid = posting.getId()
        lee = lex.getLexiconEntry(termid)
        term_freq = pd.DataFrame([[str(lee.getKey()), str(posting.getFrequency())]], columns=['Term', 'Frequency'])
        example_index = example_index.append(term_freq)

    example_index = example_index.reset_index(drop=True)
    return example_index

In [133]:
def print_episode_info(docid, metadf):
    show_name = metadf.loc[metadf.docno == str(docid)].show_name.values[0]
    episode_name = metadf.loc[metadf.docno == str(docid)].episode_name.values[0]
    episode_description = metadf.loc[metadf.docno == str(docid)].episode_description.values[0]
    
    print(f'show name: {show_name}')
    print(f'episode name: {episode_name}')
    print(f'episode description: {episode_description}')

In [134]:
docid = 6
example_index = get_example_index(docid, index_file)
print_episode_info(docid, metadf)

bm25 = pt.BatchRetrieve(index, wmodel='BM25')
pl2 = pt.BatchRetrieve(index, wmodel='PL2')

show name: The Feminization Boudoir
episode name: The Sissy's Mentor (Part 5)
episode description: Miss Jenn Davis reads the final part of The Sissy's Mentor.  It's the story of a young inexperienced college freshman and the dominant woman who takes him under her wing or is that wig?   You can find a lot more great erotic content from Jenn at MissJennDavis.com. Remember that this podcast plays clips (Usually 40%-50%) from audios.  At least half the show is missing and in that half are the very sexiest parts. Please support Candy Apple Press at our clipsites: IWantClips Clips4Sale and buy our books: Amazon.com You can also donate to the Podcast directly at Anchor.fm.  Wherever you listen, please give us good feedback. Thanks, Kylie  ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app  Support this podcast: https://anchor.fm/kylie-gable/support


  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)
  example_index = example_index.append(term_freq)


In [142]:
models_list = [bm25, pl2]

exp_df_short = pt.Experiment(
    models_list,
    short_query,
    qrel,
    eval_metrics=eval_metrics_list,
    names=['BM25', 'PL2']
)
print(exp_df_short.sort_values(by=['map'], ascending=False)[:5])

   name       map      ndcg  recip_rank       R@5      R@10      R@15  \
0  BM25  0.127050  0.290382    0.509473  0.135931  0.163656  0.181625   
1   PL2  0.124207  0.286858    0.506501  0.131175  0.158539  0.176847   

       R@20      R@30     R@100     R@200     R@500    R@1000        mrt  
0  0.194193  0.212797  0.284753  0.320845  0.359622  0.376515  27.992090  
1  0.184776  0.201197  0.282072  0.319046  0.357215  0.376516  32.950439  


In [143]:
models_list = [bm25, pl2]

exp_df_short = pt.Experiment(
    models_list,
    full_query,
    qrel,
    eval_metrics=eval_metrics_list,
    names=['BM25', 'PL2']
)
print(exp_df_short.sort_values(by=['map'], ascending=False)[:5])

   name       map      ndcg  recip_rank       R@5      R@10      R@15  \
0  BM25  0.121408  0.282391    0.478899  0.120789  0.140971  0.176937   
1   PL2  0.113027  0.270138    0.460196  0.111716  0.129386  0.148549   

       R@20      R@30     R@100     R@200     R@500    R@1000        mrt  
0  0.185512  0.202482  0.267562  0.303330  0.348212  0.395465  35.987806  
1  0.165298  0.196600  0.254190  0.286587  0.342082  0.390185  40.520484  
