In [89]:
import pandas as pd
import numpy as np
import string
import re
from sklearn.feature_extraction.text import CountVectorizer

from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer  # lemmatization
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


In [4]:
data = pd.read_csv('metadata.tsv', sep = '\t')

In [10]:
data.head()

Unnamed: 0,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix
0,spotify:show:2NYtxEZyYelR6RMKmjfPLB,Kream in your Koffee,A 20-something blunt female takes on the world...,Katie Houle,['en'],https://anchor.fm/s/11b84b68/podcast/rss,spotify:episode:000A9sRBYdVh66csG2qEdj,1: It’s Christmas Time!,On the first ever episode of Kream in your Kof...,12.700133,show_2NYtxEZyYelR6RMKmjfPLB,000A9sRBYdVh66csG2qEdj
1,spotify:show:15iWCbU7QoO23EndPEO6aN,Morning Cup Of Murder,Ever wonder what murder took place on today in...,Morning Cup Of Murder,['en'],https://anchor.fm/s/b07181c/podcast/rss,spotify:episode:000HP8n3hNIfglT2wSI2cA,The Goleta Postal Facility shootings- January ...,"See something, say something. It’s a mantra ma...",6.019383,show_15iWCbU7QoO23EndPEO6aN,000HP8n3hNIfglT2wSI2cA
2,spotify:show:6vZRgUFTYwbAA79UNCADr4,Inside The 18 : A Podcast for Goalkeepers by G...,Inside the 18 is your source for all things Go...,Inside the 18 GK Media,['en'],https://anchor.fm/s/81a072c/podcast/rss,spotify:episode:001UfOruzkA3Bn1SPjcdfa,Ep.36 - Incorporating a Singular Goalkeeping C...,Today’s episode is a sit down Michael and Omar...,43.616333,show_6vZRgUFTYwbAA79UNCADr4,001UfOruzkA3Bn1SPjcdfa
3,spotify:show:5BvKEjaMSuvUsGROGi2S7s,Arrowhead Live!,Your favorite podcast for everything @Chiefs! ...,Arrowhead Live!,['en-US'],https://anchor.fm/s/917dba4/podcast/rss,spotify:episode:001i89SvIQgDuuyC53hfBm,Episode 1: Arrowhead Live! Debut,Join us as we take a look at all current Chief...,58.1892,show_5BvKEjaMSuvUsGROGi2S7s,001i89SvIQgDuuyC53hfBm
4,spotify:show:7w3h3umpH74veEJcbE6xf4,FBoL,"The comedy podcast about toxic characters, wri...",Emily Edwards,['en'],https://www.fuckboisoflit.com/episodes?format=rss,spotify:episode:0025RWNwe2lnp6HcnfzwzG,"The Lion, The Witch, And The Wardrobe - Ashley...",The modern morality tail of how to stay good f...,51.78205,show_7w3h3umpH74veEJcbE6xf4,0025RWNwe2lnp6HcnfzwzG


In [59]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# pre-processing
def pre_process(text):
    punctuation_marks = string.punctuation.replace("'", "")
    
    #converts to lower case
    if preprocessing_switches['convert_to_lowercase']:    
        text = text.lower()
    
    # separate punctuation from words - preserving apostrophe
    if preprocessing_switches['separate_out_punctuation']:
        for c in punctuation_marks:
            text = text.replace(c, ' ' + c + ' ')
    
    #converting numbers to digits
    if preprocessing_switches["convert_number_words_to_digits"]:
        text = text2int(text)   
    
    #removing numbers
    if preprocessing_switches["convert_numbers"]:
        text = re.sub('\d+', 'NUMBER',text)
        text = text.replace('NUMBER ', '')
    
    # Tokenize the text
    tokens = text.split()
    
    # Remove stopwords
    if preprocessing_switches['separate_out_punctuation']:    
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
    
    # Remove punctuation
    if preprocessing_switches['remove_punctuation']:    
        tokens = [token for token in tokens if token not in punctuation_marks]
    
    # Lemmatize the tokens
    if preprocessing_switches['apply_lemmatization']:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
    # Stem the tokens
    if preprocessing_switches['stem_tokens']:
        tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(tokens)

In [37]:
preprocessing_switches = {'convert_to_lowercase': True,
                            'separate_out_punctuation': True,
                            'remove_punctuation': True,
                            'convert_number_words_to_digits': True,
                            'convert_numbers': True,
                            'remove_stopwords': True,
                            'apply_lemmatization': True,
                            'stem_tokens': False}


In [38]:
def text2int(textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

In [65]:
data_subset = data.loc[:10000]
data_processed = []

for x in range(data_subset.shape[0]):
    processed = pre_process(data.loc[x, 'show_description'])
    data_processed.append(processed)


In [68]:
vectorizer = CountVectorizer()
documents_vectorized = vectorizer.fit_transform(data_processed)
vocabulary = vectorizer.get_feature_names_out()

In [160]:
dataframe = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)

# removing low word counts to improve performance - low word counts also not likely to be relevant
word_counts = dataframe.sum(axis=0)
to_remove = word_counts[word_counts <4].index
dataframe = dataframe.drop(columns = to_remove)
dataframe.shape

(10001, 7255)

In [161]:
#term frequencies
dfs = (dataframe > 0).sum(axis=0)
#idfs
N = dataframe.shape[0]
idfs = np.log10(N/dfs)
#BM25
k_1 = 1.2
b = 0.8

# The following line of code considers all words in each document:
dls_all = [len(d.split(' ')) for d in data_processed] # vector

# But we're not intrested in stop words, therefore, let's do the following: 
dls = dataframe.sum(axis=1).tolist()  # document lengths in words
avgdl = np.mean(dls) # avarage document length

numerator = np.array((k_1 + 1) * dataframe)
denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N, 1) + \
                       np.array(dataframe)

BM25_tf = numerator / denominator

idfs = np.array(idfs)  # inverse document frequencies

BM25_score = idfs * BM25_tf

In [162]:
bm25_idf = pd.DataFrame(BM25_score, columns=dataframe.columns)
bm25_idf['christmas'].sort_values(ascending = False)

7392     4.805475
2077     3.856923
6765     3.856923
9530     3.856923
2084     3.720322
           ...   
3335     0.000000
3336     0.000000
3337     0.000000
3338     0.000000
10000    0.000000
Name: christmas, Length: 10001, dtype: float64

In [158]:
data.loc[7392]

show_uri                                 spotify:show:2TPE8nsTVHyRkaVmQTbKbW
show_name                                               61 Days of Christmas
show_description           Hosts Abby and Kelsy watch the 40 new Hallmark...
publisher                                      61 Days of Hallmark Christmas
language                                                              ['en']
rss_link                            https://anchor.fm/s/1004fcbc/podcast/rss
episode_uri                           spotify:episode:0XgrKcGfkaf3cwGxkwKe5e
episode_name                 Christmas Wishes & Mistletoe Kisses Snoozefest 
episode_description        Hosts Abby and Kelsy recount the exhausting fi...
duration                                                           57.212417
show_filename_prefix                             show_2TPE8nsTVHyRkaVmQTbKbW
episode_filename_prefix                               0XgrKcGfkaf3cwGxkwKe5e
Name: 7392, dtype: object

In [163]:
query = 'christmas'
columns = ['show_name', 'show_description', 'publisher']

query_words = pre_process(query).split(' ')
top_results = bm25_idf[query_words].sum(axis = 1).sort_values(ascending = False).index[:5].tolist()
data.loc[top_results, columns]

Unnamed: 0,show_name,show_description,publisher
7392,61 Days of Christmas,Hosts Abby and Kelsy watch the 40 new Hallmark...,61 Days of Hallmark Christmas
2077,A Cup Of Cheer: A Seasonal Holiday Podcast,"We don’t know if there will be snow, so have a...",DeeJ
6765,A Cup Of Cheer: A Seasonal Holiday Podcast,"We don’t know if there will be snow, so have a...",DeeJ
9530,A Cup Of Cheer: A Seasonal Holiday Podcast,"We don’t know if there will be snow, so have a...",DeeJ
2084,MC Crue Podcast,"On this podcast, we'll be talking about anythi...",Wolfpac Media


In [166]:
# from sqlalchemy import create_engine
# engine = create_engine('sqlite:///IRDB', echo=False)
# bm25_idf.to_sql('users', con=engine)

(bm25_idf.memory_usage(deep = True).sum()) / (1024 * 1024)
bm25_idf.to_csv('Web Page/BM25_data.csv')

In [164]:
data.loc[2084, 'show_description']

"On this podcast, we'll be talking about anything and everything, especially what makes a movie a Christmas movie!   You can follow the guys on Instagram @mc.crue.podcast or e-mail us segment ideas at mccruepod@gmail.com  Cheers! "

In [173]:
data['show_name'].to_list()

['Kream in your Koffee',
 'Morning Cup Of Murder',
 'Inside The 18 : A Podcast for Goalkeepers by Goalkeepers',
 'Arrowhead Live!',
 'FBoL ',
 'UPSC Podcasts',
 'The Feminization Boudoir',
 'Chastity and The City',
 'They Had to Go Out',
 'The Good Sign ',
 'Behaviorbabe ',
 'In the Pocket Podcast',
 'The Willing Equine',
 'Revise - GCSE Biology Revision',
 'The Pole Dance Podcast',
 'eCommerce Lifestyle',
 'MOMHOOD',
 'This Particular Album is Very, Very Important to Me',
 'The 3-0 Take: a Dirt to Diamonds Baseball Podcast',
 'IMPACT RADIO',
 'The Cozy Minimalism Podcast',
 'Premier League Preview Podcast',
 'Fortunately Equipped with Arta Kublacova',
 'WHAT ABOUT BIGFOOT?',
 'The Survivor After Show Podcast',
 'G.A.R Capital Podcast ',
 'Athletic Truth Group Podcast',
 'Simple Farmhouse Life',
 'Bro Knights Podcast',
 'The Sports Corner',
 'Revolution Radio Podcast',
 'Daily Fortnite',
 'Chompers',
 'Big D Beatdown - An unbiased Cowboys football podcast',
 'How to Trade Stocks and Op