In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
#Scrape RSS feed for local Huntsville events
url = 'https://www.huntsville.org/event/rss/'
# Browser-like User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36'}
# The timeout keeps the cell from hanging forever on an unresponsive server.
page = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page as if it were the feed.
page.raise_for_status()
# NOTE(review): the feed is XML but is parsed with the HTML parser; in the
# displayed output <link/> is rendered as an empty tag followed by the URL text,
# so the event URL is not available as the text of the link tag.
soup = BeautifulSoup(page.content, 'html.parser')

In [3]:
# Display the parsed feed to inspect its structure (channel metadata followed by <item> entries).
soup

<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#" xmlns:georss="http://www.georss.org/georss" xmlns:media="http://search.yahoo.com/mrss/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:wfw="http://wellformedweb.org/CommentAPI/">
<channel>
<title>Events Calendar</title>
<atom:link href="http://www.huntsville.org/event/rss/" rel="self" type="application/rss+xml"></atom:link>
<link/>http://www.huntsville.org/event/rss/
		<description>Events going on in your town</description>
<pubdate>Tue, 27 Jul 2021 18:49:45 +0000</pubdate>
<item>
<title>Ballpark Tours</title>
<link/>https://www.huntsville.org/event/ballpark-tours/44182/
				
					<category><![CDATA[ Family Friendly ]]></category>
<category><![CDATA[ Outdoors ]]></catego

In [4]:
#Grab built in titles, categories, and the link to the official website
# Collect the <item> entries once, then pull each field out with its own comprehension.
items = soup.find_all('item')
title = [entry.title.text for entry in items]
# Each element of `cat` is the list of <category> tags (those with text) for one event.
cat = [entry('category', text=True) for entry in items]
# The event URL is read from <guid>.
links = [entry.guid.text for entry in items]

print("Event Title:", title[0],'\n', "Event Category: ", cat[0])

Event Title: Ballpark Tours 
 Event Category:  [<category><![CDATA[ Family Friendly ]]></category>, <category><![CDATA[ Outdoors ]]></category>, <category><![CDATA[ Sports ]]></category>, <category><![CDATA[ Tours ]]></category>]


In [5]:
#I'll be grabbing the description and running NLP on it to get keywords later.
#So ignore this for now.
descript = []
for entry in soup.find_all('item'):
    description_text = entry.description.text
    # Drop tabs, newlines, carriage returns and paragraph tags, in that order.
    for unwanted in ('\t', '\n', '\r', '<p>', '</p>'):
        description_text = description_text.replace(unwanted, '')
    descript.append(description_text)

In [6]:
#Create a dataframe
# One row per event: (title, category tags, link).
column_names = ['EventTitle','EventCategories','EventLink']
event_rows = list(zip(title, cat, links))
data = pd.DataFrame(event_rows, columns=column_names)

In [7]:
# Preview the first rows of the assembled events frame.
data.head()

Unnamed: 0,EventTitle,EventCategories,EventLink
0,Ballpark Tours,"[[ Family Friendly ], [ Outdoors ], [ Sports ]...",https://www.huntsville.org/event/ballpark-tour...
1,COLUMBUS SHIPS PINTA AND NINA TOURS,"[[ Family Friendly ], [ Historical ], [ Travel...",https://www.huntsville.org/event/columbus-ship...
2,Encounters: Greely Myatt,"[[ Art ], [ Historical ], [ Traveling Exhibits ]]",https://www.huntsville.org/event/encounters%3a...
3,GEOQUEST!,"[[ Family Friendly ], [ Historical ], [ Outdoo...",https://www.huntsville.org/event/geoquest!/44507/
4,Jack Mitchell: Artists,"[[ Art ], [ Traveling Exhibits ]]",https://www.huntsville.org/event/jack-mitchell...


In [8]:
# Convert every column to str so the category tag lists become plain text
# that the regex cleanup in the next cell can operate on.
data = data.astype(str)

In [9]:
# Strip the markup left over from stringifying the <category> tags: brackets,
# angle brackets, '!', '/', ':' and the literal words 'category' and 'CDATA'.
# Raw strings pass the backslashes straight to the regex engine; the non-raw
# forms ('\[' etc.) are invalid string escapes that trigger warnings.
markup_patterns = [r'\[', r'\]', r'\<', r'\>', r'\!', 'category', 'CDATA', r'\/', r'\:']
data['EventCategories'] = data['EventCategories'].replace(markup_patterns, '', regex=True)
# Remove the commas that separated the tags in the stringified list.
data['EventCategories'] = data['EventCategories'].replace(',', '', regex=True)
# Keep `data` readable for display; further text normalisation happens on a copy.
dataTest = data.copy()

The couple of cells below prepare the words for cosine similarity processing. This turns phrases like Ballpark City Tours into ballparkcitytours so that when we run cosine similarity on it, it will be processed as one word instead of multiple, unrelated ones.

In [10]:
def clean_data(x):
    """Lower-case ``x`` and remove its space characters.

    A list is cleaned element by element (returning a new list), a string is
    cleaned directly, and any other type yields an empty string.
    """
    if isinstance(x, list):
        return [element.replace(" ", "").lower() for element in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [11]:
features = ['EventTitle','EventCategories']
# Replace each listed column with its cleaned version; other columns are left as they are.
dataTest = dataTest.assign(
    **{column: dataTest[column].apply(clean_data) for column in features}
)

In [12]:
# Show the first three rows to check the lower-cased, space-free text.
dataTest.head(3)

Unnamed: 0,EventTitle,EventCategories,EventLink
0,ballparktours,familyfriendlyoutdoorssportstours,https://www.huntsville.org/event/ballpark-tour...
1,columbusshipspintaandninatours,familyfriendlyhistoricaltravelingexhibits,https://www.huntsville.org/event/columbus-ship...
2,encounters:greelymyatt,arthistoricaltravelingexhibits,https://www.huntsville.org/event/encounters%3a...


These next couple of cells create one column with each row containing the title and categories.

In [13]:
def create_soup(x):
    """Combine a row's title and categories into one space-separated string.

    ``x`` is indexed by the keys ``'EventTitle'`` and ``'EventCategories'``.
    Each value is passed through ``''.join``, so a string is kept as it is and
    a list of strings is concatenated without a separator.
    """
    title_text = ''.join(x['EventTitle'])
    category_text = ''.join(x['EventCategories'])
    return f"{title_text} {category_text}"

In [14]:
# Build the combined title + categories text for every event.
# NOTE(review): create_soup is applied to `data`, not `dataTest`, so the soup is
# built from the original mixed-case, space-containing text and the clean_data
# step has no effect on what gets vectorized -- confirm this is intended.
dataTest['soup'] = data.apply(create_soup, axis=1)
dataTest[['soup']].head()

Unnamed: 0,soup
0,Ballpark Tours Family Friendly Outdoors S...
1,COLUMBUS SHIPS PINTA AND NINA TOURS Family Fr...
2,Encounters: Greely Myatt Art Historical T...
3,GEOQUEST! Family Friendly Historical Outd...
4,Jack Mitchell: Artists Art Traveling Exhibits


These next few cells vectorize the text and then compute the cosine similarity between items, which should indicate how similar they are.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn each event's soup text into a row of token counts, skipping
# scikit-learn's built-in English stop words.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(dataTest['soup'])

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
# Similarity of every event against every event: entry [i, j] compares
# row i of count_matrix with row j.
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [17]:
# Reverse lookup: event title -> row label in `data`.
indices = pd.Series(data.index, index=data['EventTitle'])
# If several events share a title, keep only the first one so that
# indices[title] always yields a single row label rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [18]:
def get_recommendations(title, cosine_sim2=cosine_sim2, top_n=10):
    """Return the events most similar to the event named ``title``.

    Parameters
    ----------
    title : str
        Exact ``EventTitle`` value, looked up in the module-level ``indices``.
    cosine_sim2 : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of event ``i``
        against every event. The default is the module-level matrix as it
        existed when this function was defined.
    top_n : int, default 10
        Maximum number of similar events to return.

    Returns
    -------
    pandas.DataFrame
        The ``EventTitle``, ``EventCategories`` and ``EventLink`` columns of
        ``data`` for the most similar events, best match first. The queried
        event itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the event that matches the title
    idx = indices[title]
    # A title shared by several events makes the lookup return a Series;
    # fall back to the first matching event in that case.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Get the pairwise similarity scores of all *other* events with that event.
    # The queried event is excluded explicitly rather than assumed to sort
    # first, which does not hold when another event ties with it.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim2[idx])
        if position != idx
    ]

    # Sort the events based on the similarity scores (stable for ties)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the event indices of the top_n most similar events
    event_indices = [position for position, _ in sim_scores[:top_n]]

    # Return the most similar events
    return data[['EventTitle','EventCategories','EventLink']].iloc[event_indices]

Now, if you input one event, it will return the top 10 most similar events.

In [19]:
# Example: look up the events most similar to "Ballpark Tours".
get_recommendations('Ballpark Tours', cosine_sim2)

Unnamed: 0,EventTitle,EventCategories,EventLink
1,COLUMBUS SHIPS PINTA AND NINA TOURS,Family Friendly Historical Traveling Exhi...,https://www.huntsville.org/event/columbus-ship...
10,Rocket City Trash Pandas v. Birmingham Barons,Family Friendly Outdoors Sports,https://www.huntsville.org/event/rocket-city-t...
29,Downtown Walking Tours,Historical Huntsville History Month Tours,https://www.huntsville.org/event/downtown-walk...
8,Peak Butterfly Season,Family Friendly Outdoors,https://www.huntsville.org/event/peak-butterfl...
20,Night Blooms,Art Family Friendly Outdoors Seasonal -...,https://www.huntsville.org/event/night-blooms/...
19,Movies in the Park,Family Friendly Free Outdoors Seasonal ...,https://www.huntsville.org/event/movies-in-the...
22,$5 after 5 at EarlyWorks,Family Friendly Historical S.T.E.M.,https://www.huntsville.org/event/%245-after-5-...
13,Tweetsville,Family Friendly Outdoors Seasonal - Sprin...,https://www.huntsville.org/event/tweetsville/4...
3,GEOQUEST!,Family Friendly Historical Outdoors Sea...,https://www.huntsville.org/event/geoquest!/44507/
14,"Uncaged: Birds, Nature & You",Family Friendly Outdoors Seasonal - Fall ...,https://www.huntsville.org/event/uncaged%3a-bi...
