# importing data from database

In [1]:
import numpy as np
import pandas as pd
from SportsExperiencePlatform.data import connect_db, get_data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import save_npz, load_npz

In [8]:
conn = connect_db()

if conn:
    users_df, events_df, ahoy_events_df = get_data(conn)



In [3]:
users_df.shape, events_df.shape, ahoy_events_df.shape

((9, 17), (904, 17), (53, 6))

In [15]:
ahoy_events_df[ahoy_events_df['user_id'] == 70].properties.values[0]

{'user': 70, 'offer': 4393}

# advanced cleaning of data

In [16]:
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
from googletrans import Translator

In [17]:
# load spacy
nlp = spacy.load('en_core_web_sm')

def clean_string(text,stem="None"):

    final_string = ""

    # make lower
    text = text.lower()
    
    # remove http
    text = re.sub(r"http.*\.[a-z]{2,3}","",text)

    # remove www
    text = re.sub(r"www.*\.[a-z]{2,3}","",text)
    
    # remove line breaks
    text = re.sub(r'\n','',text)

    # remove puncuation
    translator = str.maketrans('','',string.punctuation)
    text = text.translate(translator)

    # remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    text_filtered = [word for word in text if not word in useless_words]

    # remove numbers
    text_filtered = [re.sub(r'\w*\d\w*','',w) for w in text_filtered]

    # stem or lemmatize
    if stem == 'Stem':
        stemmer = PorterStemmer() 
        text_stemmed = [stemmer.stem(y) for y in text_filtered]
    elif stem == 'Lem':
        lem = WordNetLemmatizer()
        text_stemmed = [lem.lemmatize(y) for y in text_filtered]
    elif stem == 'Spacy':
        text_filtered = nlp(' '.join(text_filtered))
        text_stemmed = [y.lemma_ for y in text_filtered]
    else:
        text_stemmed = text_filtered

    final_string = ' '.join(text_stemmed)

    return final_string
        
        

In [18]:
events_df['clean_description'] = events_df['description'].apply(lambda x: clean_string(x))
events_df['clean_title'] = events_df['title'].apply(lambda x: clean_string(x))

In [19]:
events_df['combined'] = events_df['clean_title'] + events_df['clean_description']


In [20]:
def generate_list(x):
    val = list(x.split())
    val = " ".join(val[:300])
    return val

In [21]:
events_df['combined_pruned'] = events_df['combined'].apply(lambda x: generate_list(x))

In [22]:
from googletrans import Translator
def language_translation(string):
    
    translator = Translator()
    if translator.detect(string).lang != 'en':
        result = translator.translate(string)
        return result.text
    else:
        return string

In [23]:
%%time
events_df['translated'] = events_df['combined_pruned'].apply(lambda x: language_translation(x))

CPU times: user 23.7 s, sys: 637 ms, total: 24.3 s
Wall time: 3min 44s


In [24]:
events_df['translated'][0]

'kickerworld berlinberlins modern hall indoor soccer unique blue artificial turf latest generation offers optimal conditions incomparable playing experience soccer courts extend area meters separated professional barrier system beach court outdoor area perfect thrilling game beach soccer volleyball summer long refreshing drink snack sporting activities welcome beach bar garden restaurant depending weather'

In [25]:
ahoy_events_df.user_id.unique()

array([66, 70])

## tfidf on clean data

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf = TfidfVectorizer(stop_words="english")

In [27]:
tfidf_matrix = tfidf.fit_transform(events_df["translated"])
tfidf_matrix.shape

(904, 7622)

In [28]:
cosine_sim = tfidf_matrix.dot(tfidf_matrix.T)

In [29]:
cosine_sim.shape

(904, 904)

# updating event prediction: finalizing stuff

In [30]:
def haversine_vectorized(user_lat,
                         user_lon,
                         event_lat,
                         event_lon):
    """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df
        Computes distance in kms
    """

    lat_1_rad, lon_1_rad = np.radians(user_lat), np.radians(user_lon)
    lat_2_rad, lon_2_rad = np.radians(event_lat), np.radians(event_lon)
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

In [31]:
def get_event_id(user_id):
    ''' this finction grabs the title of an event used by an user from ahoy_events dataset'''
    
    event_id = ahoy_events_df[ahoy_events_df['user_id']== user_id].iloc[-1].properties['offer']
    # now we get the title of the event
    #event_id = events_df[events_df['id']==event_idx].title
    
    return event_id 



In [33]:
get_event_id(66)

4555

In [34]:
import datetime
from datetime import date

def get_event_id(user_id, df = ahoy_events_df):
    ''' this finction grabs the title of an event used by an user from ahoy_events dataset'''
    
    event_id = ahoy_events_df[ahoy_events_df['user_id']== user_id].iloc[-1].properties['offer']
    # now we get the title of the event
    #event_id = events_df[events_df['id']==event_idx].title
    
    return event_id 

def get_user_loc(user_id, df = users_df):   
    ''' this finction grabs the location of an user from ahoy_events dataset'''
    
    users_idx = users_df[users_df['id']== user_id]
    # now we get the user's latitude and longitude
    
    location_dict = {'user_latitude' : float(users_idx.latitude.values), 
                     
                     'user_longitude' : float(users_idx.longitude.values)
                        }
     
    return location_dict


def content_recommender(user_id, cosine_sim = cosine_sim, df = events_df):
    '''This recommender finds out 100 closest matches with a given event'''
    
    #calling user id from ahoy_events
    
    event_id = get_event_id(user_id, df = ahoy_events_df)
    
    event_idx = events_df[events_df['id']== get_event_id(user_id)].index

    
    sim_scores = list(enumerate(cosine_sim[event_idx].todense().tolist()[0]))
    # list of set [(index, sim score), (index, sim score), ...]
    
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    # Sort the list of set by sim score, index with highest simlirarity
    # Will be at the beginning of the list
    
    sim_scores = sim_scores[1:101] # top 100 events (first one is our input events so we ignore it)
    
    sport_indices = [i[0] for i in sim_scores] # we grab the indices of those 10 events
    
    df_all_recommendations = df.iloc[sport_indices][[ 'id','title', 'latitude', 'longitude', 'offer_date']]
    
    df_date_filter = df_all_recommendations[df_all_recommendations['offer_date'] >= datetime.date.today()]
    
    #calcullation of haversine distance for users:
    user_latitude = get_user_loc(user_id, df = users_df)['user_latitude']
    user_longitude = get_user_loc(user_id, df = users_df)['user_longitude']
    
    df_date_filter['user_latitude'] = user_latitude
    df_date_filter['user_longitude'] = user_longitude
    
    df_date_filter["distance"] = haversine_vectorized(df_date_filter.latitude, df_date_filter.longitude,
                                                      df_date_filter.user_latitude, df_date_filter.user_longitude)
            
    df_location_filter = df_date_filter[df_date_filter['distance'] <= 200.0]
    
    
                                                
    return df_location_filter
#     return user_latitude

In [38]:
content_recommender(70)

Unnamed: 0,id,title,latitude,longitude,offer_date,user_latitude,user_longitude,distance
263,4395,+FREE Meditation & Yoga (vor Ort und Online)...,47.861961,12.12614,2022-06-09,48.135124,11.581981,50.615362
322,4397,+FREE Meditation & Yoga (vor Ort und Online)...,47.861961,12.12614,2022-06-16,48.135124,11.581981,50.615362
323,4398,+FREE Meditation & Yoga (vor Ort und Online)...,47.861961,12.12614,2022-06-23,48.135124,11.581981,50.615362
854,4942,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-08,48.135124,11.581981,192.137549
855,4943,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-09,48.135124,11.581981,192.137549
857,4945,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-16,48.135124,11.581981,192.137549
858,4946,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-17,48.135124,11.581981,192.137549
860,4948,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-21,48.135124,11.581981,192.137549
861,4949,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-14,48.135124,11.581981,192.137549
862,4950,Online: Meditation - Zeit für Dich,48.773899,9.16099,2022-06-20,48.135124,11.581981,192.137549
