In [None]:
#!pip install nltk

In [86]:
import numpy as np
import pandas as pd
from SportsExperiencePlatform.data import connect_db, get_data, upload_file_to_gs
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import save_npz, load_npz
import re
from langdetect import detect
from googletrans import Translator
import random
import datetime
from datetime import date
import argostranslate.package, argostranslate.translate
import json

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [27]:
conn = connect_db()

if conn:
    users_df, events_df, ahoy_events_df = get_data(conn)



In [54]:
conn = connect_db()
users_df, _, _ = get_data(conn)



In [3]:
translator = Translator()

In [4]:
translator.translate("ich bin gesund").text

'I am healthy'

In [25]:
from_code = "de"
to_code = "en"

# Download and install Argos Translate package
available_packages = argostranslate.package.get_available_packages()

available_package = list(
    filter(
        lambda x: x.from_code == from_code and x.to_code == to_code, available_packages
    )
)[0]
download_path = available_package.download()
argostranslate.package.install_from_path(download_path)

# Translate
installed_languages = argostranslate.translate.get_installed_languages()
from_lang = list(filter(
    lambda x: x.code == from_code,
    installed_languages))[0]
to_lang = list(filter(
    lambda x: x.code == to_code,
    installed_languages))[0]
translation = from_lang.get_translation(to_lang)

In [5]:
def remove_punctuation(text):
    for s in text:
        if s in string.punctuation:
            text = text.replace(s, '')

    return text

In [6]:
def lower_text(text):
    return text.lower()

In [7]:
def remove_numbers(text):
    return ''.join(word for word in text  if not word.isdigit())

In [8]:
def remove_stop_words(text, language='english'):
    lang = detect(text)
    if lang != 'en':
        language = 'german'
    stop_words = set(stopwords.words(language))
    word_tokens = word_tokenize(text)
    
    return ' '.join([w for w in word_tokens if not w in stop_words])

In [9]:
def remove_other_stuff(text):
    # remove http
    text = re.sub(r"http.*\.[a-z]{2,3}","",text)

    # remove www
    text = re.sub(r"www.*\.[a-z]{2,3}","",text)
    
    # remove line breaks
    text = re.sub(r'\n','',text)
    
    return text

In [26]:
def language_translation(string, translator=translation):
    if detect(string) != 'en':
        translatedText = translation.translate(string)
        return translatedText
    else:
        return string

In [11]:
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()

    return ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])

In [12]:
def random_generate_list(x):
    names = list(x.split())
    random.shuffle(names)
    if len(names) > 5:
        names = names[:5]
            
    return names

In [13]:
def random_generate_long_list(x):
    names = list(x.split())
    random.shuffle(names)
    if len(names) > 200:
        names = names[:200]
            
    return names

In [14]:
def create_soup(x):
    soup = (" ".join(x["random_title"]) + " " + " ".join(x["random_description"])).split()
    random.shuffle(soup)
    return ' '.join(soup)

In [15]:
def haversine_vectorized(user_lat,
                         user_lon,
                         event_lat,
                         event_lon):
    """
        Calculate the great circle distance between two points
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df
        Computes distance in kms
    """

    lat_1_rad, lon_1_rad = np.radians(user_lat), np.radians(user_lon)
    lat_2_rad, lon_2_rad = np.radians(event_lat), np.radians(event_lon)
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371 * c

In [28]:
events_df['description_translated'] = events_df.description.apply(language_translation, translator=translation)
events_df['title_translated'] = events_df.title.apply(language_translation, translator=translation)
events_df[['description', 'description_translated']].head()

Unnamed: 0,description,description_translated
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,Two lanes are reserved for us from 7 pm at the...
3,"""Jess X Goh guided me in a process that made m...","""Jess X Goh guided me in a process that made m..."
4,Beginners welcome to join Gaelic football trai...,Beginners welcome to join Gaelic football trai...


In [29]:
events_df['description_punct'] = events_df.description_translated.apply(remove_punctuation)
events_df['title_punct'] = events_df.title_translated.apply(remove_punctuation)
events_df[['description', 'description_punct']].head()

Unnamed: 0,description,description_punct
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,Two lanes are reserved for us from 7 pm at the...
3,"""Jess X Goh guided me in a process that made m...",Jess X Goh guided me in a process that made me...
4,Beginners welcome to join Gaelic football trai...,Beginners welcome to join Gaelic football trai...


In [30]:
events_df['description_lower'] = events_df.description_punct.apply(lower_text)
events_df['title_lower'] = events_df.title_punct.apply(lower_text)
events_df[['description', 'description_lower']].head()

Unnamed: 0,description,description_lower
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,two lanes are reserved for us from 7 pm at the...
3,"""Jess X Goh guided me in a process that made m...",jess x goh guided me in a process that made me...
4,Beginners welcome to join Gaelic football trai...,beginners welcome to join gaelic football trai...


In [31]:
events_df['description_numbers'] = events_df.description_lower.apply(remove_numbers)
events_df['title_numbers'] = events_df.title_lower.apply(remove_numbers)
events_df[['description', 'description_numbers']].head()

Unnamed: 0,description,description_numbers
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,two lanes are reserved for us from pm at the ...
3,"""Jess X Goh guided me in a process that made m...",jess x goh guided me in a process that made me...
4,Beginners welcome to join Gaelic football trai...,beginners welcome to join gaelic football trai...


In [32]:
events_df['description_stopwords'] = events_df.description_numbers.apply(remove_stop_words, language='english')
events_df['title_stopwords'] = events_df.title_numbers.apply(remove_stop_words, language='english')
events_df[['description', 'description_stopwords']].head()

Unnamed: 0,description,description_stopwords
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,two lanes reserved us pm gildebowling wandsbek...
3,"""Jess X Goh guided me in a process that made m...",jess x goh guided process made unleash suppres...
4,Beginners welcome to join Gaelic football trai...,beginners welcome join gaelic football trainin...


In [33]:
events_df['description_others'] = events_df.description_stopwords.apply(remove_other_stuff)
events_df['title_others'] = events_df.title_stopwords.apply(remove_other_stuff)
events_df[['description', 'description_others']].head()

Unnamed: 0,description,description_others
0,no description,no description
1,no description,no description
2,Zwei Bahnen sind für uns ab 19 Uhr im Gilde-Bo...,two lanes reserved us pm gildebowling wandsbek...
3,"""Jess X Goh guided me in a process that made m...",jess x goh guided process made unleash suppres...
4,Beginners welcome to join Gaelic football trai...,beginners welcome join gaelic football trainin...


In [34]:
events_df.to_csv("events_df_savepoint.csv")

In [35]:
events_df = pd.read_csv("events_df_savepoint.csv")

In [None]:
#events_df['description_lemmatize'] = events_df.description_stopwords.apply(lemmatize_text)
#events_df[['description', 'description_lemmatize']].head()

In [36]:
events_df['random_title'] = events_df['title_others'].apply(lambda x: random_generate_list(x))
events_df['random_description'] = events_df['description_others'].apply(lambda x: random_generate_long_list(x))

In [37]:
events_df['soup'] = events_df.apply(create_soup, axis = 1)

In [38]:
tfidf = TfidfVectorizer(stop_words="english")

In [39]:
tfidf_matrix = tfidf.fit_transform(events_df["soup"])
tfidf_matrix.shape

(882, 7183)

In [40]:
cosine_sim = tfidf_matrix.dot(tfidf_matrix.T)

In [41]:
def get_title(user_id, df):
    ''' this finction grabs the title of an event used by an user from ahoy_events dataset'''
    
    searched_events = []
    for event in df[df['user_id'] == user_id].properties.values:
        searched_events.append(event['offer'])

    searched_events = np.unique(searched_events)

    titles = events_df[events_df['id'].isin(searched_events)].title
    
    return titles

In [55]:
print(get_title(59, ahoy_events_df))

202    Outdoor fitness and Irish hurling MeetUp. Begi...
234                              Running for early Birds
236         Berlin Outdoor Fitness Volkspark Wilmersdorf
266    High-Intensity Interval Training (Cardio & Str...
467                Monday expat social volleyball, 7:00.
476    Let's play volleyball and have fun! (DE PIJP A...
477    Let's play volleyball and have fun! (DE PIJP A...
674                           Indoor football - Fussball
677                                  Thursday Volleyball
678        1 hour training and 1 hour playing volleyball
851                           Indoor football - Fussball
Name: title, dtype: object


In [43]:
def get_user_loc(user_id, df = users_df):   
    ''' this finction grabs the location of an user from ahoy_events dataset'''
    
    users_idx = users_df[users_df['id']== user_id]
    # now we get the user's latitude and longitude
    
    location_dict = {'user_latitude' : float(users_idx.latitude.values), 
                     
                     'user_longitude' : float(users_idx.longitude.values)
                        }
     
    return location_dict

In [97]:
titles = get_title(59, ahoy_events_df)
sim_scores = []
for idx in events_df[events_df['title'].isin(titles)].index.unique():
    sim_scores.extend(list(enumerate(cosine_sim[idx].todense().tolist()[0])))

sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

In [105]:
events_df.offer_date

0      2022-06-16
1      2022-07-07
2      2022-06-25
3      2022-07-22
4      2022-06-08
          ...    
877    2022-06-17
878    2022-06-19
879    2022-06-22
880    2022-06-08
881    2022-06-22
Name: offer_date, Length: 882, dtype: object

In [59]:
def content_recommender(user_id, cosine_sim, events_df):
    '''This recommender finds out 20 closest matches with a given event'''
    
    #calling user id from ahoy_events
    
    titles = get_title(user_id, df = ahoy_events_df)
    
    sim_scores = []
    for idx in events_df[events_df['title'].isin(titles)].index.unique():
        sim_scores.extend(list(enumerate(cosine_sim[idx].todense().tolist()[0])))
    
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    # Sort the list of set by sim score, index with highest simlirarity
    # Will be at the beginning of the list
    
    sim_scores = sim_scores[1:101] # top 100 events (first one is our input events so we ignore it)
    
    sport_indices = [i[0] for i in sim_scores] # we grab the indices of those 10 events
    
    df_all_recommendations = events_df.iloc[sport_indices][['id', 'title', 'latitude', 'longitude', 'offer_date']]
    
    df_date_filter = df_all_recommendations[df_all_recommendations['offer_date'] >= '2022-06-11']
    
    #calcullation of haversine distance for users:
    user_latitude = get_user_loc(user_id, df = users_df)['user_latitude']
    user_longitude = get_user_loc(user_id, df = users_df)['user_longitude']
    
    df_date_filter['user_latitude'] = user_latitude
    df_date_filter['user_longitude'] = user_longitude
    
    df_date_filter["distance"] = haversine_vectorized(df_date_filter.latitude, df_date_filter.longitude,
                                                      df_date_filter.user_latitude, df_date_filter.user_longitude)
            
    df_location_filter = df_date_filter[df_date_filter['distance'] <= 200.0]
    
    
                                                
    #return np.unique(df_location_filter.id.values)
    return np.unique(df_date_filter.id.values)

In [None]:
save_npz("model_cosine_sim", cosine_sim)

In [None]:
matrix_reloaded = load_npz("model_cosine_sim.npz")

In [None]:
matrix_reloaded.shape, cosine_sim.shape

In [None]:
events_df[['description']].duplicated().sum()

In [73]:
content_recommender(59, cosine_sim, events_df).tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter['user_latitude'] = user_latitude
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter['user_longitude'] = user_longitude
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter["distance"] = haversine_vectorized(df_date_filter.latitude, df_date_filter.longitude,


[2531,
 2532,
 2560,
 2561,
 2581,
 2582,
 2584,
 2588,
 2591,
 2596,
 2777,
 2782,
 2787,
 2790,
 3007,
 3008,
 3009,
 3010,
 3015,
 3016,
 3017,
 3019,
 3020,
 3021,
 3022,
 3023,
 3024,
 3025]

In [79]:
result = {}
for user_id in users_df.id.values:
    events = content_recommender(user_id, cosine_sim, events_df).tolist()
    if user_id not in result.keys():
        result[str(user_id)] = events

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter['user_latitude'] = user_latitude
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter['user_longitude'] = user_longitude
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_date_filter["distance"] = haversine_vectorized(df_date_filter.latitude, df_date_filter.longitude,
A valu

In [80]:
result

{'55': [2336,
  2337,
  2338,
  2339,
  2376,
  2393,
  2397,
  2398,
  2461,
  2477,
  2511,
  2513,
  2518,
  2598,
  2599,
  2600,
  2660,
  2814,
  2944,
  2947,
  2957,
  2996,
  3086,
  3087,
  3088,
  3092,
  3093,
  3135,
  3193,
  3194,
  3202,
  3203],
 '56': [2364,
  2376,
  2487,
  2660,
  2717,
  2719,
  2731,
  2733,
  2737,
  2753,
  2754,
  2756,
  2794,
  2844,
  2860,
  2866,
  2873,
  2874,
  2875,
  2883,
  2885,
  2898,
  2900,
  2902,
  2903,
  2906,
  2907,
  2908,
  2909,
  2912,
  2913,
  2914,
  2917,
  2950,
  3012,
  3015,
  3016,
  3017,
  3022,
  3024,
  3050,
  3052,
  3054,
  3086,
  3087,
  3088,
  3092,
  3093,
  3098,
  3108,
  3109,
  3110,
  3111,
  3112,
  3115,
  3116,
  3117,
  3167,
  3193,
  3194,
  3198,
  3199,
  3212],
 '57': [],
 '58': [],
 '61': [],
 '62': [],
 '63': [],
 '64': [2342,
  2343,
  2346,
  2347,
  2353,
  2354,
  2379,
  2466,
  2470,
  2474,
  2602,
  2807,
  2814,
  3029,
  3032,
  3033,
  3035,
  3036,
  3037,
  3043,
  305

In [85]:
with open('recommender.json', 'w') as fp:
    json.dump(result, fp)

In [82]:
with open('recommender.json', 'r') as fp:
    data = json.load(fp)

In [87]:
upload_file_to_gs('recommender.json')