In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

data = pd.read_csv('datasets/tripadvisor_merged.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Address,Description,FeatureCount,Fee,Langtitude,Latitude,LengthOfVisit,MuseumName,PhoneNum,...,review_pol_var,review_pol_mean,review_pol_max,review_pol_min,review_sub_var,review_sub_mean,review_sub_max,review_sub_min,descri_pol,descri_sub
0,0,"555 Pennsylvania Ave NW, Washington DC, DC 200...",Find out for yourself why everyone is calling ...,3.0,Yes,-77.019235,38.893138,2-3 hours,Newseum,+1 888-639-7386,...,0.02376,0.334677,0.611111,0.0,0.014521,0.509036,0.754861,0.342857,0.49,0.506667
1,4,"1000 5th Ave, New York City, NY 10028-0198",At New York City's most visited museum and att...,12.0,Yes,-73.962928,40.779166,2-3 hours,The Metropolitan Museum of Art,1 212-535-7710,...,0.018057,0.318339,0.634375,0.127083,0.01828,0.537971,0.7,0.291667,0.318182,0.477273
2,5,"945 Magazine Street, New Orleans, LA 70130-3813","Founded by historian and author, Stephen Ambro...",11.0,,-90.070086,29.943004,,The National WWII Museum,+1 504-528-1944,...,0.010222,0.333493,0.5,0.140783,0.0191,0.508205,0.804861,0.333333,0.0,0.0
3,6,"2001 N Colorado Blvd, Denver, CO 80205-5798",The Denver Museum of Nature & Science is the R...,0.0,,-104.94102,39.769189,,Denver Museum of Nature & Science,303-370-6000,...,0.022625,0.429964,0.664286,0.192262,0.014845,0.584561,0.783333,0.438095,0.1,0.4
4,7,"111 S Michigan Ave, Chicago, IL 60603-6488","This Classical Renaissance structure, guarded ...",5.0,Yes,-87.623724,41.879547,More than 3 hours,Art Institute of Chicago,312 443 3600,...,0.030203,0.299117,0.681333,0.125,0.02538,0.530728,0.788333,0.266667,0.3,0.3375


In [2]:
required_col = ['Description',
               'MuseumName',
               'Rating',
               'ReviewCount',
                'Rank',
               'Country']
# to_drop = ['Size',
#           'Last Updated',
#           'Current Ver',
#           'Android Ver']
# data.drop(to_drop, inplace=True, axis=1)
# df = df[col_list]
data = data[required_col]
data.head()


Unnamed: 0,Description,MuseumName,Rating,ReviewCount,Rank,Country
0,Find out for yourself why everyone is calling ...,Newseum,4.5,6309.0,8.0,USA
1,At New York City's most visited museum and att...,The Metropolitan Museum of Art,5.0,36627.0,2.0,USA
2,"Founded by historian and author, Stephen Ambro...",The National WWII Museum,5.0,15611.0,1.0,USA
3,The Denver Museum of Nature & Science is the R...,Denver Museum of Nature & Science,4.5,2564.0,4.0,USA
4,"This Classical Renaissance structure, guarded ...",Art Institute of Chicago,5.0,15532.0,1.0,USA


In [3]:
def clean_data(data):
    # Fixing Price
    data = data.where(data['Price'] != "Everyone")
    data["Price"] = data["Price"].str.replace("$", '')
    data["Price"] = pd.to_numeric(data["Price"])

    # Fixing Installs
    data["Installs"] = data["Installs"].str.replace(",", '')
    data["Installs"] = data["Installs"].str.replace("+", '')
    data["Installs"] = pd.to_numeric(data["Installs"])

    data["Reviews"] = pd.to_numeric(data["Reviews"])
    data['Genres'] = data['Genres'].fillna('')
    
    data = data.dropna()
        
    return data

# data = clean_data(data)

# data['AG'] = data[['App', 'Genres']].apply(lambda x: ''.join(x), axis=1)
# data["AppName_Genres"] = data["App"].map(str) + " - " + data["Genres"]

data['Description'] = data['Description'].astype(str)
data["Rating"] = pd.to_numeric(data["Rating"])
data["ReviewCount"] = pd.to_numeric(data["ReviewCount"])
data.head()

Unnamed: 0,Description,MuseumName,Rating,ReviewCount,Rank,Country
0,Find out for yourself why everyone is calling ...,Newseum,4.5,6309.0,8.0,USA
1,At New York City's most visited museum and att...,The Metropolitan Museum of Art,5.0,36627.0,2.0,USA
2,"Founded by historian and author, Stephen Ambro...",The National WWII Museum,5.0,15611.0,1.0,USA
3,The Denver Museum of Nature & Science is the R...,Denver Museum of Nature & Science,4.5,2564.0,4.0,USA
4,"This Classical Renaissance structure, guarded ...",Art Institute of Chicago,5.0,15532.0,1.0,USA


In [4]:
# GENRE BASED RECOMMENDER
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(data['Description'])

# print(tf.vocabulary_) 


In [5]:
txt_fitted = tf.fit(data['Description'])
idf = tf.idf_
# print(dict(zip(txt_fitted.get_feature_names(), idf)))
tfidf_matrix.shape

(1603, 45988)

In [6]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.        , 0.00574816, 0.00535346, ..., 0.011308  , 0.        ,
       0.        ])

In [7]:
data = data.reset_index()
indices = pd.Series(data.index, index=data['MuseumName'])


def get_recommendations(app_name):
    idx = indices[app_name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    app_indices = [i[0] for i in sim_scores]
    new = data.iloc[app_indices]
#     return new[1:20]
    return ranked_recommendations(new[1:10])


def ranked_recommendations(recommended_data):
    C = recommended_data['Rating'].mean()
    m = recommended_data['ReviewCount'].quantile(0.10)
    final_apps = recommended_data.copy().loc[data['ReviewCount'] >= m]
    final_apps.shape
    
    def weighted_rating(x, m=m, C=C):
        v = x['Rating']
        R = x['ReviewCount']
        return (v /(v + m) * R) + (m / (m + v) * C)

    final_apps['score'] = final_apps.apply(weighted_rating, axis=1)
    final_apps = final_apps.sort_values('score', ascending=False)

    return final_apps



In [8]:
get_recommendations('New York State Museum').head(100)

Unnamed: 0,index,Description,MuseumName,Rating,ReviewCount,Rank,Country,score
1,1,At New York City's most visited museum and att...,The Metropolitan Museum of Art,5.0,36627.0,2.0,USA,832.314195
1367,1367,This satellite gallery of the famous Guggenhei...,Peggy Guggenheim Collection,4.5,5306.0,12.0,Other,112.595831
487,487,Just a short walk from Grand Central and Penn ...,The Morgan Library & Museum,4.5,1117.0,1.0,USA,27.183507
208,208,"The New York Transit Museum, one of the city's...",New York Transit Museum,4.5,627.0,13.0,USA,17.192569
481,481,"A museum, library, and research center that in...",The Wolfsonian - Florida International University,4.5,438.0,15.0,USA,13.338922
113,113,A trip to New York wouldn't be complete withou...,New-York Historical Society Museum & Library,4.0,479.0,119.0,USA,13.119437
505,505,The Jewish Museum of New York offers acclaimed...,The Jewish Museum,4.5,274.0,108.0,USA,9.995016
550,550,The Rubin Museum of Art is an arts oasis and c...,Rubin Museum of Art,4.5,232.0,82.0,USA,9.13865


## Evaluation



## Conclusion



## References

[1] https://www.kaggle.com/lava18/google-play-store-apps#googleplaystore_user_reviews.csv
