## SENG 474 Data Mining Project

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

data = pd.read_csv('google-play-store-apps/googleplaystore.csv')

## Data Preprocessing

After importing the data set, the columns that won't be required for the mining and recommendation engine are dropped. The size of the app, last updated date, current version number and the android version number do not have an effect on the analysis and on the recommender engine. Hence these columns are dropped. The dataset after this cleanup is shown below.

In [2]:
to_drop = ['Size',
          'Last Updated',
          'Current Ver',
          'Android Ver']
data.drop(to_drop, inplace=True, axis=1)
data.head()


Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,"10,000+",Free,0,Everyone,Art & Design
1,Coloring book moana,ART_AND_DESIGN,3.9,967,"500,000+",Free,0,Everyone,Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,"5,000,000+",Free,0,Everyone,Art & Design
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,"50,000,000+",Free,0,Teen,Art & Design
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,"100,000+",Free,0,Everyone,Art & Design;Creativity


### Data Cleanup

It was found that a lot of the numberical data from the data set were stored as strings, some even containing charecters like "&" and "+". Hence as part of the mining process, the data was cleaned by removing inconsistent rows, invalid characters and by converting string numbers to actual numeric values to aid in the numerical analysis. The data after cleanup is shown below.

In [3]:
def clean_data(data):
    # Fixing Price
    data = data.where(data['Price'] != "Everyone")
    data["Price"] = data["Price"].str.replace("$", '')
    data["Price"] = pd.to_numeric(data["Price"])

    # Fixing Installs
    data["Installs"] = data["Installs"].str.replace(",", '')
    data["Installs"] = data["Installs"].str.replace("+", '')
    data["Installs"] = pd.to_numeric(data["Installs"])

    data["Reviews"] = pd.to_numeric(data["Reviews"])
    data['Genres'] = data['Genres'].fillna('')
    return data

data = clean_data(data)

# data['AG'] = data[['App', 'Genres']].apply(lambda x: ''.join(x), axis=1)
data["AppName_Genres"] = data["App"].map(str) + " - " + data["Genres"]
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,AppName_Genres
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159.0,10000.0,Free,0.0,Everyone,Art & Design,Photo Editor & Candy Camera & Grid & ScrapBook...
1,Coloring book moana,ART_AND_DESIGN,3.9,967.0,500000.0,Free,0.0,Everyone,Art & Design;Pretend Play,Coloring book moana - Art & Design;Pretend Play
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510.0,5000000.0,Free,0.0,Everyone,Art & Design,"U Launcher Lite – FREE Live Cool Themes, Hide ..."
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644.0,50000000.0,Free,0.0,Teen,Art & Design,Sketch - Draw & Paint - Art & Design
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967.0,100000.0,Free,0.0,Everyone,Art & Design;Creativity,Pixel Draw - Number Art Coloring Book - Art & ...


## Data Mining

### Genres Based Recommender System

For providing an app recommendation, the similarity between apps based on a certain metric is computed. For this recommendation system the main metric used is the "Genre" of the app. Content based filtering is done on the dataset to retrieve apps that are most similar to each other based on Genre. 

### TF-IDF Matrix

A fitted term frequency–inverse document frequency is generated for the apps' generes. The vocabulary set for the dataset can be seen below.


In [4]:
# GENRE BASED RECOMMENDER
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(data['Genres'])

print(tf.vocabulary_) 


{'art': 10, 'design': 44, 'art design': 11, 'pretend': 117, 'play': 109, 'design pretend': 47, 'pretend play': 118, 'creativity': 41, 'design creativity': 46, 'action': 0, 'adventure': 3, 'design action': 45, 'action adventure': 2, 'auto': 14, 'vehicles': 151, 'auto vehicles': 15, 'beauty': 16, 'books': 21, 'reference': 128, 'books reference': 22, 'business': 25, 'comics': 37, 'comics creativity': 38, 'communication': 39, 'dating': 42, 'education': 52, 'education education': 56, 'education creativity': 55, 'music': 96, 'video': 152, 'education music': 57, 'music video': 99, 'education action': 53, 'education pretend': 58, 'brain': 23, 'games': 79, 'education brain': 54, 'brain games': 24, 'entertainment': 65, 'entertainment music': 70, 'entertainment brain': 67, 'entertainment creativity': 68, 'events': 72, 'finance': 73, 'food': 77, 'drink': 48, 'food drink': 78, 'health': 80, 'fitness': 74, 'health fitness': 81, 'house': 83, 'home': 82, 'house home': 84, 'libraries': 85, 'demo': 43, 

The inverse document frequency can be seen here

In [5]:
txt_fitted = tf.fit(data['Genres'])
idf = tf.idf_
print(dict(zip(txt_fitted.get_feature_names(), idf)))
tfidf_matrix.shape

{'action': 4.068606491748914, 'action action': 7.400811001924119, 'action adventure': 5.356708826689592, 'adventure': 4.9112854062798235, 'adventure action': 7.652125430205025, 'adventure brain': 9.598035579260339, 'adventure education': 9.192570471152173, 'arcade': 4.818912086148808, 'arcade action': 7.457969415764067, 'arcade pretend': 9.598035579260339, 'art': 6.042687517770925, 'art design': 6.042687517770925, 'audio': 9.598035579260339, 'audio music': 9.598035579260339, 'auto': 5.836835463566776, 'auto vehicles': 5.836835463566776, 'beauty': 6.302198713256009, 'board': 6.132299676460612, 'board action': 8.904888398700393, 'board brain': 7.5185940375805025, 'board pretend': 9.598035579260339, 'books': 4.831597245676124, 'books reference': 4.831597245676124, 'brain': 6.028502882778968, 'brain games': 6.028502882778968, 'business': 4.157784716823635, 'card': 6.339939041238856, 'card action': 9.192570471152173, 'card brain': 9.598035579260339, 'casino': 6.602303305706347, 'casual': 4.

(10841, 156)

### Similarity Scores

With the TF-IDF matrix, similarity scores for the recommendation were calculated. For this project the cosine similarity scoring was used. 

In [6]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]

array([1.       , 0.6101111, 1.       , ..., 0.       , 0.       ,
       0.       ])

By doing so, we now have a pairwise cosine similarity matrix for all the apps in the dataset.

The function `get_recommendation` uses the calculated similarity scores to get top 50 recommendations for the given app name by getting a sorted list of the similarity scores and mapping them to the apps by indices. 

These 50 recommended apps are then ranked based on their ratings and reviews by the `ranked_recommendation` function that implements a Bayesian  estimate to finally return the top 100 recommended apps for the given app.

In [7]:
data = data.reset_index()
indices = pd.Series(data.index, index=data['App'])


def get_recommendations(app_name):
    idx = indices[app_name]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]
    app_indices = [i[0] for i in sim_scores]
    new = data.iloc[app_indices]
    return ranked_recommendations(new)


def ranked_recommendations(recommended_data):
    C = recommended_data['Rating'].mean()
    m = recommended_data['Reviews'].quantile(0.10)
    final_apps = recommended_data.copy().loc[data['Reviews'] >= m]
    final_apps.shape
    
    def weighted_rating(x, m=m, C=C):
        v = x['Reviews']
        R = x['Rating']
        return (v /(v + m) * R) + (m / (m + v) * C)

    final_apps['score'] = final_apps.apply(weighted_rating, axis=1)
    final_apps = final_apps.sort_values('score', ascending=False)

    return final_apps



## Results


In [8]:
get_recommendations('German Vocabulary Trainer').head(100)

Unnamed: 0,index,App,Category,Rating,Reviews,Installs,Type,Price,Content Rating,Genres,AppName_Genres,score
707,707,English Grammar Test,EDUCATION,4.8,4075.0,500000.0,Free,0.0,Everyone,Education,English Grammar Test - Education,4.733884
737,737,"Learn languages, grammar & vocabulary with Mem...",EDUCATION,4.7,1107903.0,10000000.0,Free,0.0,Everyone,Education,"Learn languages, grammar & vocabulary with Mem...",4.699783
703,703,Learn English with Wlingua,EDUCATION,4.7,314299.0,10000000.0,Free,0.0,Everyone,Education,Learn English with Wlingua - Education,4.699238
736,736,English with Lingualeo,EDUCATION,4.7,254519.0,5000000.0,Free,0.0,Everyone,Education,English with Lingualeo - Education,4.69906
761,761,Learn Java,EDUCATION,4.7,52743.0,1000000.0,Free,0.0,Everyone,Education,Learn Java - Education,4.695522
706,706,Learn Spanish - Español,EDUCATION,4.7,32346.0,1000000.0,Free,0.0,Everyone,Education,Learn Spanish - Español - Education,4.692772
731,731,English words application mikan,EDUCATION,4.7,9888.0,500000.0,Free,0.0,Everyone,Education,English words application mikan - Education,4.677678
701,701,English Communication - Learn English for Chin...,EDUCATION,4.7,2544.0,100000.0,Free,0.0,Everyone,Education,English Communication - Learn English for Chin...,4.629608
740,740,Quizlet: Learn Languages & Vocab with Flashcards,EDUCATION,4.6,211856.0,10000000.0,Free,0.0,Everyone,Education,Quizlet: Learn Languages & Vocab with Flashcar...,4.599279
739,739,TED,EDUCATION,4.6,181893.0,10000000.0,Free,0.0,Everyone 10+,Education,TED - Education,4.599161


In [9]:
def count_unique(col):
    unique, counts = np.unique(col, return_counts=True)
    return np.array(list(zip(unique, counts)), dtype=object)

X = data.values

def plot_bar(col, title, color=None):
    category = count_unique(col)
    fig, ax = plt.subplots(figsize=(18,8))
    y_pos = np.arange(len(category[:,0]))
    plt.bar(y_pos, category[:,1], color=color)
    plt.xticks(y_pos, category[:, 0])
    plt.xticks(rotation=75)
    fig.suptitle(title, fontsize=20)
    
def plot_pie(col, title, color=None):
    category = count_unique(col)
    fig, ax = plt.subplots(figsize=(18,8))
    plt.pie(category[:,1], autopct='%.2f%%', labels=category[:,0])
    fig.suptitle(title, fontsize=20)
    
plot_bar(X[:, 1], 'Distribution of Categories')

TypeError: '<' not supported between instances of 'str' and 'float'

In [None]:
plot_bar(X[:, 2], 'Distribution of Ratings', 'C1')

In [None]:
plot_bar(X[:, 4], 'Distribution of Installs', 'C7')

In [None]:
plot_pie(X[:, 5], 'Type', 'C2')

In [None]:
plot_bar(X[:, 6], 'Price', 'C3')

In [None]:
plot_bar(X[X[:, 6]>0][:, 6], 'Price Without Free', 'C3')

In [None]:
plot_bar(X[:, 7], 'Content Rating', 'C4')