In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
games_df = pd.read_csv('dataset_binary_games.csv')
games_df = shuffle(games_df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
games_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,is_games
0,1103702902,Red 7 - play Digital Red7 Card Game with Friends,2.99,4,games,1,red official digital mobile version cult card ...,1
1,337056601,n-tv Nachrichten,0.00,12,news,1,die die das den touch das apple die apple watc...,0
2,1165391697,Crazy Clown Chase,0.00,9,games,1,tap carefully bounce way clown clown craze swe...,1
3,866786833,iThoughts (mindmap),11.99,17,productivity,1,tool touch mac version also visually sure righ...,0
4,908456072,Fireboy & Watergirl 2 - The Forest Temple,0.99,4,games,1,fireboy unique game best puzzle game never mix...,1
...,...,...,...,...,...,...,...,...
6235,1100147750,Trailer Truck Parking with Real City Traffic C...,0.00,4,games,1,master every kind parking challenge huge truck...,1
6236,460712294,Kids Doodle - Movie Kids Color & Draw,0.00,4,games,1,doodle pretty painting application designed sp...,1
6237,1101848745,Runaway Toad,2.99,4,games,1,easy green tap hold swipe help toad hop safety...,1
6238,1123579511,Magic Nightfall,0.00,4,games,1,come help princess get back kingdom rescue cut...,1


In [3]:
x_games_df = games_df.iloc[:, [2,3,5,6]]
y_games_df = games_df.iloc[:, [7]]

In [4]:
def tf_idf(text, max_feat=None):
    tfidf_vect = TfidfVectorizer(max_features=max_feat)
    tfidf_vect.fit(text)
    return tfidf_vect

In [5]:
base_vectorizer = tf_idf(games_df.app_desc, max_feat=24)

In [6]:
def join_feature(feature, tfidf):
    features = []
    transform = np.array(feature.iloc[:, [0,1,2]])
    for i,_ in enumerate(transform):
        concat = np.concatenate(
            [transform[i].reshape(-1), 
             tfidf[i].reshape(-1)]
        )
        features.append(concat)
    return np.array(features)

In [7]:
def build_ml(train, test, train_y, test_y, vectorizer):
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.fit_transform(test.app_desc).toarray()
    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)
    mnb = MultinomialNB()
    mnb.fit(train_x, train_y.ravel())
    predictions_mnb = mnb.predict(test_x)
    score_mlt = accuracy_score(predictions_mnb, test_y)*100
    bernoulli = BernoulliNB()
    bernoulli.fit(train_x, train_y.ravel())
    predictions_bernoulli = bernoulli.predict(test_x)
    score_bernoulli = accuracy_score(predictions_bernoulli, test_y)*100
    print("acc score -> bernoulli : {} multinomial : {}".format(score_bernoulli, score_mlt))
    return score_mlt, score_bernoulli

In [8]:
def get_result(fold_list, x_df, y_df):
    test_result = {'multinomial':{},'bernoulli':{}}
    for k_fold in fold_list:
        kf = KFold(n_splits=k_fold)
        print("test on fold: ", k_fold)
        all_score_mlt = []
        all_score_bernoulli = []
        for train_index, test_index in kf.split(x_df):
            score_mlt, score_bernoulli = build_ml(
                train = x_df.iloc[train_index,:], 
                test = x_df.iloc[test_index,:], 
                train_y = np.array(y_df.iloc[train_index,:]),
                test_y = np.array(y_df.iloc[test_index,:]),
                vectorizer = base_vectorizer
            )
            all_score_mlt.append(score_mlt)
            all_score_bernoulli.append(score_bernoulli)
        test_result['multinomial'][str(k_fold)] = np.average(all_score_mlt)
        test_result['bernoulli'][str(k_fold)] = np.average(all_score_bernoulli)
    return test_result

In [9]:
games_result = get_result(
    fold_list = [2,6,10],
    x_df = x_games_df,
    y_df = y_games_df
)
games_result

test on fold:  2
acc score -> bernoulli : 57.98076923076923 multinomial : 57.85256410256411
acc score -> bernoulli : 61.37820512820513 multinomial : 56.506410256410255
test on fold:  6
acc score -> bernoulli : 60.96153846153847 multinomial : 60.67307692307692
acc score -> bernoulli : 80.1923076923077 multinomial : 66.82692307692307
acc score -> bernoulli : 75.28846153846153 multinomial : 64.03846153846153
acc score -> bernoulli : 50.19230769230769 multinomial : 51.24999999999999
acc score -> bernoulli : 81.53846153846153 multinomial : 73.17307692307692
acc score -> bernoulli : 54.03846153846153 multinomial : 51.34615384615384
test on fold:  10
acc score -> bernoulli : 59.61538461538461 multinomial : 58.493589743589745
acc score -> bernoulli : 62.019230769230774 multinomial : 58.81410256410257
acc score -> bernoulli : 79.32692307692307 multinomial : 68.42948717948718
acc score -> bernoulli : 46.31410256410257 multinomial : 50.641025641025635
acc score -> bernoulli : 62.980769230769226 m

{'multinomial': {'2': 57.17948717948718,
  '6': 61.21794871794871,
  '10': 60.51282051282051},
 'bernoulli': {'2': 59.67948717948718,
  '6': 67.03525641025641,
  '10': 64.34294871794872}}

In [10]:
all_df = pd.read_csv('dataset_all_category.csv')
all_df = shuffle(all_df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
all_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc
0,1142401623,Lip Sync Battle App,0.00,12,entertainment,1,join party step world lip sync battle find inn...
1,1001177846,Find–the–Line,2.99,4,games,1,transform simple works art swipe finger unique...
2,482475948,Math Evolve: A Fun Math Game For Kids,1.99,4,education,1,winner best educational game best ever holy gr...
3,386505750,Lotto Results - Mega Millions Powerball Lottery,0.00,12,news,1,quick easy access millions state lottery winni...
4,600093562,LGV Theory Test and Hazard Perception,9.99,4,education,1,official theory test preparation hazard percep...
...,...,...,...,...,...,...,...
6772,1006850128,Ice Queen Prom Salon: Princess Makeover Girls ...,1.99,4,games,1,fully unlocked version one year crowned snow q...
6773,1084805156,Card Wars Kingdom - Adventure Time,0.00,12,games,1,note game touch th gen card next level card cr...
6774,1085932108,CUBE 360°　～想像力×知能×反射神経～,0.00,9,games,1,cube cub
6775,1092447708,Mr.Luma's Super Flight,0.99,4,education,1,super flight exciting unique flight experience...


In [11]:
genre_list = list(set(all_df['prime_genre'].to_list()))
genre_list

['lifestyle',
 'utilities',
 'business',
 'health & fitness',
 'photo & video',
 'weather',
 'news',
 'entertainment',
 'productivity',
 'social networking',
 'music',
 'catalogs',
 'travel',
 'games',
 'book',
 'shopping',
 'finance',
 'reference',
 'sports',
 'navigation',
 'food & drink',
 'medical',
 'education']

In [12]:
le = preprocessing.LabelEncoder()
le.fit(genre_list)

LabelEncoder()

In [13]:
def encode_class(str_class):
    return le.transform([str_class])[0]

In [14]:
all_df['encode_class'] = all_df['prime_genre'].apply(encode_class)
all_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,encode_class
0,1142401623,Lip Sync Battle App,0.00,12,entertainment,1,join party step world lip sync battle find inn...,4
1,1001177846,Find–the–Line,2.99,4,games,1,transform simple works art swipe finger unique...,7
2,482475948,Math Evolve: A Fun Math Game For Kids,1.99,4,education,1,winner best educational game best ever holy gr...,3
3,386505750,Lotto Results - Mega Millions Powerball Lottery,0.00,12,news,1,quick easy access millions state lottery winni...,13
4,600093562,LGV Theory Test and Hazard Perception,9.99,4,education,1,official theory test preparation hazard percep...,3
...,...,...,...,...,...,...,...,...
6772,1006850128,Ice Queen Prom Salon: Princess Makeover Girls ...,1.99,4,games,1,fully unlocked version one year crowned snow q...,7
6773,1084805156,Card Wars Kingdom - Adventure Time,0.00,12,games,1,note game touch th gen card next level card cr...,7
6774,1085932108,CUBE 360°　～想像力×知能×反射神経～,0.00,9,games,1,cube cub,7
6775,1092447708,Mr.Luma's Super Flight,0.99,4,education,1,super flight exciting unique flight experience...,3


In [15]:
x_all_df = all_df.iloc[:, [2,3,5,6]]
y_all_df = all_df.iloc[:, [7]]

In [16]:
all_result = get_result(
    fold_list = [2,6,10],
    x_df = x_all_df,
    y_df = y_all_df
)
all_result

test on fold:  2
acc score -> bernoulli : 37.53319563293007 multinomial : 50.89997049277073
acc score -> bernoulli : 40.70247933884297 multinomial : 50.29515938606848
test on fold:  6
acc score -> bernoulli : 46.902654867256636 multinomial : 51.415929203539825
acc score -> bernoulli : 44.51327433628319 multinomial : 51.32743362831859
acc score -> bernoulli : 37.52212389380531 multinomial : 50.0
acc score -> bernoulli : 29.583702391496903 multinomial : 47.9185119574845
acc score -> bernoulli : 36.315323294951284 multinomial : 49.77856510186005
acc score -> bernoulli : 47.9185119574845 multinomial : 52.96722763507529
test on fold:  10
acc score -> bernoulli : 33.6283185840708 multinomial : 52.8023598820059
acc score -> bernoulli : 36.87315634218289 multinomial : 48.67256637168141
acc score -> bernoulli : 30.8259587020649 multinomial : 49.41002949852507
acc score -> bernoulli : 27.876106194690266 multinomial : 50.442477876106196
acc score -> bernoulli : 36.283185840707965 multinomial : 49

{'multinomial': {'2': 50.5975649394196,
  '6': 50.56794458771304,
  '10': 50.14045568031791},
 'bernoulli': {'2': 39.11783748588652,
  '6': 40.45926512354631,
  '10': 34.808847814625516}}