In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [11]:
games_df = pd.read_csv('dataset_binary_games.csv')
games_df = shuffle(games_df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
games_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,is_games
0,1103702902,Red 7 - play Digital Red7 Card Game with Friends,2.99,4,games,1,red official digital mobile version cult card ...,1
1,337056601,n-tv Nachrichten,0.00,12,news,1,die die das den touch das apple die apple watc...,0
2,1165391697,Crazy Clown Chase,0.00,9,games,1,tap carefully bounce way clown clown craze swe...,1
3,866786833,iThoughts (mindmap),11.99,17,productivity,1,tool touch mac version also visually sure righ...,0
4,908456072,Fireboy & Watergirl 2 - The Forest Temple,0.99,4,games,1,fireboy unique game best puzzle game never mix...,1
...,...,...,...,...,...,...,...,...
6235,1100147750,Trailer Truck Parking with Real City Traffic C...,0.00,4,games,1,master every kind parking challenge huge truck...,1
6236,460712294,Kids Doodle - Movie Kids Color & Draw,0.00,4,games,1,doodle pretty painting application designed sp...,1
6237,1101848745,Runaway Toad,2.99,4,games,1,easy green tap hold swipe help toad hop safety...,1
6238,1123579511,Magic Nightfall,0.00,4,games,1,come help princess get back kingdom rescue cut...,1


In [12]:
x_games_df = games_df.iloc[:, [2,3,5,6]]
y_games_df = games_df.iloc[:, [7]]

In [13]:
def tf_idf(text, max_feat=None):
    tfidf_vect = TfidfVectorizer(max_features=max_feat)
    tfidf_vect.fit(text)
    return tfidf_vect

In [14]:
base_vectorizer = tf_idf(games_df.app_desc, max_feat=24)

In [15]:
def join_feature(feature, tfidf):
    features = []
    transform = np.array(feature.iloc[:, [0,1,2]])
    for i,_ in enumerate(transform):
        concat = np.concatenate(
            [transform[i].reshape(-1), 
             tfidf[i].reshape(-1)]
        )
        features.append(concat)
    return np.array(features)

In [16]:
def build_mn(train, test, train_y, test_y, vectorizer):
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.fit_transform(test.app_desc).toarray()
    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)
    mnb = MultinomialNB()
    mnb.fit(train_x, train_y.ravel())
    predictions_mnb = mnb.predict(test_x)
    score_mlt = accuracy_score(predictions_mnb, test_y)*100
    bernoulli = BernoulliNB()
    bernoulli.fit(train_x, train_y.ravel())
    predictions_bernoulli = bernoulli.predict(test_x)
    score_bernoulli = accuracy_score(predictions_bernoulli, test_y)*100
    print("acc score -> bernoulli : {} multinomial : {}".format(score_mlt, score_bernoulli))
    return score_mlt, score_bernoulli

In [17]:
def get_result(fold_list, x_df, y_df):
    test_result = {'multinomial':{},'bernoulli':{}}
    for k_fold in fold_list:
        kf = KFold(n_splits=k_fold)
        print("test on fold: ", k_fold)
        all_score_mlt = []
        all_score_bernoulli = []
        for train_index, test_index in kf.split(x_df):
            score_mlt, score_bernoulli = build_mn(
                train = x_df.iloc[train_index,:], 
                test = x_df.iloc[test_index,:], 
                train_y = np.array(y_df.iloc[train_index,:]),
                test_y = np.array(y_df.iloc[test_index,:]),
                vectorizer = base_vectorizer
            )
            all_score_mlt.append(score_mlt)
            all_score_bernoulli.append(score_bernoulli)
        test_result['multinomial'][str(k_fold)] = np.average(all_score_mlt)
        test_result['bernoulli'][str(k_fold)] = np.average(all_score_bernoulli)
    return test_result

In [42]:
games_result = get_result(
    fold_list = [2,10,20,40],
    x_df = x_games_df,
    y_df = y_games_df
)
games_result

test on fold:  2
acc score -> bernoulli : 57.85256410256411 multinomial : 57.98076923076923
acc score -> bernoulli : 56.506410256410255 multinomial : 61.37820512820513
test on fold:  10
acc score -> bernoulli : 58.493589743589745 multinomial : 59.61538461538461
acc score -> bernoulli : 58.81410256410257 multinomial : 62.019230769230774
acc score -> bernoulli : 68.42948717948718 multinomial : 79.32692307692307
acc score -> bernoulli : 50.641025641025635 multinomial : 46.31410256410257
acc score -> bernoulli : 61.37820512820513 multinomial : 62.980769230769226
acc score -> bernoulli : 70.51282051282051 multinomial : 82.53205128205127
acc score -> bernoulli : 49.519230769230774 multinomial : 53.52564102564102
acc score -> bernoulli : 74.03846153846155 multinomial : 79.96794871794873
acc score -> bernoulli : 54.166666666666664 multinomial : 56.891025641025635
acc score -> bernoulli : 59.13461538461539 multinomial : 60.256410256410255
test on fold:  20
acc score -> bernoulli : 61.8589743589

{'multinomial': {'2': 57.17948717948718,
  '10': 60.51282051282051,
  '20': 57.19551282051282,
  '40': 57.25961538461538},
 'bernoulli': {'2': 59.67948717948718,
  '10': 64.34294871794872,
  '20': 59.07051282051283,
  '40': 60.368589743589745}}

In [32]:
all_df = pd.read_csv('dataset_all_category.csv')
all_df = shuffle(all_df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
all_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc
0,1142401623,Lip Sync Battle App,0.00,12,entertainment,1,join party step world lip sync battle find inn...
1,1001177846,Find–the–Line,2.99,4,games,1,transform simple works art swipe finger unique...
2,482475948,Math Evolve: A Fun Math Game For Kids,1.99,4,education,1,winner best educational game best ever holy gr...
3,386505750,Lotto Results - Mega Millions Powerball Lottery,0.00,12,news,1,quick easy access millions state lottery winni...
4,600093562,LGV Theory Test and Hazard Perception,9.99,4,education,1,official theory test preparation hazard percep...
...,...,...,...,...,...,...,...
6772,1006850128,Ice Queen Prom Salon: Princess Makeover Girls ...,1.99,4,games,1,fully unlocked version one year crowned snow q...
6773,1084805156,Card Wars Kingdom - Adventure Time,0.00,12,games,1,note game touch th gen card next level card cr...
6774,1085932108,CUBE 360°　～想像力×知能×反射神経～,0.00,9,games,1,cube cub
6775,1092447708,Mr.Luma's Super Flight,0.99,4,education,1,super flight exciting unique flight experience...


In [33]:
genre_list = list(set(all_df['prime_genre'].to_list()))
genre_list

['photo & video',
 'education',
 'news',
 'productivity',
 'entertainment',
 'food & drink',
 'reference',
 'finance',
 'shopping',
 'health & fitness',
 'travel',
 'games',
 'book',
 'sports',
 'medical',
 'navigation',
 'weather',
 'catalogs',
 'business',
 'music',
 'social networking',
 'lifestyle',
 'utilities']

In [34]:
le = preprocessing.LabelEncoder()
le.fit(genre_list)

LabelEncoder()

In [35]:
def encode_class(str_class):
    return le.transform([str_class])[0]

In [36]:
all_df['encode_class'] = all_df['prime_genre'].apply(encode_class)
all_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,encode_class
0,1142401623,Lip Sync Battle App,0.00,12,entertainment,1,join party step world lip sync battle find inn...,4
1,1001177846,Find–the–Line,2.99,4,games,1,transform simple works art swipe finger unique...,7
2,482475948,Math Evolve: A Fun Math Game For Kids,1.99,4,education,1,winner best educational game best ever holy gr...,3
3,386505750,Lotto Results - Mega Millions Powerball Lottery,0.00,12,news,1,quick easy access millions state lottery winni...,13
4,600093562,LGV Theory Test and Hazard Perception,9.99,4,education,1,official theory test preparation hazard percep...,3
...,...,...,...,...,...,...,...,...
6772,1006850128,Ice Queen Prom Salon: Princess Makeover Girls ...,1.99,4,games,1,fully unlocked version one year crowned snow q...,7
6773,1084805156,Card Wars Kingdom - Adventure Time,0.00,12,games,1,note game touch th gen card next level card cr...,7
6774,1085932108,CUBE 360°　～想像力×知能×反射神経～,0.00,9,games,1,cube cub,7
6775,1092447708,Mr.Luma's Super Flight,0.99,4,education,1,super flight exciting unique flight experience...,3


In [38]:
x_all_df = all_df.iloc[:, [2,3,5,6]]
y_all_df = all_df.iloc[:, [7]]

In [41]:
all_result = get_result(
    fold_list = [2,10,20,40],
    x_df = x_all_df,
    y_df = y_all_df
)
all_result

test on fold:  2
acc score -> bernoulli : 50.89997049277073 multinomial : 37.53319563293007
acc score -> bernoulli : 50.29515938606848 multinomial : 40.70247933884297
test on fold:  10
acc score -> bernoulli : 52.8023598820059 multinomial : 33.6283185840708
acc score -> bernoulli : 48.67256637168141 multinomial : 36.87315634218289
acc score -> bernoulli : 49.41002949852507 multinomial : 30.8259587020649
acc score -> bernoulli : 50.442477876106196 multinomial : 27.876106194690266
acc score -> bernoulli : 49.852507374631266 multinomial : 36.283185840707965
acc score -> bernoulli : 46.16519174041298 multinomial : 28.31858407079646
acc score -> bernoulli : 51.76991150442478 multinomial : 50.29498525073747
acc score -> bernoulli : 47.71048744460857 multinomial : 30.132939438700145
acc score -> bernoulli : 53.4711964549483 multinomial : 41.06351550960118
acc score -> bernoulli : 51.10782865583457 multinomial : 32.791728212703106
test on fold:  20
acc score -> bernoulli : 56.63716814159292 mu

{'multinomial': {'2': 50.5975649394196,
  '10': 50.14045568031791,
  '20': 50.229355396135524,
  '40': 50.49477897667943},
 'bernoulli': {'2': 39.11783748588652,
  '10': 34.808847814625516,
  '20': 39.029428706079486,
  '40': 39.524103724329976}}