In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [2]:
games_df = pd.read_csv('dataset_binary_games.csv')
games_df = shuffle(games_df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
games_df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,is_games
0,1103702902,Red 7 - play Digital Red7 Card Game with Friends,2.99,4,games,1,red official digital mobile version cult card ...,1
1,337056601,n-tv Nachrichten,0.00,12,news,1,die die das den touch das apple die apple watc...,0
2,1165391697,Crazy Clown Chase,0.00,9,games,1,tap carefully bounce way clown clown craze swe...,1
3,866786833,iThoughts (mindmap),11.99,17,productivity,1,tool touch mac version also visually sure righ...,0
4,908456072,Fireboy & Watergirl 2 - The Forest Temple,0.99,4,games,1,fireboy unique game best puzzle game never mix...,1
...,...,...,...,...,...,...,...,...
6235,1100147750,Trailer Truck Parking with Real City Traffic C...,0.00,4,games,1,master every kind parking challenge huge truck...,1
6236,460712294,Kids Doodle - Movie Kids Color & Draw,0.00,4,games,1,doodle pretty painting application designed sp...,1
6237,1101848745,Runaway Toad,2.99,4,games,1,easy green tap hold swipe help toad hop safety...,1
6238,1123579511,Magic Nightfall,0.00,4,games,1,come help princess get back kingdom rescue cut...,1


In [3]:
x_games_df = games_df.iloc[:, [2,3,5,6]]
y_games_df = games_df.iloc[:, [7]]

In [4]:
def tf_idf(text, max_feat=None):
    tfidf_vect = TfidfVectorizer(max_features=max_feat)
    tfidf_vect.fit(text)
    return tfidf_vect

In [5]:
base_vectorizer = tf_idf(games_df.app_desc, max_feat=600)

In [6]:
def join_feature(feature, tfidf):
    features = []
    transform = np.array(feature.iloc[:, [0,1,2]])
    for i,_ in enumerate(transform):
        concat = np.concatenate(
            [transform[i].reshape(-1), 
             tfidf[i].reshape(-1)]
        )
        features.append(concat)
    return np.array(features)

In [7]:
def build_mn(train, test, train_y, test_y, vectorizer):
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.fit_transform(test.app_desc).toarray()
    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)
    mnb = MultinomialNB()
    mnb.fit(train_x, train_y.ravel())
    predictions_mnb = mnb.predict(test_x)
    score_mlt = accuracy_score(predictions_mnb, test_y)*100
    bernoulli = BernoulliNB()
    bernoulli.fit(train_x, train_y.ravel())
    predictions_bernoulli = bernoulli.predict(test_x)
    score_bernoulli = accuracy_score(predictions_bernoulli, test_y)*100
    print("acc score -> bernoulli : {} multinomial : {}".format(score_mlt, score_bernoulli))
    return score_mlt, score_bernoulli

In [12]:
def get_result(fold_list, x_df, y_df):
    test_result = {'multinomial':{},'bernoulli':{}}
    for k_fold in fold_list:
        kf = KFold(n_splits=k_fold)
        print("test on fold: ", k_fold)
        all_score_mlt = []
        all_score_bernoulli = []
        for train_index, test_index in kf.split(x_df):
            score_mlt, score_bernoulli = build_mn(
                train = x_df.iloc[train_index,:], 
                test = x_df.iloc[test_index,:], 
                train_y = np.array(y_df.iloc[train_index,:]),
                test_y = np.array(y_df.iloc[test_index,:]),
                vectorizer = base_vectorizer
            )
            all_score_mlt.append(float(score_mlt))
            all_score_bernoulli.append(float(score_bernoulli))
        test_result['multinomial'][str(k_fold)] = np.average(all_score_mlt)
        test_result['bernoulli'][str(k_fold)] = np.average(all_score_bernoulli)
    return test_result

In [13]:
games_result = get_result(
    fold_list = [2,6,10],
    x_df = x_games_df,
    y_df = y_games_df
)
games_result

test on fold:  2
acc score -> bernoulli : 57.948717948717956 multinomial : 58.0448717948718
acc score -> bernoulli : 59.006410256410255 multinomial : 57.30769230769231
test on fold:  6
acc score -> bernoulli : 68.65384615384616 multinomial : 72.88461538461539
acc score -> bernoulli : 64.90384615384616 multinomial : 69.32692307692308
acc score -> bernoulli : 59.80769230769231 multinomial : 57.88461538461539
acc score -> bernoulli : 62.11538461538461 multinomial : 66.34615384615384
acc score -> bernoulli : 56.92307692307692 multinomial : 53.46153846153846
acc score -> bernoulli : 52.69230769230769 multinomial : 51.63461538461539
test on fold:  10
acc score -> bernoulli : 57.05128205128205 multinomial : 58.97435897435898
acc score -> bernoulli : 55.769230769230774 multinomial : 61.858974358974365
acc score -> bernoulli : 58.493589743589745 multinomial : 56.25
acc score -> bernoulli : 54.807692307692314 multinomial : 53.84615384615385
acc score -> bernoulli : 67.3076923076923 multinomial :

{'multinomial': {'2': 58.4775641025641,
  '6': 60.84935897435898,
  '10': 61.42628205128206},
 'bernoulli': {'2': 57.67628205128206,
  '6': 61.92307692307693,
  '10': 61.84294871794873}}