In [234]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [235]:
df = pd.read_csv('dataset_binary_games.csv')
df = shuffle(df, random_state = 42).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,is_games
0,1103702902,Red 7 - play Digital Red7 Card Game with Friends,2.99,4,games,1,red official digital mobile version cult card ...,1
1,337056601,n-tv Nachrichten,0.00,12,news,1,die die das den touch das apple die apple watc...,0
2,1165391697,Crazy Clown Chase,0.00,9,games,1,tap carefully bounce way clown clown craze swe...,1
3,866786833,iThoughts (mindmap),11.99,17,productivity,1,tool touch mac version also visually sure righ...,0
4,908456072,Fireboy & Watergirl 2 - The Forest Temple,0.99,4,games,1,fireboy unique game best puzzle game never mix...,1
...,...,...,...,...,...,...,...,...
6235,1100147750,Trailer Truck Parking with Real City Traffic C...,0.00,4,games,1,master every kind parking challenge huge truck...,1
6236,460712294,Kids Doodle - Movie Kids Color & Draw,0.00,4,games,1,doodle pretty painting application designed sp...,1
6237,1101848745,Runaway Toad,2.99,4,games,1,easy green tap hold swipe help toad hop safety...,1
6238,1123579511,Magic Nightfall,0.00,4,games,1,come help princess get back kingdom rescue cut...,1


In [236]:
x_df = df.iloc[:, [2,3,5,6]]
y_df = df.iloc[:, [7]]

In [237]:
def tf_idf(text, max_feat=None):
    tfidf_vect = TfidfVectorizer(max_features=max_feat)
    tfidf_vect.fit(text)
    return tfidf_vect

In [238]:
base_vectorizer = tf_idf(df.app_desc, max_feat=600)

In [239]:
kf = KFold(n_splits=10)

In [240]:
def join_feature(feature, tfidf):
    features = []
    transform = np.array(feature.iloc[:, [0,1,2]])
    for i,_ in enumerate(transform):
        concat = np.concatenate(
            [transform[i].reshape(-1), 
             tfidf[i].reshape(-1)]
        )
        features.append(concat)
    return np.array(features)

In [241]:
def build_mn(train, test, train_y, test_y, vectorizer):
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.fit_transform(test.app_desc).toarray()
    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)
    mnb = MultinomialNB()
    mnb.fit(train_x, train_y.ravel())
    predictions_mnb = mnb.predict(test_x)
    score_mlt = accuracy_score(predictions_mnb, test_y)*100
    print("mnb acc score -> ",score_mlt)
    bernoulli = BernoulliNB()
    bernoulli.fit(train_x, train_y.ravel())
    predictions_bernoulli = bernoulli.predict(test_x)
    score_bernoulli = accuracy_score(predictions_bernoulli, test_y)*100
    print("bernoulli acc score -> ",score_bernoulli)
    return score_mlt, score_bernoulli

In [242]:
result_multinomial = []
result_bernoulli = []
for train_index, test_index in kf.split(df):
    score_mlt, score_bernoulli = build_mn(
        train = x_df.iloc[train_index,:], 
        test = x_df.iloc[test_index,:], 
        train_y = np.array(y_df.iloc[train_index,:]),
        test_y = np.array(y_df.iloc[test_index,:]),
        vectorizer = base_vectorizer
    )
    result_multinomial.append(float(score_mlt))
    result_bernoulli.append(float(score_bernoulli))

mnb acc score ->  57.05128205128205
bernoulli acc score ->  58.97435897435898
mnb acc score ->  55.769230769230774
bernoulli acc score ->  61.858974358974365
mnb acc score ->  58.493589743589745
bernoulli acc score ->  56.25
mnb acc score ->  54.807692307692314
bernoulli acc score ->  53.84615384615385
mnb acc score ->  67.3076923076923
bernoulli acc score ->  66.82692307692307
mnb acc score ->  68.42948717948718
bernoulli acc score ->  68.26923076923077
mnb acc score ->  59.294871794871796
bernoulli acc score ->  58.01282051282052
mnb acc score ->  62.33974358974359
bernoulli acc score ->  61.69871794871795
mnb acc score ->  68.10897435897436
bernoulli acc score ->  69.71153846153845
mnb acc score ->  62.66025641025641
bernoulli acc score ->  62.980769230769226


In [243]:
avg_score_mlt = np.average(result_multinomial)
avg_score_mlt

61.42628205128206

In [244]:
avg_score_bernoulli = np.average(result_bernoulli)
avg_score_bernoulli

61.84294871794873