In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [31]:
df = pd.read_csv('dataset_binary_games.csv')
df = shuffle(df).reset_index(drop=True).drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,id,track_name,price,cont_rating,prime_genre,vpp_lic,app_desc,is_games
0,1077056423,Danse Macabre: Thin Ice - A Mystery Hidden Obj...,6.99,9,games,1,pay play forever fisher star popular figure sk...,1
1,1028362533,Slots - Huuuge Casino: Slot Machines,0.00,12,games,1,join world biggest casino community casino pla...,1
2,412395632,"乐视视频HD-白鹿原,欢乐颂,奔跑吧全网热播",0.00,17,entertainment,1,f f live f,0
3,480375355,The Bard's Tale,2.99,12,games,1,sale price low even crazy crazy get come canad...,1
4,1054501757,Lifeline: Silent Night,1.99,9,games,1,hearts countless original lifeline took store ...,1
...,...,...,...,...,...,...,...,...
6235,1060082140,Flick Golf Extreme,0.00,4,games,1,flick golf take extreme ready master world tee...,1
6236,550902799,Camera+ for iPad,4.99,4,photo & video,1,clarity filter photography secret sauce crispn...,0
6237,996246479,SHOWTIME,0.00,17,entertainment,1,limited time offer try days free start streami...,0
6238,1121740066,PetMOJI – Character Creator & Emoji Keyboard,0.00,4,entertainment,1,universal illumination entertainment behind se...,0


In [65]:
x_df = df.iloc[:, [2,3,5,6]]
y_df = df.iloc[:, [7]]

In [68]:
def tf_idf(text, max_feat=None):
    tfidf_vect = TfidfVectorizer(max_features=max_feat)
    tfidf_vect.fit(text)
    return tfidf_vect

In [69]:
base_vectorizer = tf_idf(df.app_desc, max_feat=600)

In [163]:
kf = KFold(n_splits=10)

In [164]:
def join_feature(feature, tfidf):
    features = []
    transform = np.array(feature.iloc[:, [0,1,2]])
    for i,_ in enumerate(transform):
        concat = np.concatenate(
            [transform[i].reshape(-1), 
             tfidf[i].reshape(-1)]
        )
        features.append(concat)
    return np.array(features)

In [173]:
def build_mn_naive_bayes(train, test, train_y, test_y, vectorizer):
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.fit_transform(test.app_desc).toarray()
    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)
    mnb = MultinomialNB()
    mnb.fit(train_x, train_y.ravel())
    predictions_mnb = mnb.predict(test_x)
    score = accuracy_score(predictions_mnb, test_y)*100
    print("mnb acc score -> ",score)
    return score

In [177]:
score_mn = []
for train_index, test_index in kf.split(df):
    score_mn_naive_bayes = build_mn_naive_bayes(
        train = x_df.iloc[train_index,:], 
        test = x_df.iloc[test_index,:], 
        train_y = np.array(y_df.iloc[train_index,:]),
        test_y = np.array(y_df.iloc[test_index,:]),
        vectorizer = base_vectorizer
    )
    score_mn.append(float(score_mn_naive_bayes))

mnb acc score ->  60.73717948717948
mnb acc score ->  58.81410256410257
mnb acc score ->  59.455128205128204
mnb acc score ->  60.09615384615385
mnb acc score ->  60.256410256410255
mnb acc score ->  62.980769230769226
mnb acc score ->  57.692307692307686
mnb acc score ->  60.89743589743589
mnb acc score ->  55.608974358974365
mnb acc score ->  67.78846153846155


In [178]:
avg_mn_naive_bayes = np.average(score_mn)
avg_mn_naive_bayes

60.4326923076923