In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

In [None]:
# Load the games dataset, shuffle its rows with a fixed seed, renumber the
# index from zero, and drop the 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier to_csv call — confirm.
games_df = (
    shuffle(pd.read_csv('dataset_binary_games.csv'), random_state=42)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
games_df

In [None]:
# Split the games frame by column position: positions 2, 3, 5 and 6 form the
# feature frame, position 7 forms the single-column label frame.
# NOTE(review): the positions are hard-coded — confirm they still match the
# CSV layout (build_ml expects an 'app_desc' column among the features).
x_games_df, y_games_df = (
    games_df.iloc[:, [2, 3, 5, 6]],
    games_df.iloc[:, [7]],
)

In [None]:
def tf_idf(text, max_feat=None):
    """Create a ``TfidfVectorizer`` and fit it on ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents handed to ``TfidfVectorizer.fit``.
    max_feat : int or None, optional
        Forwarded to the vectorizer constructor as ``max_features``.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """
    # ``fit`` returns the estimator itself, so construction and fitting chain.
    return TfidfVectorizer(max_features=max_feat).fit(text)

In [None]:
# Notebook-wide TF-IDF vectorizer capped at 24 features, initially fitted on
# every games description.
base_vectorizer = tf_idf(text=games_df['app_desc'], max_feat=24)

In [None]:
def join_feature(feature, tfidf):
    """Concatenate the first three columns of ``feature`` with TF-IDF rows.

    Parameters
    ----------
    feature : pandas.DataFrame
        Only its first three columns (selected by position) are used.
    tfidf : array-like
        One entry per row of ``feature``; each entry is flattened and
        appended to that row's three leading values.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input row. The result stays 2-D even when
        ``feature`` has no rows.

    Raises
    ------
    ValueError
        If ``feature`` and ``tfidf`` do not have the same number of rows.
    """
    leading = np.asarray(feature.iloc[:, [0, 1, 2]])
    terms = np.asarray(tfidf)
    if terms.ndim != 2:
        # Flatten everything after the row axis so each row contributes one
        # flat vector; the width is computed explicitly because ``-1`` cannot
        # be inferred for zero-row input.
        terms = terms.reshape(terms.shape[0], int(np.prod(terms.shape[1:])))
    # One horizontal stack replaces the per-row Python loop.
    return np.hstack([leading, terms])

In [None]:
def build_ml(train, test, train_y, test_y, vectorizer):
    """Fit multinomial and Bernoulli Naive Bayes on one split and score both.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames. Each must expose an ``app_desc`` text column and is
        passed to ``join_feature``, which reads the first three columns.
    train_y, test_y : array-like
        Labels for the two frames; they are flattened to 1-D before use.
    vectorizer : TfidfVectorizer
        Re-fitted in place on the training descriptions on every call.

    Returns
    -------
    tuple of float
        ``(score_mlt, score_bernoulli)``: test accuracy in percent for the
        multinomial and the Bernoulli model, in that order.
    """
    # Learn vocabulary and IDF weights from the training fold only, then apply
    # that same mapping to the test fold. Re-fitting on the test fold would
    # produce columns that do not line up with what the models were trained on.
    train_x_tfidf = vectorizer.fit_transform(train.app_desc).toarray()
    test_x_tfidf = vectorizer.transform(test.app_desc).toarray()

    train_x = join_feature(train, train_x_tfidf)
    test_x = join_feature(test, test_x_tfidf)

    # Flatten once so both models and the metric see 1-D label vectors.
    train_labels = np.ravel(train_y)
    test_labels = np.ravel(test_y)

    mnb = MultinomialNB()
    mnb.fit(train_x, train_labels)
    # accuracy_score takes (y_true, y_pred).
    score_mlt = accuracy_score(test_labels, mnb.predict(test_x)) * 100

    bernoulli = BernoulliNB()
    bernoulli.fit(train_x, train_labels)
    score_bernoulli = accuracy_score(test_labels, bernoulli.predict(test_x)) * 100

    print("acc score -> bernoulli : {} multinomial : {}".format(score_bernoulli, score_mlt))
    return score_mlt, score_bernoulli

In [None]:
def get_result(fold_list, x_df, y_df, vectorizer=None):
    """Run K-fold evaluation of both Naive Bayes models for several K values.

    Parameters
    ----------
    fold_list : iterable of int
        Each value is used as ``n_splits`` for one ``KFold`` run.
    x_df : pandas.DataFrame
        Feature frame; rows are selected by position for each fold.
    y_df : pandas.DataFrame
        Label frame aligned with ``x_df`` by position.
    vectorizer : TfidfVectorizer or None, optional
        Vectorizer handed to ``build_ml``. When ``None`` (the default) the
        notebook-level ``base_vectorizer`` is used, as before.

    Returns
    -------
    dict
        ``{'multinomial': {str(k): mean_acc}, 'bernoulli': {str(k): mean_acc}}``
        where ``mean_acc`` is the mean accuracy (percent) over the folds.
    """
    if vectorizer is None:
        # Backward-compatible default: fall back to the notebook-level global.
        vectorizer = base_vectorizer

    test_result = {'multinomial': {}, 'bernoulli': {}}
    for k_fold in fold_list:
        # KFold is used without shuffling, so folds are contiguous row ranges.
        kf = KFold(n_splits=k_fold)
        print("test on fold: ", k_fold)
        all_score_mlt = []
        all_score_bernoulli = []
        for train_index, test_index in kf.split(x_df):
            score_mlt, score_bernoulli = build_ml(
                train=x_df.iloc[train_index, :],
                test=x_df.iloc[test_index, :],
                train_y=np.array(y_df.iloc[train_index, :]),
                test_y=np.array(y_df.iloc[test_index, :]),
                vectorizer=vectorizer,
            )
            all_score_mlt.append(score_mlt)
            all_score_bernoulli.append(score_bernoulli)
        test_result['multinomial'][str(k_fold)] = np.mean(all_score_mlt)
        test_result['bernoulli'][str(k_fold)] = np.mean(all_score_bernoulli)
    return test_result

In [None]:
# Cross-validate both Naive Bayes models on the games data with 2, 6 and 10 folds.
games_result = get_result([2, 6, 10], x_games_df, y_games_df)
games_result

In [None]:
# Load the all-category dataset, shuffle its rows with a fixed seed, renumber
# the index from zero, and drop the 'Unnamed: 0' column.
# NOTE(review): 'Unnamed: 0' is presumably an index column written by an
# earlier to_csv call — confirm.
all_df = (
    shuffle(pd.read_csv('dataset_all_category.csv'), random_state=42)
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)
all_df

In [None]:
# Distinct genre labels in order of first appearance. Unlike list(set(...)),
# this order is the same on every kernel restart, so the displayed list is
# reproducible.
genre_list = all_df['prime_genre'].unique().tolist()
genre_list

In [None]:
# Fit a label encoder on the genre names. ``fit`` returns the encoder itself,
# so the fitted object is bound directly and then displayed.
le = preprocessing.LabelEncoder().fit(genre_list)
le

In [None]:
def encode_class(str_class):
    """Return the code that the notebook-level encoder ``le`` assigns to a label.

    Parameters
    ----------
    str_class : str
        A single class label.

    Returns
    -------
    The first (and only) element of ``le.transform([str_class])``.
    """
    # ``transform`` works on sequences, so wrap the label and unwrap the result.
    codes = le.transform([str_class])
    return codes[0]

In [None]:
# Encode the whole genre column in one vectorized call instead of invoking the
# encoder once per row through ``.apply``; the resulting codes are the same.
all_df['encode_class'] = le.transform(all_df['prime_genre'])
all_df

In [None]:
# Split the all-category frame by column position: positions 2, 3, 5 and 6
# form the feature frame, position 7 forms the single-column label frame.
# NOTE(review): position 7 is presumably the 'encode_class' column added
# above — confirm against the CSV layout.
x_all_df, y_all_df = (
    all_df.iloc[:, [2, 3, 5, 6]],
    all_df.iloc[:, [7]],
)

In [None]:
# Cross-validate both Naive Bayes models on the all-category data with 2, 6
# and 10 folds.
all_result = get_result([2, 6, 10], x_all_df, y_all_df)
all_result