KABAM Assignment (Data Scientist Position)

In [None]:
import numpy as np
import pandas as pd
from sqlite3 import connect
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import models
from tensorflow.keras.layers import LSTM, Dense, Dropout, GRU, Bidirectional, Embedding, RepeatVector, TimeDistributed, Flatten
import time
import keras
import xgboost as xgb
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV, LassoLarsCV, Lasso, LassoCV, RidgeCV, Ridge, LarsCV, ElasticNet
from keras.callbacks import Callback
from keras.models import load_model
import lazypredict
from lazypredict.Supervised import LazyClassifier
from lazypredict.Supervised import LazyRegressor

In [None]:
# Parameters

# Two approaches: classification or regression. Exactly one must be True.
classification_mode = True
regression_mode = False
if classification_mode == regression_mode:
    raise ValueError("Set exactly one of classification_mode / regression_mode to True.")

# if True then downsampling will be done for train data
down_sample = False
# oversampling can be False, 'random_oversample' or 'smote_oversample'
up_sample = 'smote_oversample'
# number of classes for classification. 2, 3 or 5 classes for now.
n_class = 3

# The options below apply to both modes. They are assigned unconditionally so
# that they always exist, whichever mode flag is set.
# int as n_components for PCA or False for no PCA
PCA_apply = False
# drop categorical columns
drop_categorical_cols = False
# if True then label with value of 0 will be removed
# (the misspelled name is kept because a later cell reads `drop_zero_calss`)
drop_zero_calss = True

# Reference point for the total run time printed by the last cell
start = time.time()

In [None]:
def df_index(df, index, droped_columns):
    """
    Return a new dataframe indexed by the given column, without the dropped columns.

    The input dataframe is left untouched; callers must use the returned frame.

    Parameters:
    -----------
    df: pandas dataframe
        The dataframe to be indexed.
    index: string
        The name of the column whose values become the index.
    droped_columns: list of strings
        The list of columns to be dropped.

    Returns: pandas dataframe
        A new dataframe whose index holds the values (and the name) of `index`.
    """
    # drop=False keeps the index column available, so it can still be listed
    # in `droped_columns` exactly as callers did before.
    return df.set_index(index, drop=False).drop(columns=droped_columns)

Load Data

In [None]:
# Read the ka_actions table and key it by user id.
# The datasets are expected in the kabam_ds_interview folder; change the path otherwise.
ka_actions = df_index(
    pd.read_parquet('./kabam_ds_interview/ka_actions.parquet'),
    index='uid_s',
    droped_columns=['uid_s'],
)
print(ka_actions.shape)
print("Number of Nans for each column: \n", ka_actions.isna().sum())
ka_actions.head()

In [None]:
# Read the ka_users table and key it by user id.
ka_users = df_index(
    pd.read_csv('./kabam_ds_interview/ka_users.csv'),
    index='uid_s',
    droped_columns=['uid_s'],
)
print(ka_users.shape)
print("Number of Nans for each column: \n", ka_users.isna().sum())
ka_users.head()

In [None]:
# Check the names of the tables in the ka_devices database.
ka_devices_db = connect('./kabam_ds_interview/ka_devices.db')
# `sqlite_master` is queried instead of the newer `sqlite_schema` alias so the
# lookup also works with older SQLite libraries.
ka_devices_name = pd.read_sql_query(
    "SELECT name as table_name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%';",
    ka_devices_db,
)
ka_devices_name

In [None]:
# Load ka_devices as dataframe
ka_devices = pd.read_sql_query("SELECT * FROM devices", ka_devices_db)
# Nothing else reads from the database, so release the connection now.
# Re-running this cell requires re-running the cell that opens `ka_devices_db`.
ka_devices_db.close()
ka_devices = df_index(ka_devices, 'uid_s', ['uid_s', 'index'])
print(ka_devices.shape)
print("Number of Nans for each column: \n", ka_devices.isna().sum())
ka_devices.head()

In [None]:
# Summarise each source table: descriptive statistics, then its shape
for frame in (ka_devices, ka_actions, ka_users):
    print("Describe dataframe: \n", frame.describe())
    print(frame.shape)
    print("********" * 12)

In [None]:
# Join all dataframes to have one dataframe with all data.
# All three tables are indexed by uid_s, so a plain index-on-index left join
# attaches the device and action columns to each user row. (Passing another
# frame's index through `on=` would be read as positional keys for the rows of
# the left frame and misalign the match.)
df = ka_users.join(ka_devices).join(ka_actions)
print(df.columns)
# print((df[df['total_spend']>0].shape[0] / df[df['total_spend']==0].shape[0])*100)
df

In [None]:
# Count how often each value of game_stats_tutorial_complete occurs (1, 0 and NaN)
tutorial_flag = df.game_stats_tutorial_complete
unique_game_stats_tutorial_complete = tutorial_flag.unique()
print(unique_game_stats_tutorial_complete)

print("Number of game_stats_tutorial_complete with value of 1: ", (tutorial_flag == 1.).sum())
print("Number of game_stats_tutorial_complete with value of 0: ", (tutorial_flag == 0.).sum())
print("Number of game_stats_tutorial_complete with value of Nan: ", tutorial_flag.isna().sum())

In [None]:
# Report the number of missing values in every game_stats_xp* column
experiences = [col for col in df.columns if col.startswith('game_stats_xp')]
for xp_col in experiences:
    print(f"Number of {xp_col} with value of Nan: ", df[xp_col].isna().sum())

Preprocessing

In [None]:
# Create and describe a dataframe holding only rows where tutorial complete is 1,
# since we are just interested in people who finished the tutorial.
# .copy() makes this an independent frame: later cells modify it, and working
# on a slice of `df` would trigger SettingWithCopyWarning.
df_completed_tutorial = df[df.game_stats_tutorial_complete == 1.].copy()
df_completed_tutorial.describe()

In [None]:
# Compare the total_spend summary statistics of all users (first) with those of
# the users who completed the tutorial (second)
print(df.total_spend.describe())
print(df_completed_tutorial.total_spend.describe())

In [None]:
# plt.figure(figsize=(20, 8))
# plt.title('Total spend')
# plt.ylabel('Total spend')
# plt.bar(np.arange(0,df_completed_tutorial.shape[0]), df_completed_tutorial['total_spend'].values)

In [None]:
# df_completed_tutorial.plot(x='total_spend', y=['game_stats_xp', 'game_stats_xp1',
#        'game_stats_xp2', 'game_stats_xp3',], kind="bar")

In [None]:
# Drop duplicate rows from the dataframe.
# The result is re-bound instead of using inplace=True, so a frame that was
# sliced out of `df` is never modified in place.
df_completed_tutorial = df_completed_tutorial.drop_duplicates()
df_completed_tutorial

In [None]:
# Drop rows containing NaN values; the NaN count per column is printed before and after.
print("Number of Nans for each column just with completed tutorial: \n", df_completed_tutorial.isna().sum())
# Re-bound instead of inplace=True, so a frame sliced out of `df` is never modified in place.
df_completed_tutorial = df_completed_tutorial.dropna()
print("Number of Nans for each column just with completed tutorial: \n", df_completed_tutorial.isna().sum())
# df_completed_tutorial.to_csv('./kabam_ds_interview/df.csv')

In [None]:
# Correlation matrix.
# String columns have not been encoded yet at this point, so restrict the frame
# to numeric/boolean columns explicitly; corr() on object columns raises in
# recent pandas versions.
corrMatrix = df_completed_tutorial.select_dtypes(include=['number', 'bool']).corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Highly correlated features like device_mem_i can be removed.
# For every column, count the columns whose correlation with it exceeds 0.9
# (the column itself is normally included, as its self-correlation is 1).
print((corrMatrix>0.9).sum())
# Optional removal of the redundant device-memory columns (currently disabled):
# df_completed_tutorial.drop(columns=['device_mem_i', 'device_gmem_i',
#                                     'device_mem_grouping_i', 'device_gmem_grouping_i'], inplace=True)
df_completed_tutorial.columns

In [None]:
# Collect the string-valued (object dtype) columns, then convert each of them
# to the pandas categorical dtype
categorical_columns = [
    col for col in df_completed_tutorial.columns
    if df_completed_tutorial[col].dtype.name == 'object'
]
for col in categorical_columns:
    df_completed_tutorial[col] = df_completed_tutorial[col].astype('category')

print("categorical_columns: \n", categorical_columns)

In [None]:
# Change categorical columns to numerical integer codes.
# The former `na_sentinel=-1` keyword no longer exists in pandas 2.x; -1 is
# still the code factorize assigns to missing values by default.
for col in categorical_columns:
    codes, _ = pd.factorize(df_completed_tutorial[col].values, sort=False)
    df_completed_tutorial[col] = codes
df_completed_tutorial

In [None]:
# game_stats_tutorial_complete is 1 for every remaining row, so it carries no information
df_completed_tutorial.drop(columns=['game_stats_tutorial_complete'], inplace=True)

In [None]:
# Create classes based on the total_spend column for classification mode.
# For regression, the continuous values of total_spend are used unchanged.
if classification_mode:
    # Right-closed bin edges per supported class count; bin k becomes label k.
    #   5 classes: <=0 | (0, 100] | (100, 200] | (200, 300] | >300
    #   3 classes: <=0 | (0, 250] | >250
    #   otherwise: <=0 | >0
    # The branches are mutually exclusive (elif), so the binary fallback can no
    # longer overwrite the 5-class labels.
    if n_class == 5:
        spend_bin_edges = [-np.inf, 0, 100, 200, 300, np.inf]
    elif n_class == 3:
        spend_bin_edges = [-np.inf, 0, 250, np.inf]
    else:
        spend_bin_edges = [-np.inf, 0, np.inf]

    # pd.cut assigns every value to exactly one bin, so fractional amounts such
    # as 0.5 or 100.5 also receive a class label. The column keeps its dtype.
    df_completed_tutorial['total_spend'] = pd.cut(
        df_completed_tutorial['total_spend'], bins=spend_bin_edges, labels=False
    ).astype(df_completed_tutorial['total_spend'].dtype)

    # Show how many rows ended up in each class
    print(df_completed_tutorial['total_spend'].value_counts())

In [None]:
def data_splitter(df_completed_tutorial, random_state, test_size=0.2):
    """
    Splits the data into shuffled training and test sets.

    Parameters:
    df_completed_tutorial (dataframe): pd.DataFrame
        dataframe with all the data; must contain the 'total_spend' target column
    random_state (int):
        seed passed to train_test_split so the shuffle is reproducible
    test_size (float):
        proportion of data to be used for testing

    Returns:
        X_train, X_test, y_train, y_test (pd.dataframe for x and pd.Series for y)
    """
    features = df_completed_tutorial.drop(columns='total_spend')
    target = df_completed_tutorial['total_spend']
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state, shuffle=True)

    print("X_train.shape: ", X_train.shape)
    print("X_test.shape: ", X_test.shape)
    print("y_train.shape: ", y_train.shape)
    print("y_test.shape: ", y_test.shape)

    # Sanity check that shuffling kept every training row attached to its own
    # features and label. It uses only this function's inputs, so it does not
    # depend on a global frame, a specific user id or a specific column.
    if features.index.is_unique:
        rows_intact = (X_train.equals(features.loc[X_train.index])
                       and y_train.equals(target.loc[y_train.index]))
        print("Confirm changes are the same after shuffling: ", rows_intact)
    else:
        # Label-based lookup is ambiguous with repeated index values
        print("Index is not unique; skipping the shuffle sanity check.")

    return X_train, X_test, y_train, y_test

# Split dataframe into train and test (seed 42, 20% of the rows held out for testing)
X_train, X_test, y_train, y_test = data_splitter(df_completed_tutorial, random_state=42, test_size=0.2)
X_train

In [None]:
# Feature selection: rank the features by random-forest importance, fitted on
# the last 10000 training rows
model = RandomForestRegressor(n_estimators=100, random_state=1)
model.fit(X_train.values[-10000:,:], y_train.values[-10000:])

# Show importance scores
print("Feature importance: ", model.feature_importances_)
names = X_train.columns
ticks = list(range(len(names)))

# Map every feature name to its importance, then order ascending by importance
order_features = {name: model.feature_importances_[pos] for name, pos in zip(names, ticks)}
sorted_features = dict(sorted(order_features.items(), key=lambda item: item[1]))
print("sorted_features: ", sorted_features)

# Keep the features whose importance is at least 0.01
selected_features = [name for name, score in sorted_features.items() if score >= 0.01]
print('Candidate features: ', selected_features)

# Restrict the dataset to the kept features plus the target column
selected_features.append('total_spend')
df_completed_tutorial = df_completed_tutorial[selected_features]
df_completed_tutorial.columns

In [None]:
# Split dataframe into train and test with selected features
# NOTE(review): this split uses seed 45 while the split used for feature
# selection used seed 42, so rows that informed the feature ranking can end up
# in this test set — confirm this is intended.
X_train, X_test, y_train, y_test = data_splitter(df_completed_tutorial, random_state=45, test_size=0.2)
X_train

In [None]:
# Min-max scale the features; the scaler is fitted on the training split only
# and then applied to both splits
scaler = MinMaxScaler().fit(X_train)
X_train[X_train.columns] = scaler.transform(X_train)
X_test[X_test.columns] = scaler.transform(X_test)

if regression_mode:
    # For regression the target is scaled as well. `scaler` is re-bound to the
    # target scaler here; regression_repot uses it to invert the scaling.
    scaler = MinMaxScaler().fit(y_train.values.reshape(-1,1))
    y_tr = scaler.transform(y_train.values.reshape(-1,1))
    y_te = scaler.transform(y_test.values.reshape(-1,1))
    y_train = pd.Series(data=y_tr.ravel(), index=y_train.index)
    y_test = pd.Series(data=y_te.ravel(), index=y_test.index)
X_train

In [None]:
# Histogram (20 bins) of the test-split target values
plt.figure(figsize=(20, 8))
#plt.plot(X_train['game_stats_tutorial_complete_time'][:100000])
plt.title('Total spend y_test')
# NOTE(review): the y-axis of a histogram shows the count per bin, although it
# is labelled 'Total spend' here
plt.ylabel('Total spend')
plt.hist(y_test, bins=20)

In [None]:
# Very imbalanced data. Mostly (around 98%) with 0 value (does not spend at all)
# Prints the number of training rows with a positive target as a percentage of
# the number of training rows with a zero target.
print((y_train[y_train>0].shape[0] / y_train[y_train==0].shape[0])*100)

In [None]:
# Oversampling, downsampling, PCA, drop categorical columns and drop zero class
if classification_mode:
    if down_sample:
        # Shrink every class except the smallest one; seeded for reproducibility
        undersample = RandomUnderSampler(sampling_strategy="not minority", random_state=42)
        # fit and apply the transform
        X_train, y_train = undersample.fit_resample(X_train, y_train)
        print("Downsampling is done.")

    if up_sample:
        if up_sample == "random_oversample":
            oversample = RandomOverSampler(sampling_strategy="not majority", random_state=42)
        elif up_sample == "smote_oversample":
            oversample = SMOTE(sampling_strategy="not majority", random_state=42)
        else:
            # Fail with a clear message instead of a NameError further down
            raise ValueError(
                f"Unknown up_sample option {up_sample!r}; "
                "use False, 'random_oversample' or 'smote_oversample'.")
        # fit and apply the transform
        X_train, y_train = oversample.fit_resample(X_train, y_train)
        print("Oversamling is done.")

if PCA_apply:
    # Fit the projection on the training split only, then apply it to both splits
    pca = PCA(n_components=PCA_apply)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    print("PCA done.")

if drop_categorical_cols:
    if isinstance(X_train, pd.DataFrame):
        # Feature selection may already have removed some categorical columns,
        # so only drop the ones that are still present.
        present_categoricals = [col for col in categorical_columns if col in X_train.columns]
        if present_categoricals:
            X_train = X_train.drop(columns=present_categoricals)
            X_test = X_test.drop(columns=present_categoricals)
            print("Categorical columns dropped.")
        else:
            print("No categorical columns to drop. Check feature selection removed features.")
    else:
        # PCA returns plain arrays whose columns are components, not named features
        print("Categorical columns cannot be dropped after PCA; skipping.")

# The zero class is only removed for regression and for binary classification;
# with more than two classes it is kept.
if drop_zero_calss and not (classification_mode and n_class > 2):
    # Positional boolean masks work for dataframes and for PCA output arrays alike
    train_keep = np.asarray(y_train != 0)
    test_keep = np.asarray(y_test != 0)
    X_train = X_train[train_keep]
    y_train = y_train[train_keep]
    X_test = X_test[test_keep]
    y_test = y_test[test_keep]
    print("Zero class removed.")

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)
print("X_test shape: ", X_test.shape)
print("y_test shape: ", y_test.shape)
    

In [None]:
# How many training samples are there in each class
class_labels = y_train.unique()
for label in class_labels:
    print(f"Class {label}: ", (y_train == label).sum())
print(class_labels)

In [None]:
def plot_results(y_test, y_pred):
    """
    Draw the histograms of the actual and the predicted values on one figure.

    Parameters:
    ----------
    y_test: numpy ndarray
        The actual values of the test set.
    y_pred: numpy ndarray
        The predicted values of the test set.
    """
    fig, ax = plt.subplots(figsize=(20, 8))
    # Both series use 20 bins; the predicted histogram is drawn second
    for values, series_label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
        ax.hist(values, bins=20, label=series_label)
    ax.legend()
    ax.set_title('Predicted vs Actual')
    ax.set_ylabel('Total spend')
    plt.show()

def regression_repot(y_test, y_pred):
    """
    Print the regression errors on the original target scale and plot the results.

    The module-level `scaler` is used to undo the target scaling before the mean
    squared error and the mean absolute error are printed (in that order).

    Parameters:
    ----------
    y_test: pandas Series
        The actual (scaled) values of the test set; read through `.values`.
    y_pred: numpy ndarray
        The predicted (scaled) values of the test set.
    """
    actual = scaler.inverse_transform(y_test.values.reshape(-1, 1))
    predicted = scaler.inverse_transform(y_pred.reshape(-1, 1))
    for error_metric in (mean_squared_error, mean_absolute_error):
        print(error_metric(actual, predicted))
    plot_results(actual, predicted)

In [None]:
# Early stopping and best-model checkpointing for the LSTM models
class TerminateOnBaseline(Callback):
    """Callback that stops training once the monitored value is at or below a baseline.

    Parameters:
    ----------
    monitor: string
        Key of the value to read from the epoch logs (default 'val_loss').
    baseline: float
        Training stops when the monitored value is <= this number.
    """
    def __init__(self, monitor='val_loss', baseline=0.001):
        super().__init__()
        self.monitor = monitor
        self.baseline = baseline

    def on_epoch_end(self, epoch, logs=None):
        # The metric is absent when no logs are given or the key is not reported
        current = (logs or {}).get(self.monitor)
        if current is not None and current <= self.baseline:
            print('Epoch %d: Reached baseline, terminating training' % (epoch))
            self.model.stop_training = True

# Write the model to model.h5 whenever val_loss improves
save_best = keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

ML and DL Train and Test

In [None]:
# Classical regression models; each is fitted on the training split and
# reported on the test split through regression_repot
if regression_mode:
    elcv = ElasticNetCV(cv=5)
    elcv.fit(X_train, y_train)
    y_pred = elcv.predict(X_test)
    regression_repot(y_test, y_pred)

    print("XGBoost")
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # 'reg:squarederror' and 'verbosity' replace the deprecated 'reg:linear'
    # and 'silent'. 'n_estimators' was removed because xgb.train ignores it:
    # the number of trees is set by num_boost_round.
    params = {'subsample': 0.1,
            'colsample_bytree': 0.1,
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'max_depth': 50,
            'verbosity': 0,
            'learning_rate': 0.1,
            'nthread': 4,
            }
    # NOTE(review): only 2 boosting rounds are trained (unchanged from before);
    # raise num_boost_round if a larger ensemble was intended.
    bst = xgb.train(params, dtrain, num_boost_round=2)
    y_pred = bst.predict(dtest)
    regression_repot(y_test, y_pred)

    print("Linear Regression")
    reg = LinearRegression().fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    regression_repot(y_test, y_pred)

    print("******"*12)
    print("Kernel Ridge")
    krr = KernelRidge(alpha=1.0)
    krr.fit(X_train, y_train)
    y_pred = krr.predict(X_test)
    regression_repot(y_test, y_pred)

    print("******"*12)
    # Results get their own names so the imported `models` module is not overwritten
    lazy_regressor = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
    regressor_scores, regressor_predictions = lazy_regressor.fit(X_train, X_test, y_train, y_test)
    print(regressor_scores)
    print("*******"*12)

In [None]:
# LSTM model for regression mode: trains on the scaled features, reloads the
# best checkpoint and reports the test errors
if regression_mode:
    number_of_epochs = 5
    batch_size = 128
    validation_split = 0.2
    # Append a trailing axis of size 1, giving a 3-D input for the LSTM layer.
    # NOTE(review): `.values` assumes X_train/X_test are still dataframes; after
    # PCA they are numpy arrays — confirm PCA is not combined with this cell.
    x_train = np.expand_dims(X_train.values, axis=2)
    x_test = np.expand_dims(X_test.values, axis=2)

    # One LSTM layer followed by a single-unit output layer
    model = Sequential()
    model.add(LSTM(256, input_shape=(x_train.shape[1],x_train.shape[2]), activation='relu', return_sequences = False))
    model.add(Dense(1))
    model.compile(optimizer='sgd', loss='mse', metrics=['mae'])
    model.summary()
    # Training stops early once val_loss <= 0.0005; save_best checkpoints the
    # model to model.h5 whenever val_loss improves
    history = model.fit(x_train, y_train, batch_size=batch_size, epochs=number_of_epochs, shuffle=True, verbose=1,
                        validation_split=validation_split,
                        callbacks=[TerminateOnBaseline(monitor='val_loss', baseline=0.0005), save_best]).history

    # Evaluate with the checkpointed model rather than the last-epoch weights
    model = load_model('./model.h5')
    y_pred = model.predict(x_test)
    regression_repot(y_test, y_pred)

In [36]:
# Classification models: KNN, XGBoost, LSTM and a LazyClassifier sweep; each is
# fitted on the training split and reported on the test split
if classification_mode:
    # Number of label values covering both splits. It fixes the one-hot width
    # and XGBoost's num_class, so train and test always agree.
    n_labels = int(max(np.max(y_train), np.max(y_test))) + 1

    print("KNN")
    # scikit-learn expects a 1-D label vector, not a column vector
    knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, np.ravel(y_train))
    y_pred = knn.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    print("******"*12)
    print("XGboost model")
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)
    # A multi-class objective replaces the former regression objective, whose
    # output was thresholded at 0.5 and could only ever yield class 0 or 1.
    # 'verbosity' replaces the deprecated 'silent'. 'n_estimators' was removed
    # because xgb.train ignores it: the tree count is set by num_boost_round.
    params = {'subsample': 0.1,
            'colsample_bytree': 0.1,
            'objective': 'multi:softmax',
            'num_class': n_labels,
            'eval_metric': 'mlogloss',
            'max_depth': 50,
            'verbosity': 0,
            'learning_rate': 0.1,
            'nthread': 4,
            }
    # NOTE(review): only 2 boosting rounds are trained (unchanged from before)
    bst = xgb.train(params, dtrain, num_boost_round=2)
    # multi:softmax returns the predicted class id as a float
    y_pred = bst.predict(dtest).astype(int)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    print("******"*12)
    print("LSTM model")
    # Model configuration
    additional_metrics = ['accuracy']
    batch_size = 128
    loss_function = CategoricalCrossentropy()
    number_of_epochs = 5
    optimizer = Adam()
    validation_split = 0.2
    verbosity_mode = 1
    # One-hot targets with the same width for both splits
    y_train_cat = tf.keras.utils.to_categorical(y_train, num_classes=n_labels)
    y_test_cat = tf.keras.utils.to_categorical(y_test, num_classes=n_labels)
    # 3-D inputs with a trailing axis of size 1, shared by fit, evaluate and predict
    x_train_seq = np.expand_dims(np.asarray(X_train), 2)
    x_test_seq = np.expand_dims(np.asarray(X_test), 2)
    # Define the Keras model
    model = Sequential()
    model.add(LSTM(256, return_sequences=True))
    model.add(LSTM(32, activation ='relu'))
    model.add(Dense(y_train_cat.shape[1], activation='softmax'))
    # Compile the model
    model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)
    # Train the model; stops early once val_loss <= 0.0005 and checkpoints the
    # best model to model.h5
    history = model.fit(x_train_seq, y_train_cat, batch_size=batch_size,
                epochs=number_of_epochs, verbose=verbosity_mode, validation_split=validation_split, shuffle=True,
                callbacks=[TerminateOnBaseline(monitor='val_loss', baseline=0.0005), save_best]).history
    # Give a summary
    model.summary()
    # Test the checkpointed model after training
    model = load_model('./model.h5')
    test_results = model.evaluate(x_test_seq, y_test_cat, verbose=False)
    print(f'Test results - Loss: {test_results[0]} - Accuracy: {100*test_results[1]}%')
    # Predict on the same 3-D input that was used for training and evaluation
    y_pred = model.predict(x_test_seq)
    print(confusion_matrix(y_test_cat.argmax(axis=1), y_pred.argmax(axis=1)))
    print(classification_report(y_test_cat.argmax(axis=1), y_pred.argmax(axis=1)))

    print("******"*12)
    # Results get their own names so the imported `models` module is not overwritten
    clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    classifier_scores, classifier_predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(classifier_scores)


In [None]:
# Wall-clock time elapsed since `start` was recorded in the parameters cell
print(f'Time taken: {time.time() - start} seconds')