<div class="alert alert-success alertinfo" style="margin-top: 0px">
<h1>  Part 3. Data Modelling </h1>    
</div>

# 1. Imports

In [1]:
# Turning off warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Standard library
import os
import random
import sys
from pathlib import Path

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names, so use whichever name this
# installation provides. If neither is listed, the legacy name is passed
# through so the failure looks exactly like it did before.
plt.style.use(next(
    (style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style in plt.style.available),
    'seaborn-whitegrid',
))

# Import data
# The folder defaults to the original location; set the DATA_DIR environment
# variable to run the notebook against a copy of the data stored elsewhere.
DATA_DIR = Path(os.environ.get(
    'DATA_DIR', r"C:\Users\giuse\Desktop\job seeking\DATAscience_interview"))
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

# 2. Data View

In [3]:
# Preview the first five rows of the training split
train.head(n=5)

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,BET_AMOUNT_IN_CURRENCY,BET_AMOUNT_IN_EUR,BET_COUNT,USER_CODE,GAME_CODE,unique
0,1,BTC,Y,Slotomon Go,slots,enigmatic,1.5e-05,0.151416,1.0,0,2027,1
1,5,EUR,N,Starburst,slots,netent,36.1,36.1,10.0,1,2085,34
2,181,DOG,Y,Stellar Spins,slots,booming,13200.0,3.56349,169.0,2,2096,50
3,1939,BTC,Y,Show me the Mummy,slots,booming,0.00796,78.896458,518.0,3,1992,67
4,6784,BTC,Y,Wild Diamond 7x,slots,booming,0.00374,11.34615,187.0,4,2475,76


In [4]:
# Preview the first five rows of the test split
test.head(n=5)

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,BET_AMOUNT_IN_CURRENCY,BET_AMOUNT_IN_EUR,BET_COUNT,USER_CODE,GAME_CODE,unique
0,5,BTC,Y,Bac Agin,card,asiagaming,0.024,75.11976,1.0,1,198,3
1,5,BTC,Y,Fire Lightning,slots,enigmatic,0.01386,46.762731,48.0,1,839,7
2,5,BTC,Y,Golden Tiger,slots,isoftbet,0.0003,2.835756,3.0,1,1027,9
3,5,BTC,Y,Immersive Roulette,roulette,evolution,0.01,112.2585,1.0,1,1168,10
4,5,BTC,Y,Local Pub,slots,belatra,0.0004,1.337264,4.0,1,1385,12


# 3. Data Modelling

## 3a Investigating bets and creating implicit feedback

<font size="4">
    Recommendation systems are a great revenue driver, but they are tricky to implement for iGaming. Normally, when building a recommender system, we would rely on explicit feedback: data that carries some sort of rating, like the 1-to-5 star ratings in the Netflix dataset. With explicit feedback we know how much a user likes or dislikes an item; for example, a 1-star rating means the user did not like the item and a 5-star rating means they really loved it. In iGaming there is no such rating: a user who played a game may have hated it, loved it, or felt somewhere in between. And if they did not play a game, it may be because they don’t like it, or they might love it if they only knew about it.</font> 
<br></br>
<br></br>
<font size="4"> 
    So here we are dealing with implicit feedback, which is vaguer and harder to interpret. We will focus on what we do know, namely the bet count and the bet amount, and on the confidence we have in whether or not a player likes a given game. We assume higher confidence if the player has placed many bets rather than a single one, and we also give more weight to the feedback if the player has placed large bets. </font>  

In [5]:
def outliers_info(bounds_calculated_on, outlier_search_in, feature, n_std=3, name=None):
    """Print a summary of the values of ``feature`` that lie far from the mean.

    The bounds are ``mean - n_std * std`` and ``mean + n_std * std``, where
    ``mean`` and ``std`` come from ``bounds_calculated_on[feature]``. Rows of
    ``outlier_search_in`` whose value is strictly below the lower bound or
    strictly above the upper bound are counted as outliers.

    Side effects: every call overwrites the module-level names
    ``lower_bound``, ``upper_bound`` and ``outlier_indices`` (index labels of
    the outlying rows, lower-end outliers first).

    Parameters
    ----------
    bounds_calculated_on : frame used to compute the mean and standard
        deviation (a pandas DataFrame in this notebook).
    outlier_search_in : frame that is searched for outliers.
    feature : column label to analyse.
    n_std : number of standard deviations that defines the bounds
        (default 3, the value that was previously hard-coded).
    name : label printed for ``outlier_search_in``. When omitted, the first
        public module-level name bound to that exact object is used.

    Returns
    -------
    None
    """
    global lower_bound, upper_bound, outlier_indices

    # defining bounds
    mean = bounds_calculated_on[feature].mean()
    std = bounds_calculated_on[feature].std()
    lower_bound = mean - (n_std * std)
    upper_bound = mean + (n_std * std)

    # searching for outliers
    values = outlier_search_in[feature]
    index_list_lower = outlier_search_in[values < lower_bound].index
    index_list_upper = outlier_search_in[values > upper_bound].index
    how_many_outliers_in_lower = len(index_list_lower)
    how_many_outliers_in_upper = len(index_list_upper)

    # publishing the outliers' indexes as a module-level list
    outlier_indices = [*index_list_lower, *index_list_upper]

    # getting name of data frame as a string
    if name is None:
        # Iterate over a snapshot of the namespace and fall back to a
        # placeholder: the frame may be an unnamed expression or live in
        # another module, which previously raised IndexError here.
        # Underscore-prefixed names are skipped so interpreter bookkeeping
        # variables are never reported as the frame's name.
        name = next(
            (key for key, value in list(globals().items())
             if value is outlier_search_in and not key.startswith('_')),
            '<unnamed DataFrame>',
        )

    print("Outliers detected in {} for feature {}:".format(name,feature))
    print("_______________________________________________________________")
    print("The lower bound value is {}. Number of outliers found in lower end: {}".format(lower_bound,how_many_outliers_in_lower))
    print("_______________________________________________________________")
    print("The upper bound value is {}. Number of outliers found in upper end: {}".format(upper_bound,how_many_outliers_in_upper))
    print("_______________________________________________________________")
    print("The mean {} is {}".format(feature,mean))
    print("_______________________________________________________________")
    print("The standard deviation of {} is {}".format(feature,std))
    

### Bet count

In [6]:
# Select the raw bet-count column and the name of the score column derived
# from it, then report outliers using bounds computed on the training data
feature, new_feature = 'BET_COUNT', 'FEEDBACK_bet_count'
outliers_info(bounds_calculated_on=train, outlier_search_in=train, feature=feature)

Outliers detected in train for feature BET_COUNT:
_______________________________________________________________
The lower bound value is -10112.380201656004. Number of outliers found in lower end: 0
_______________________________________________________________
The upper bound value is 11001.622576450762. Number of outliers found in upper end: 4039
_______________________________________________________________
The mean BET_COUNT is 444.62118739737923
_______________________________________________________________
The standard deviation of BET_COUNT is 3519.000463017794


In [7]:
# Score thresholds: the training mean and one, two and three standard
# deviations above it. Mean and std are computed once and reused instead of
# being recalculated for every threshold.
feature_mean = train[feature].mean()
feature_std = train[feature].std()

average = feature_mean
plus_one = feature_mean + feature_std
plus_two = feature_mean + (2 * feature_std)
plus_three = feature_mean + (3 * feature_std)

In [8]:
# Seed the score column in both splits with a copy of the raw feature values
train[new_feature] = train[feature]
test[new_feature] = test[feature]

def feedback(row):
    """Map the value stored under ``new_feature`` in ``row`` to a feedback score.

    Reads the module-level names ``new_feature``, ``average``, ``plus_one``,
    ``plus_two`` and ``plus_three`` at call time, so the result depends on
    which feature's thresholds were computed last.

    Returns the score of the first threshold the value strictly exceeds:
    5 (> plus_three), 4 (> plus_two), 3 (> plus_one), 2 (> average),
    1 (> half of average), -1 (> 0); otherwise 0, which covers zero,
    negative and NaN values.
    """
    value = row[new_feature]

    # Ordered from the highest threshold down; the first match wins.
    # NOTE(review): small positive values score -1, i.e. below the 0 given to
    # zero or missing values — confirm this ordering is intended.
    ladder = (
        (plus_three, 5),
        (plus_two, 4),
        (plus_one, 3),
        (average, 2),
        (0.5 * average, 1),
        (0, -1),
    )
    for threshold, score in ladder:
        if value > threshold:
            return score
    return 0

# Convert the copied values to scores row by row, then discard the raw
# feature column from both splits
train[new_feature] = train.apply(feedback, axis='columns')
test[new_feature] = test.apply(feedback, axis='columns')

train = train.drop(columns=[feature])
test = test.drop(columns=[feature])

train.head()

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,BET_AMOUNT_IN_CURRENCY,BET_AMOUNT_IN_EUR,USER_CODE,GAME_CODE,unique,FEEDBACK_bet_count
0,1,BTC,Y,Slotomon Go,slots,enigmatic,1.5e-05,0.151416,0,2027,1,-1
1,5,EUR,N,Starburst,slots,netent,36.1,36.1,1,2085,34,-1
2,181,DOG,Y,Stellar Spins,slots,booming,13200.0,3.56349,2,2096,50,-1
3,1939,BTC,Y,Show me the Mummy,slots,booming,0.00796,78.896458,3,1992,67,2
4,6784,BTC,Y,Wild Diamond 7x,slots,booming,0.00374,11.34615,4,2475,76,-1


### Bet amount in Euro

In [9]:
# Select the euro bet-amount column and the name of the score column derived
# from it, then report outliers using bounds computed on the training data
feature, new_feature = 'BET_AMOUNT_IN_EUR', 'FEEDBACK_bet_amount_euro'
outliers_info(bounds_calculated_on=train, outlier_search_in=train, feature=feature)

Outliers detected in train for feature BET_AMOUNT_IN_EUR:
_______________________________________________________________
The lower bound value is -73365.01035956226. Number of outliers found in lower end: 0
_______________________________________________________________
The upper bound value is 75062.89978560584. Number of outliers found in upper end: 961
_______________________________________________________________
The mean BET_AMOUNT_IN_EUR is 848.9447130217833
_______________________________________________________________
The standard deviation of BET_AMOUNT_IN_EUR is 24737.985024194684


In [10]:
# Score thresholds: the training mean and one, two and three standard
# deviations above it. Mean and std are computed once and reused instead of
# being recalculated for every threshold.
feature_mean = train[feature].mean()
feature_std = train[feature].std()

average = feature_mean
plus_one = feature_mean + feature_std
plus_two = feature_mean + (2 * feature_std)
plus_three = feature_mean + (3 * feature_std)

In [11]:
# Score both splits: seed the new column with the raw values, convert each
# row with feedback(), then drop the raw feature column
train[new_feature] = train[feature]
test[new_feature] = test[feature]

train[new_feature] = train.apply(feedback, axis='columns')
test[new_feature] = test.apply(feedback, axis='columns')

train = train.drop(columns=[feature])
test = test.drop(columns=[feature])

train.head()

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,BET_AMOUNT_IN_CURRENCY,USER_CODE,GAME_CODE,unique,FEEDBACK_bet_count,FEEDBACK_bet_amount_euro
0,1,BTC,Y,Slotomon Go,slots,enigmatic,1.5e-05,0,2027,1,-1,-1
1,5,EUR,N,Starburst,slots,netent,36.1,1,2085,34,-1,-1
2,181,DOG,Y,Stellar Spins,slots,booming,13200.0,2,2096,50,-1,-1
3,1939,BTC,Y,Show me the Mummy,slots,booming,0.00796,3,1992,67,2,-1
4,6784,BTC,Y,Wild Diamond 7x,slots,booming,0.00374,4,2475,76,-1,-1


### Bet amount in Currency

In [12]:
# Select the native-currency bet-amount column and the name of the score
# column derived from it, then report outliers using bounds computed on the
# training data
feature, new_feature = 'BET_AMOUNT_IN_CURRENCY', 'FEEDBACK_bet_amount_currency'
outliers_info(bounds_calculated_on=train, outlier_search_in=train, feature=feature)

Outliers detected in train for feature BET_AMOUNT_IN_CURRENCY:
_______________________________________________________________
The lower bound value is -368768.61725547485. Number of outliers found in lower end: 0
_______________________________________________________________
The upper bound value is 376911.75627562095. Number of outliers found in upper end: 1024
_______________________________________________________________
The mean BET_AMOUNT_IN_CURRENCY is 4071.5695100730686
_______________________________________________________________
The standard deviation of BET_AMOUNT_IN_CURRENCY is 124280.06225518264


In [13]:
# Score thresholds: the training mean and one, two and three standard
# deviations above it. Mean and std are computed once and reused instead of
# being recalculated for every threshold.
feature_mean = train[feature].mean()
feature_std = train[feature].std()

average = feature_mean
plus_one = feature_mean + feature_std
plus_two = feature_mean + (2 * feature_std)
plus_three = feature_mean + (3 * feature_std)

In [14]:
# Score both splits: seed the new column with the raw values, convert each
# row with feedback(), then drop the raw feature column
train[new_feature] = train[feature]
test[new_feature] = test[feature]

train[new_feature] = train.apply(feedback, axis='columns')
test[new_feature] = test.apply(feedback, axis='columns')

train = train.drop(columns=[feature])
test = test.drop(columns=[feature])

train.head()

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,USER_CODE,GAME_CODE,unique,FEEDBACK_bet_count,FEEDBACK_bet_amount_euro,FEEDBACK_bet_amount_currency
0,1,BTC,Y,Slotomon Go,slots,enigmatic,0,2027,1,-1,-1,-1
1,5,EUR,N,Starburst,slots,netent,1,2085,34,-1,-1,-1
2,181,DOG,Y,Stellar Spins,slots,booming,2,2096,50,-1,-1,2
3,1939,BTC,Y,Show me the Mummy,slots,booming,3,1992,67,2,-1,-1
4,6784,BTC,Y,Wild Diamond 7x,slots,booming,4,2475,76,-1,-1,-1


### Combining the feedback

In [15]:
# Columns holding the three per-feature scores that are merged below
FEEDBACK_COLUMNS = [
    'FEEDBACK_bet_count',
    'FEEDBACK_bet_amount_euro',
    'FEEDBACK_bet_amount_currency',
]


def combine_feedback(frame):
    """Return a copy of ``frame`` with a ``FEEDBACK`` column appended.

    ``FEEDBACK`` is the row-wise maximum of the three per-feature score
    columns listed in ``FEEDBACK_COLUMNS``. The input frame is not modified.

    Deliberately not called ``feedback``: reusing that name replaced the
    per-feature ``feedback(row)`` scorer, so the scoring cells could no longer
    be re-run after this cell. The maximum is also computed column-wise
    rather than by applying a Python function to every row.
    """
    return frame.assign(FEEDBACK=frame[FEEDBACK_COLUMNS].max(axis=1))


train_df = combine_feedback(train)
train = train_df

test_df = combine_feedback(test)
test = test_df

In [16]:
# Preview the training split with the combined FEEDBACK column
train.head(n=5)

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,USER_CODE,GAME_CODE,unique,FEEDBACK_bet_count,FEEDBACK_bet_amount_euro,FEEDBACK_bet_amount_currency,FEEDBACK
0,1,BTC,Y,Slotomon Go,slots,enigmatic,0,2027,1,-1,-1,-1,-1
1,5,EUR,N,Starburst,slots,netent,1,2085,34,-1,-1,-1,-1
2,181,DOG,Y,Stellar Spins,slots,booming,2,2096,50,-1,-1,2,2
3,1939,BTC,Y,Show me the Mummy,slots,booming,3,1992,67,2,-1,-1,2
4,6784,BTC,Y,Wild Diamond 7x,slots,booming,4,2475,76,-1,-1,-1,-1


In [17]:
# Preview the test split with the combined FEEDBACK column
test.head(n=5)

Unnamed: 0,USER_ID,CURRENCY,CURRENCY_IS_CRYPTO,GAME_TITLE,GAME_TYPE,GAME_PROVIDER,USER_CODE,GAME_CODE,unique,FEEDBACK_bet_count,FEEDBACK_bet_amount_euro,FEEDBACK_bet_amount_currency,FEEDBACK
0,5,BTC,Y,Bac Agin,card,asiagaming,1,198,3,-1,-1,-1,-1
1,5,BTC,Y,Fire Lightning,slots,enigmatic,1,839,7,-1,-1,-1,-1
2,5,BTC,Y,Golden Tiger,slots,isoftbet,1,1027,9,-1,-1,-1,-1
3,5,BTC,Y,Immersive Roulette,roulette,evolution,1,1168,10,-1,-1,-1,-1
4,5,BTC,Y,Local Pub,slots,belatra,1,1385,12,-1,-1,-1,-1


In [18]:
# Saving data
# Both splits are written to the current working directory without the
# DataFrame index column
train.to_csv(path_or_buf='train_modelled.csv', index=False)
test.to_csv(path_or_buf='test_modelled.csv', index=False)