# Importing the Data

In [1]:
import re

# to handle datasets
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt

# to divide train and test set
from sklearn.model_selection import train_test_split

# feature scaling
from sklearn.preprocessing import StandardScaler

# to build the models
from sklearn.linear_model import LogisticRegression

# to evaluate the models
from sklearn.metrics import accuracy_score, roc_auc_score

# to persist the model and the scaler
import joblib

# show every column when a dataframe is displayed (no column truncation);
# uses the documented top-level pd.set_option rather than the incidental
# pd.pandas self-reference
pd.set_option('display.max_columns', None)

In [6]:
# Read the four source CSVs from the working directory into dataframes
# (per the original note, the data set is open source and available online).
iaps, sessions, spendevents, users = map(
    pd.read_csv,
    ('iaps.csv', 'sessions.csv', 'spendevents.csv', 'users.csv'),
)

# Exploratory Analysis

## IAPS Analysis

In [12]:
# preview the first 100 rows of the in-app purchase table
iaps.head(n=100)

Unnamed: 0,user_id,ts,date,prod_name,prod_type,rev
0,7480,2019-03-04 08:15:49,2019-03-04,iap_1_gems_2,gems,760
1,7480,2019-03-04 08:24:15,2019-03-04,iap_1_passes_2,chapterPasses,760
2,7480,2019-03-04 22:49:08,2019-03-04,iap_1_gems_1,gems,410
3,2466,2019-03-06 00:16:48,2019-03-06,iap_1_gems_2,gems,760
4,22001,2019-03-06 09:13:45,2019-03-06,iap_1_gems_2,gems,760
...,...,...,...,...,...,...
95,16090,2019-03-02 10:03:17,2019-03-02,iap_1_passes_1,chapterPasses,410
96,16090,2019-03-02 10:13:11,2019-03-02,iap_1_passes_2,chapterPasses,760
97,16090,2019-03-02 10:45:18,2019-03-02,iap_1_gems_2,gems,760
98,16090,2019-03-02 10:51:33,2019-03-02,iap_1_passes_1,chapterPasses,410


In [20]:
# number of purchase rows per product name, most frequent first
iaps.loc[:, 'prod_name'].value_counts()

iap_1_gems_2          2599
iap_1_passes_2        1064
iap_1_gems_5           976
iap_1_gems_1           693
iap_1_passes_1         576
iap_1_gems_10          231
iap_1_passes_5         226
iap_1_passes_10        163
iap_1_gems_20           69
iap_value_pack_004      46
iap_value_pack_008      13
iap_value_pack_001      12
iap_value_pack_005       5
iap_1_gems_50            3
iap_value_pack_010       3
iap_value_pack_003       2
iap_1_gems_100           2
iap_value_pack_007       1
iap_value_pack_009       1
Name: prod_name, dtype: int64

In [21]:
# Count purchase rows per calendar date; the dates appearing in the output
# suggest the purchase data runs from March to May 2019.
iaps.loc[:, 'date'].value_counts()

2019-03-06    873
2019-03-07    766
2019-03-05    653
2019-03-08    451
2019-03-04    433
             ... 
2019-05-05      6
2019-05-02      4
2019-05-03      4
2019-05-04      3
2019-05-06      1
Name: date, Length: 67, dtype: int64

In [7]:
# preview the first five rows of the sessions table
sessions.head(n=5)

Unnamed: 0,user_id,ts,date,session_num,last_session_termination_type
0,14067,2019-03-01 00:06:50,2019-03-01,1,
1,14067,2019-03-01 00:22:27,2019-03-01,2,
2,16275,2019-03-01 01:23:03,2019-03-01,1,
3,16275,2019-03-01 01:31:16,2019-03-01,2,
4,16275,2019-03-01 01:47:22,2019-03-01,3,


In [8]:
# preview the first five rows of the spend-events table
spendevents.head(n=5)

Unnamed: 0,user_id,ts,date,story,chapter,spendtype,currency,amount
0,9829,2019-03-01 03:03:04,2019-03-01,story_1,0,earnGemsCounter,gems,-22
1,13757,2019-03-01 03:35:53,2019-03-01,story_1,0,earnGemsCounter,gems,-22
2,13757,2019-03-01 03:52:10,2019-03-01,story_2,0,earnGemsCounter,gems,-22
3,10009,2019-03-01 04:10:00,2019-03-01,story_1,0,earnGemsCounter,gems,-22
4,10009,2019-03-01 04:26:46,2019-03-01,story_2,0,earnGemsCounter,gems,-22


In [9]:
# preview the first five rows of the users table
users.head(n=5)

Unnamed: 0,user_id,install_date,lang,country,hw_ver,os_ver
0,0,2019-03-01,en,US,"iPhone4,1",9.1
1,1,2019-03-01,en,IN,"iPod5,1",8.1.2
2,2,2019-03-06,en,US,"iPod7,1",8.4.1
3,3,2019-03-03,nb,NO,"iPhone8,1",9.2.1
4,4,2019-03-03,en,GB,"iPhone5,4",9.2.1


In [10]:
# row counts of the four tables, one per line, in the order:
# iaps, sessions, spendevents, users
print(len(iaps), len(sessions), len(spendevents), len(users), sep='\n')

6685
722955
107764
22576


In [16]:
# Outer-join purchases with users on user_id so that rows from both tables are
# kept even when the other side has no match, then report the resulting row count.
df1 = iaps.merge(users, on='user_id', how='outer')
print(df1.shape[0])

27735


In [None]:
# Combine all four tables on user_id with outer joins.
#
# pd.merge joins exactly two frames per call (left, right); passing four frames
# positionally raises a TypeError because the third and fourth land on the
# `how` / `on` parameters, which are also given as keywords. The tables are
# therefore merged pairwise, in the order iaps -> sessions -> spendevents -> users.
#
# Several tables share non-key column names (e.g. 'ts', 'date', 'Unnamed: 0'),
# so every column except the join key is prefixed with its table name first;
# this keeps the origin of each column unambiguous in the merged frame.
#
# NOTE(review): joining event-level tables on user_id alone pairs every purchase
# row with every session row and every spend row of the same user, so the result
# can be much larger than any input — confirm this is intended, or aggregate
# each table per user before merging.
tables = {
    'iaps': iaps,
    'sessions': sessions,
    'spendevents': spendevents,
    'users': users,
}

df = None
for table_name, table in tables.items():
    prefixed = table.rename(
        columns=lambda col, prefix=table_name: col if col == 'user_id' else f'{prefix}_{col}'
    )
    # first table seeds the result; every later table is outer-joined onto it
    df = prefixed if df is None else pd.merge(df, prefixed, on='user_id', how='outer')