In [1]:
import numpy as np
import pandas as pd
import os, os.path
from sklearn.ensemble import RandomForestClassifier

In [2]:
DIR = "/home/gtrindadi/jds_june_2019/data-sendatree/"

# minus 1 because the last file is note the daily data
n_files = len([name for name in os.listdir(DIR+"data-registration/") 
               if os.path.isfile(os.path.join(DIR+"data-registration/", name))]) - 1
print (n_files)

273


# Registrations

In [3]:
column_names_reg = ['file_name', 'date', 'time', 'user_id', 'event', 
                    'birth_year', 'device_type', 'country', 'source']

registrations = pd.DataFrame()

# this activity was made when we were with 189 files (days)
for n in range(1, n_files+1):
    aux = pd.read_csv(DIR+'data-registration/day_'+str(n), delimiter=' ', names=column_names_reg)
    registrations = pd.concat([registrations, aux])

In [4]:
registrations = registrations.reset_index(drop='True')

In [5]:
registrations.tail()

Unnamed: 0,file_name,date,time,user_id,event,birth_year,device_type,country,source
284417,day_273,2019-10-17,23:58:40,1284418,registration,1991,ios,philippines,article
284418,day_273,2019-10-17,23:58:57,1284419,registration,1989,ios,brazil,paid
284419,day_273,2019-10-17,23:59:08,1284420,registration,1985,android,sweden,google
284420,day_273,2019-10-17,23:59:22,1284421,registration,1991,error,united_states,invite_a_friend
284421,day_273,2019-10-17,23:59:47,1284422,registration,1987,error,united_states,invite_a_friend


# Free Tree Sends

In [6]:
column_names_sends = ['file_name', 'date', 'user_id', 'event']

free_tree = pd.read_csv(DIR+'data-free-tree/free-tree-all-data.csv', delimiter=' ', names=column_names_sends)

In [7]:
free_tree.tail()

Unnamed: 0,file_name,date,user_id,event
2595564,day_273,2019-10-17,1284422,sent_a_free_tree
2595565,day_273,2019-10-17,1284422,sent_a_free_tree
2595566,day_273,2019-10-17,1284422,sent_a_free_tree
2595567,day_273,2019-10-17,1284422,sent_a_free_tree
2595568,day_273,2019-10-17,1284422,sent_a_free_tree


In [8]:
free_tree.count()

file_name    2595569
date         2595569
user_id      2595569
event        2595569
dtype: int64

## Super Tree Sends

In [9]:
super_tree = pd.DataFrame()

# this activity was made when we were with 189 files (days)
for n in range(1, n_files+1):
    aux = pd.read_csv(DIR+'data-super-tree/day_'+str(n), delimiter=' ', names=column_names_sends)
    super_tree = pd.concat([super_tree, aux])

In [10]:
super_tree = super_tree.reset_index(drop=True)

In [11]:
super_tree.tail()

Unnamed: 0,file_name,date,user_id,event
175838,day_273,2019-10-17,1277650,sent_a_super_tree
175839,day_273,2019-10-17,1275865,sent_a_super_tree
175840,day_273,2019-10-17,1281894,sent_a_super_tree
175841,day_273,2019-10-17,1281894,sent_a_super_tree
175842,day_273,2019-10-17,1281894,sent_a_super_tree


In [12]:
super_tree.count()

file_name    175843
date         175843
user_id      175843
event        175843
dtype: int64

# Figuring out the real quantity of the error values -- in two ways

In [13]:
registrations.groupby('device_type').count()[['file_name']]

Unnamed: 0_level_0,file_name
device_type,Unnamed: 1_level_1
android,124249
error,110479
ios,49694


In [14]:
registrations[registrations.device_type == 'error'].count()['file_name']

110479

# Figuring out the real values of the error values

1: preparing my data <br>
2: train and test with the data that have a real device_type (android or ios) <br>
3: see the accuracy of this model <br>
4: adjust until you have a nice accuracy <br>
5: apply the model for error cases

## Preparing my data (step 1)

1: get the free_tree_count and super_tree_count <br>
2: get only important columns (user_id, device_type, source) <br>
3: merge the registrations, free_tree_count, super_tree_count dfs <br>
4: turn the 'NaN' values into 0 <br>
5: turn the device_type and source values in number <br>
6: separate the data in known_values (ios and android) and unknown_values (error) of device column

In [15]:
free_tree_sends = free_tree.groupby('user_id').count()[['file_name']]
free_tree_sends = free_tree_sends.rename(columns={'file_name':'free_sends'})
free_tree_sends = free_tree_sends.reset_index()
free_tree_sends.head()

Unnamed: 0,user_id,free_sends
0,1000001,5
1,1000002,4
2,1000003,37
3,1000005,6
4,1000006,4


In [16]:
super_tree_sends = super_tree.groupby('user_id').count()[['file_name']]
super_tree_sends = super_tree_sends.rename(columns={'file_name':'super_sends'})
super_tree_sends = super_tree_sends.reset_index()
super_tree_sends.head()

Unnamed: 0,user_id,super_sends
0,1000007,1
1,1000010,6
2,1000011,2
3,1000013,1
4,1000019,1


In [17]:
all_data = registrations[['user_id', 'device_type', 'source']].merge(
    free_tree_sends, how='left', left_on='user_id', right_on='user_id')

all_data.head()

Unnamed: 0,user_id,device_type,source,free_sends
0,1000001,android,invite_a_friend,5.0
1,1000002,ios,invite_a_friend,4.0
2,1000003,error,invite_a_friend,37.0
3,1000004,error,invite_a_friend,
4,1000005,ios,invite_a_friend,6.0


In [18]:
all_data = all_data.fillna('0')

all_data.head()

Unnamed: 0,user_id,device_type,source,free_sends
0,1000001,android,invite_a_friend,5
1,1000002,ios,invite_a_friend,4
2,1000003,error,invite_a_friend,37
3,1000004,error,invite_a_friend,0
4,1000005,ios,invite_a_friend,6


In [19]:
all_data = all_data.merge(super_tree_sends, how='left', 
                          left_on='user_id', right_on='user_id').fillna('0')

all_data.tail()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
284417,1284418,ios,article,1,0
284418,1284419,ios,paid,0,0
284419,1284420,android,google,1,0
284420,1284421,error,invite_a_friend,9,4
284421,1284422,error,invite_a_friend,5,0


In [53]:
# replace device_type values
# error to 0
# android to 1
# ios to 2

data_replaced = all_data.replace('ios', 2)
data_replaced = data_replaced.replace('android', 1)
data_replaced = data_replaced.replace('error', 0)

data_replaced.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
0,1000001,1,invite_a_friend,5,0
1,1000002,2,invite_a_friend,4,0
2,1000003,0,invite_a_friend,37,0
3,1000004,0,invite_a_friend,0,0
4,1000005,2,invite_a_friend,6,0


In [54]:
all_data.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
0,1000001,android,invite_a_friend,5,0
1,1000002,ios,invite_a_friend,4,0
2,1000003,error,invite_a_friend,37,0
3,1000004,error,invite_a_friend,0,0
4,1000005,ios,invite_a_friend,6,0


In [55]:
# replace source values
# invite_a_friend to 0
# google to 1
# article to 2
# paid to 3

data_replaced = data_replaced.replace('invite_a_friend', 0)
data_replaced = data_replaced.replace('google', 1)
data_replaced = data_replaced.replace('article', 2)
data_replaced = data_replaced.replace('paid', 3)

data_replaced.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
0,1000001,1,0,5,0
1,1000002,2,0,4,0
2,1000003,0,0,37,0
3,1000004,0,0,0,0
4,1000005,2,0,6,0


In [56]:
# creating a dataframe with observations that have a real device_type
known_values_device = data_replaced[(data_replaced.device_type==1) |
                                    (data_replaced.device_type==2)]

known_values_device.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
0,1000001,1,0,5,0
1,1000002,2,0,4,0
4,1000005,2,0,6,0
5,1000006,1,0,4,0
6,1000007,1,0,11,1


In [57]:
# creating a dataframe with observations that have a error device_type
unknown_values_device = data_replaced[(data_replaced.device_type==0)]

unknown_values_device.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
2,1000003,0,0,37,0
3,1000004,0,0,0,0
9,1000010,0,0,1,6
10,1000011,0,0,11,2
11,1000012,0,0,35,0


## Machine Learning stuff (from step 2 onwards)

In [58]:
#device_type_dictionary = {'android': 1, 'ios': 2}
#source_dictionary = {'invite_a_friend': 0, 'google': 1, 'article': 2, 'paid': 3}

In [60]:
#known_values_device['device_type'] = known_values_device['device_type'].map(device_type_dictionary)
#known_values_device['source'] = known_values_device['source'].map(source_dictionary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [61]:
x = known_values_device[['free_sends', 'super_sends', 'source']]
y = known_values_device['device_type']
model = RandomForestClassifier(n_estimators=100)
model = model.fit(x,y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [62]:
known_values_device.head()

Unnamed: 0,user_id,device_type,source,free_sends,super_sends
0,1000001,,,5,0
1,1000002,,,4,0
4,1000005,,,6,0
5,1000006,,,4,0
6,1000007,,,11,1
