In [1]:
import os
import pathlib
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
!ls ../input/

age_gender_bkts.csv	   save		   train_users_2.csv
countries.csv		   sessions.csv
sample_submission_NDF.csv  test_users.csv


In [4]:
def detect_nan_strings(df):
    """Report, per column, whether NaN was stored as a literal string.

    For every column of ``df`` a small report is printed that says whether any
    cell equals one of the textual spellings 'NAN', 'nan' or 'NaN'. Each line
    of the report names the spelling it refers to, so the three checks can be
    told apart in the output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    nan_spellings = ('NAN', 'nan', 'NaN')
    for column in df.columns:
        print("\n\ncolumn: {}:".format(column))
        print("-" * 20)
        try:
            for spelling in nan_spellings:
                # Element-wise comparison; real missing values compare unequal.
                found = bool((df[column] == spelling).any())
                print("is {!r} string:\n".format(spelling), found)
        except TypeError:
            # Kept from the original: some dtypes may refuse comparison with str.
            print("column {} not str type".format(column))

In [5]:
def uniques_count(df):
    """Count the occurrences of every distinct value in each column of ``df``.

    Returns a dict keyed by column name; each entry is a Series indexed by the
    column's distinct values holding how many rows carry that value.
    """
    counts_by_column = {}
    for column in df.columns:
        values = df[column]
        # Group the column by its own values and count the members of each group.
        counts_by_column[column] = values.groupby(by=values).count()
    return counts_by_column

In [6]:
age_gender_df = pd.read_csv('../input/age_gender_bkts.csv')

In [7]:
countries_df = pd.read_csv('../input/countries.csv')

In [8]:
sessions_df = pd.read_csv('../input/sessions.csv')

In [9]:
train_users_2_df = pd.read_csv('../input/train_users_2.csv')

In [10]:
test_users_df = pd.read_csv('../input/test_users.csv')

age_gender_df.info()

age_gender_df.head()

age_gender_df.describe()

countries_df.info()

countries_df

sessions_df.info()

sessions_df.head()

sessions_df.describe()

train_users_2_df.info()

train_users_2_df.head()

train_users_2_df.describe()

In [11]:
sessions_df_not_na = sessions_df.notna()

In [12]:
sessions_df.shape

(10567737, 6)

In [13]:
sessions_df.shape

(10567737, 6)

In [14]:
sessions_df.isna().sum()

user_id            34496
action             79626
action_type      1126204
action_detail    1126204
device_type            0
secs_elapsed      136031
dtype: int64

In [15]:
sessions_df.isna().sum() / len(sessions_df) * 100

user_id           0.326428
action            0.753482
action_type      10.657003
action_detail    10.657003
device_type       0.000000
secs_elapsed      1.287229
dtype: float64

# Print the frequency table of every sessions column under a small header.
for column in sessions_df.columns:
    print("\n\ncolumn: {}:".format(column), "-" * 20, sep="\n")
    print("value_counts:\n", sessions_df[column].value_counts())

In [16]:
train_users_2_df.shape

(213451, 16)

In [17]:
train_users_2_df.isna().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

In [18]:
train_users_2_df.isna().sum() / len(train_users_2_df) * 100

id                          0.000000
date_account_created        0.000000
timestamp_first_active      0.000000
date_first_booking         58.347349
gender                      0.000000
age                        41.222576
signup_method               0.000000
signup_flow                 0.000000
language                    0.000000
affiliate_channel           0.000000
affiliate_provider          0.000000
first_affiliate_tracked     2.841402
signup_app                  0.000000
first_device_type           0.000000
first_browser               0.000000
country_destination         0.000000
dtype: float64

# Print the frequency table of every users column under a small header.
for column in train_users_2_df.columns:
    print("\n\ncolumn: {}:".format(column), "-" * 20, sep="\n")
    print("value_counts:\n", train_users_2_df[column].value_counts())

In [19]:
sessions_df_unique_user_ids = set(sessions_df['user_id'].unique())

In [20]:
train_users_2_df_unique_user_ids = set(train_users_2_df['id'].unique())

In [21]:
len(sessions_df_unique_user_ids)

135484

In [22]:
len(train_users_2_df_unique_user_ids)

213451

In [23]:
len(sessions_df_unique_user_ids.union(train_users_2_df_unique_user_ids))

275120

In [24]:
len(sessions_df_unique_user_ids.difference(train_users_2_df_unique_user_ids))

61669

In [25]:
len(train_users_2_df_unique_user_ids.difference(sessions_df_unique_user_ids))

139636

In [26]:
len(train_users_2_df_unique_user_ids.intersection(sessions_df_unique_user_ids))

73815

Preliminary results:
1. We can see that some columns in the sessions data and the users data have significant numbers of NaN values.
sessions:
user_id           0.326428% (which is curious in itself)
action            0.753482%
action_type      10.657003%
action_detail    10.657003%
device_type       0.000000%
secs_elapsed      1.287229%

users:
id                          0.000000%
date_account_created        0.000000%
timestamp_first_active      0.000000%
date_first_booking         58.347349%
gender                      0.000000%
age                        41.222576%
signup_method               0.000000%
signup_flow                 0.000000%
language                    0.000000%
affiliate_channel           0.000000%
affiliate_provider          0.000000%
first_affiliate_tracked     2.841402%
signup_app                  0.000000%
first_device_type           0.000000%
first_browser               0.000000%
country_destination         0.000000%

2. The user_id column in the sessions data contains 0.326428% NaNs; these rows will be deleted when we use data from the sessions and users datasets together (or maybe we will find a way to fill these values).
3. Deciding what to do with the NaNs in the action_type and action_detail columns (delete them, or fill them with the average, the mode, or values that we can retrieve using simple ML models) requires additional exploration, but for now we delete these rows.
4. NaNs in secs_elapsed in the sessions data can be filled with the mean or the mode.
5. For NaNs in first_affiliate_tracked in the users data, it makes sense to delete the rows or fill them with the mode.
6. NaNs in date_first_booking require additional exploration; if that exploration gives no result, delete the whole column.
7. The same applies to the age column.
8. It is also worth noting that the sets of user IDs in the sessions dataset and the users dataset do not match; they only partially intersect. There are users whose IDs are in the sessions dataset but not in the users dataset, which is strange.
9. It is also worthwhile to look in more detail at the distribution of the number of unique values in each column of each dataset.
10. Add columns with the timestamps converted to datetime format.

In [27]:
sessions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10567737 entries, 0 to 10567736
Data columns (total 6 columns):
user_id          object
action           object
action_type      object
action_detail    object
device_type      object
secs_elapsed     float64
dtypes: float64(1), object(5)
memory usage: 483.8+ MB


In [28]:
train_users_2_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [29]:
train_users_2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB


In [30]:
sessions_df.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [31]:
sessions_df.count()

user_id          10533241
action           10488111
action_type       9441533
action_detail     9441533
device_type      10567737
secs_elapsed     10431706
dtype: int64

In [32]:
#train_users_2_df['timestamp_first_active_as_dt'] = train_users_2_df['timestamp_first_active'].astype(np.datetime64)

In [33]:
train_users_2_df.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [34]:
sessions_uniques_count = uniques_count(sessions_df)

for key, value in sessions_uniques_count.items():
    print("\n", key)
    print("-" * 20)
    print(value)

In [35]:
#isinstance(train_users_2_df.signup_method.dtype, np.object)
# NOTE: this evaluates to False even though the column's dtype is float64.
# `.dtype` is a numpy.dtype instance, not an instance of the np.float64 scalar
# type, so isinstance() cannot be used to test a column's dtype; compare the
# dtype with `==` instead (e.g. `dtype == np.float64`).
isinstance(train_users_2_df.age.dtype, np.float64)

False

In [36]:
train_users_2_df.age.dtype

dtype('float64')

In [37]:
# `np.object` was only an alias of the builtin `object` and is no longer
# available in current NumPy releases, so compare against `object` directly.
train_users_2_df.age.dtype == object

False

In [38]:
session_actions = sessions_df.action

In [39]:
sa_v_counts = session_actions.value_counts()

In [40]:
sa_v_counts.head()

show              2768278
index              843699
search_results     725226
personalize        706824
search             536057
Name: action, dtype: int64

In [41]:

# Histogram of the occurrence counts of the 100 most frequent actions:
# the x axis is "how many times an action occurs", the y axis is
# "how many actions fall into that range of occurrence counts".
fig = plt.figure(figsize=(24, 18))
ax = fig.add_subplot(111)
# value_counts() is sorted in descending order, so the first 100 positions are
# the most frequent actions; .iloc makes the positional slice explicit.
ax.hist(session_actions.value_counts().iloc[:100], bins=10)
ax.set_xlabel("occurrences per action")
ax.set_ylabel("number of actions")
ax.set_title("Occurrence counts of the 100 most frequent actions")
plt.show()

In [42]:
sa_v_counts.index

Index(['show', 'index', 'search_results', 'personalize', 'search',
       'ajax_refresh_subtotal', 'update', 'similar_listings',
       'social_connections', 'reviews',
       ...
       'update_message', 'wishlists', 'acculynk_bin_check_failed',
       'host_cancel', 'set_minimum_payout_amount', 'deauthorize', 'nyan',
       'desks', 'deactivated', 'deactivate'],
      dtype='object', length=359)

In [43]:
#train_users_2_df.timestamp_first_active.hist()

In [44]:
#del ax

In [45]:
#del fig

# Distribution of users over their first-activity timestamp.
# NOTE(review): the raw values look like YYYYMMDDhhmmss integers, so equal-width
# bins are not equal-length time intervals; consider converting the column to
# datetime before plotting.
fig2 = plt.figure(figsize=(24, 18))
ax2 = fig2.add_subplot(111)
ax2.hist(train_users_2_df.timestamp_first_active.values, bins=60)
ax2.set_xlabel("timestamp_first_active")
ax2.set_ylabel("number of users")
ax2.set_title("timestamp_first_active hist")

plt.show()

In [46]:
#del ax2
#del fig2

In [47]:
# Keep only the session rows that can be attributed to a user
# (rows whose user_id is missing are dropped).
cleaned_sessions_df = sessions_df.dropna(subset=['user_id'])

In [48]:
print(sessions_df.shape)

(10567737, 6)


In [49]:
print(cleaned_sessions_df.shape)

(10533241, 6)


In [50]:
sessions_df.shape[0] - cleaned_sessions_df.shape[0] == sessions_df['user_id'].isna().sum()

True

In [51]:
action_count = sessions_df['action'].groupby(by=sessions_df['action']).count()

In [52]:
action_count.mode()

0    2
dtype: int64

In [53]:
action_count.max()

2768278

In [54]:
action_count_sorted =action_count.sort_values()

In [55]:
action_count_sorted.tail()

action
search             536057
personalize        706824
search_results     725226
index              843699
show              2768278
Name: action, dtype: int64

In [56]:
cleaned_sessions_df.fillna?

In [57]:
# Fill missing actions with the most frequent action *name* (the mode of the
# column). 2768278 is the number of occurrences of that action, not an action
# value, so it must not be written into the column.
# Reassigning (instead of inplace=True) avoids mutating a frame derived from
# sessions_df in place.
most_frequent_action = cleaned_sessions_df['action'].mode().iloc[0]
cleaned_sessions_df = cleaned_sessions_df.fillna(value={'action': most_frequent_action})

In [58]:
any(cleaned_sessions_df['action'].isna())

False

Take a closer look at the rows containing NaNs in action_type and action_detail

In [59]:
act_det_nans_df = sessions_df[sessions_df['action_detail'].isna()]

In [60]:
act_det_nans_df.shape

(1126204, 6)

In [61]:
act_det_nans_df.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0


In [62]:
# Per-column value frequencies for the rows whose action_detail is NaN.
# action_type and action_detail are skipped.
vc_dict = {}
for column in act_det_nans_df.columns.drop(['action_type', 'action_detail']):
    print("\n\n", column)
    print("-" * 30)
    column_values = act_det_nans_df[column]
    values_count = column_values.groupby(by=column_values).count()
    vc_dict[column] = values_count
    print(values_count)



 user_id
------------------------------
user_id
00023iyk9l      3
0010k6l0om     15
001wyh0pz8      5
002qnbzfs5     77
0035hobuyj    171
00378ocvlh      4
00389675gq     14
003iamz20l      6
0048rkdgb1      1
005jc3nbmb      1
005v5uf4dh      1
0063bawn05      2
006ml14zc1      3
006mls2sjw     42
006t3vhawl      1
0075z9e9xv     79
009a40t3dk      2
00a8dbifj8     11
00an0o6c07      2
00b9hfwaak     10
00bn6hu437      1
00bowi9sn3      1
00bxmflswn      4
00cdlcsaxu      2
00cu8ilsh0     27
00dj8as8yl      2
00du2h6jcx      4
00e4nxhdvv     25
00ep2pem90      1
00epe0uaxo      6
             ... 
zzlwz6avyg      4
zzmiifrmxm      5
zzn5ittlxb    148
zznbshw5p7      2
zzngxvqfm5     14
zzoweugiba     13
zzoxgcuxxj      1
zzq3qpp9av      8
zzq55plahq      5
zzq90ckj2z      1
zzqb2sn066      1
zzr1rkdsh9     14
zzrnx9rqi5      8
zzsslqqkee      1
zzswqjsqy4      8
zzt5gpsyqa     11
zzt8w28nl5     47
zztkv1gkse      4
zzu3u71odh     20
zzunwylrfv     24
zzv6z6rjr9     13
zzv8sgicbk    

In [63]:
act_det_not_nans_df = sessions_df[sessions_df['action_detail'].notna()]

In [64]:
act_det_nans_values_count = uniques_count(act_det_nans_df)

In [65]:
act_det_not_nans_values_count = uniques_count(act_det_not_nans_df)

In [66]:
user_id_nans = act_det_nans_values_count['user_id']

In [67]:
user_id_not_nans = act_det_not_nans_values_count['user_id']

In [68]:
user_id_nans_set = set(user_id_nans.index)
user_id_not_nans_set = set(user_id_not_nans.index)

In [69]:
# Sizes of both user-id sets and of their overlap.
print(len(user_id_nans_set), len(user_id_not_nans_set), len(user_id_nans_set & user_id_not_nans_set))

101102 135478 101097


In [70]:
user_id_nans_only_set = user_id_nans_set.difference(user_id_not_nans_set)

In [71]:
len(user_id_nans_only_set)

5

In [72]:
user_id_nans_only_set

{'16o67suu17', '4lw10qzgsd', '4s2v2hmngj', 'lwomddykjk', 'qldsx3hzys'}

In [73]:
user_id_nans.index

Index(['00023iyk9l', '0010k6l0om', '001wyh0pz8', '002qnbzfs5', '0035hobuyj',
       '00378ocvlh', '00389675gq', '003iamz20l', '0048rkdgb1', '005jc3nbmb',
       ...
       'zzv6z6rjr9', 'zzv8sgicbk', 'zzvatt4dio', 'zzvg4emw5w', 'zzx0hfd74v',
       'zzx5wp3lqu', 'zzxox7jnrx', 'zzy7t0y9cm', 'zzywmcn0jv', 'zzzlylp57e'],
      dtype='object', name='user_id', length=101102)

In [74]:
for index in user_id_nans_only_set:
    print(user_id_nans[index])

1
1
1
1
1


In [75]:
user_id_nans['16o67suu17']

1

In [76]:
act_det_nans_action =  act_det_nans_values_count['action']

In [77]:
len(act_det_nans_action)

18

In [78]:
act_det_not_nans_action = act_det_not_nans_values_count['action']

In [79]:
len(act_det_not_nans_action)

347

In [80]:
act_det_nans_device_type = act_det_nans_values_count['device_type']
act_det_not_nans_device_type = act_det_not_nans_values_count['device_type']

In [81]:
print(len(act_det_nans_device_type))
print(len(act_det_not_nans_device_type))

14
14


In [82]:
for column in act_det_nans_values_count.keys():
    print("\n\n", column)
    print("-" * 30)
    print(len(act_det_nans_values_count[column]))
    print(len(act_det_not_nans_values_count[column]))



 user_id
------------------------------
101102
135478


 action
------------------------------
18
347


 action_type
------------------------------
0
10


 action_detail
------------------------------
0
155


 device_type
------------------------------
14
14


 secs_elapsed
------------------------------
66180
327145


In [83]:
action_act_det_nans_set = set(act_det_nans_values_count['action'].index)
action_act_det_not_nans_set = set(act_det_not_nans_values_count['action'].index)

In [84]:
len(action_act_det_nans_set.union(action_act_det_not_nans_set))

359

In [85]:
intersected_actions = action_act_det_nans_set.intersection(action_act_det_not_nans_set)
only_nans_actions = action_act_det_nans_set.difference(action_act_det_not_nans_set)

In [86]:
print(intersected_actions)
print(only_nans_actions)

{'index', 'show', 'currencies', 'campaigns', 'update', 'localization_settings'}
{'check', 'track_activity', 'widget', 'track_page_view', 'braintree_client_token', 'lookup', 'satisfy', 'uptodate', 'signed_out_modal', 'phone_verification', 'disaster_action', 'similar_listings_v2'}


In [87]:
#act_det_nans_action[act_det_nans_action.isin(intersected_actions)]

In [88]:
#act_det_nans_action[act_det_nans_action.isin(only_nans_actions)]

In [89]:
# Current pandas rejects a set as an indexer; a list keeps the same iteration order.
act_det_nans_action.loc[list(intersected_actions)]

action
index                     16733
show                     582144
currencies                  292
campaigns                104709
update                      225
localization_settings      5403
Name: action, dtype: int64

In [90]:
# Current pandas rejects a set as an indexer; a list keeps the same iteration order.
act_det_nans_action.loc[list(only_nans_actions)]

action
check                        120
track_activity                 6
widget                        75
track_page_view            81117
braintree_client_token       120
lookup                    162041
satisfy                        9
uptodate                    3342
signed_out_modal            1058
phone_verification            16
disaster_action                6
similar_listings_v2       168788
Name: action, dtype: int64

In [91]:
# Current pandas rejects a set as an indexer; a list keeps the same iteration order.
act_det_not_nans_action.loc[list(intersected_actions)]

action
index                     826966
show                     2186134
currencies                     5
campaigns                    319
update                    364905
localization_settings         11
Name: action, dtype: int64

In [92]:
#act_det_not_nans_action[only_nans_actions]

In [93]:
# For every action that occurs both with and without action_detail, show its
# count and its share (in percent) inside each of the two groups.
nans_total = act_det_nans_action.sum()
not_nans_total = act_det_not_nans_action.sum()
for act in intersected_actions:
    nans_hits = act_det_nans_action[act]
    not_nans_hits = act_det_not_nans_action[act]
    print("\n\n", act)
    print("*" * 30)
    print("act_det_nans_action[{}]:".format(act), nans_hits)
    print("percent:", nans_hits / nans_total * 100)
    print("act_det_not_nans_action[{}]:".format(act), not_nans_hits)
    print("percent:", not_nans_hits / not_nans_total * 100)



 index
******************************
act_det_nans_action[index]: 16733
percent: 1.4857876548120943
act_det_not_nans_action[index]: 826966
percent: 8.83330714564885


 show
******************************
act_det_nans_action[show]: 582144
percent: 51.69081267692176
act_det_not_nans_action[show]: 2186134
percent: 23.351374885480062


 currencies
******************************
act_det_nans_action[currencies]: 292
percent: 0.02592780704028755
act_det_not_nans_action[currencies]: 5
percent: 5.340792212526785e-05


 campaigns
******************************
act_det_nans_action[campaigns]: 104709
percent: 9.297516258155717
act_det_not_nans_action[campaigns]: 319
percent: 0.003407425431592089


 update
******************************
act_det_nans_action[update]: 225
percent: 0.019978618438577733
act_det_not_nans_action[update]: 364905
percent: 3.897763564624173


 localization_settings
******************************
act_det_nans_action[localization_settings]: 5403
percent: 0.4797532241050467
a

# Actions that occur only in rows with NaN action_detail: show each count and
# its share (in percent) of all such rows. These actions have no entry in
# act_det_not_nans_action, so there is nothing to compare against.
only_nans_total = act_det_nans_action.sum()
for act in only_nans_actions:
    only_nans_hits = act_det_nans_action[act]
    print("\n\n", act)
    print("*" * 30)
    print("act_det_nans_action[{}]:".format(act), only_nans_hits)
    print("percent:", only_nans_hits / only_nans_total * 100)

It makes sense to drop all rows whose 'action' value is in the only_nans_actions set.
For rows whose 'action' value is in the intersected_actions set, we can substitute the values in the 'action_type' and 'action_detail' columns with values from rows that have the same 'action' value (but no NaNs in 'action_type' and 'action_detail').

In [94]:
act_det_not_nans_action['localization_settings']

11

In [95]:
type(act_det_not_nans_action)

pandas.core.series.Series

In [96]:
act_det_not_nans_df[act_det_not_nans_df['action'] == 'localization_settings']

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
3421,toga865pvz,localization_settings,-unknown-,-unknown-,-unknown-,1137.0
9421,9svjpkjzmd,localization_settings,-unknown-,-unknown-,-unknown-,89697.0
9999,aafnnpqdu8,localization_settings,-unknown-,-unknown-,-unknown-,1751529.0
13636,2vxv32sfib,localization_settings,-unknown-,-unknown-,-unknown-,25784.0
32396,a1r3f08h08,localization_settings,-unknown-,-unknown-,-unknown-,986185.0
46035,kx3u4zmh5x,localization_settings,-unknown-,-unknown-,-unknown-,33515.0
60249,dfzyr7uhzj,localization_settings,-unknown-,-unknown-,-unknown-,203265.0
68376,zrsq9edttm,localization_settings,-unknown-,-unknown-,-unknown-,80486.0
95626,gggj3wmmwb,localization_settings,-unknown-,-unknown-,-unknown-,989683.0
96826,ovkov1dijr,localization_settings,-unknown-,-unknown-,-unknown-,117746.0


In [97]:
#act_det_not_nans_df[act_det_not_nans_df['action'] == 'show']

In [98]:
act_show_not_nans_df = act_det_not_nans_df[act_det_not_nans_df['action'] == 'show']

In [99]:
#act_show_not_nans_df['action_type'].groupby(by=act_show_not_nans_df['action_type']).count()

In [None]:
#act_show_not_nans_df['action_detail'].groupby(by=act_show_not_nans_df['action_detail']).count()

In [105]:
# For each action that appears both with and without action_detail, print how
# its rows WITH a non-null action_detail are distributed over action_type and
# over action_detail (absolute counts and proportions).
for act in intersected_actions:
    print("\n\n\naction value:", act)
    print("*" * 30)
    # Rows of the non-NaN subset that carry the current action.
    act_type_df = act_det_not_nans_df[act_det_not_nans_df['action'] == act]
    # Frequency of each action_type value among those rows.
    act_type = act_type_df['action_type'].groupby(by=act_type_df['action_type'])
    act_type_count = act_type.count()
    print("count:\n", act_type_count)
    # Share of each action_type value (counts divided by their total).
    print("proportion:\n", act_type_count / act_type_count.sum())
    print()
    # Same breakdown for action_detail.
    act_det = act_type_df['action_detail'].groupby(by=act_type_df['action_detail'])
    act_det_count = act_det.count()
    print("count:\n", act_det_count)
    print("proportion:\n", act_det_count / act_det_count.sum())




action value: index
******************************
count:
 action_type
-unknown-    112155
data          35112
view         679699
Name: action_type, dtype: int64
proportion:
 action_type
-unknown-    0.135622
data         0.042459
view         0.821919
Name: action_type, dtype: float64

count:
 action_detail
-unknown-                      112155
account_payment_methods          1005
homepage                           15
listing_descriptions             5735
message_inbox                   18718
message_thread                  67744
reservations                    32479
user_tax_forms                   2633
user_wishlists                  18357
view_ghosting_reasons              19
view_ghostings                     19
view_identity_verifications        38
view_locations                   7854
view_reservations                9460
view_resolutions                   42
view_search_results            519675
view_user_real_names               38
your_listings                   30980
Na