In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Draw inline
%matplotlib inline

# Plot appearance: white background with longer major y ticks, and the
# large "poster" context with fonts scaled up by 10%.
sns.set_style(style="white", rc={"ytick.major.size": 10.0})
sns.set_context(context="poster", font_scale=1.1)



# Data Cleaning
## Fix Missing or Aberrant Values

1) Detect missing and aberrant values.
2) Replace/impute

Pieces borrowed from: https://www.kaggle.com/davidgasquez/airbnb-recruiting-new-user-bookings/user-data-exploration

In [61]:
# Read the train and test user tables from the local input/ directory
train_users, test_users = (
    pd.read_csv(csv_path)
    for csv_path in ('input/train_users_2.csv', 'input/test_users.csv')
)

In [8]:
# Report how many users each split contains, then the combined total
print("We have", len(train_users), "users in the training set and",
      len(test_users), "in the test set.")
print("In total we have", len(train_users) + len(test_users), "users.")

We have 213451 users in the training set and 62096 in the test set.
In total we have 275547 users.


In [54]:
# Number of training rows, kept so a combined frame can be split back later
piv_train = len(train_users)

In [9]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index
users = pd.concat([train_users, test_users], ignore_index=True)

# The 'id' column is kept: the drop below is commented out.
# (Original rationale: IDs are not needed when we are not making predictions.)
#users.drop('id',axis=1, inplace=True)

users.head()

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,-unknown-,en,Web,0,facebook,20090319043255
1,seo,google,38.0,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,en,Web,0,facebook,20090523174809
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,en,Web,3,basic,20090609231247
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,en,Web,0,facebook,20091031060129
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,-unknown-,en,Web,0,basic,20091208061105


In [12]:
## Replace all missing-value markers with NaN
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on `users[col]`: an in-place call on a column
# selection is chained mutation, which is not guaranteed to write through to
# `users` (it does not under pandas Copy-on-Write).
users['gender'] = users['gender'].replace('-unknown-', np.nan)
users['first_affiliate_tracked'] = users['first_affiliate_tracked'].replace('', np.nan)


## How much is missing?

In [52]:
## Examine how much data is missing: percentage of null cells per column

users_nan = users.isna().sum().div(users.shape[0]).mul(100)
# Keep only columns with at least one missing value; the target column
# 'country_destination' is left out of the report.
users_nan.loc[users_nan > 0].drop('country_destination')

date_first_booking    67.733998
dtype: float64

In [35]:
## Is newer data better? -- NO
# Treat the '-unknown-' gender marker as missing before counting. The result
# is assigned back to the column rather than using replace(..., inplace=True)
# on a column selection, which is not guaranteed to modify `test_users`
# (it does not under pandas Copy-on-Write).
test_users['gender'] = test_users['gender'].replace('-unknown-', np.nan)
# Percentage of missing values per column in the test set
users_nan = (test_users.isnull().sum() / test_users.shape[0]) * 100
users_nan[users_nan > 0]

date_first_booking         100.000000
gender                      54.418964
age                         46.502190
first_affiliate_tracked      0.032208
dtype: float64

In [37]:
# Gender distribution in the test users; dropna=False keeps the missing
# values as their own NaN row in the counts.
test_users['gender'].value_counts(dropna=False)

NaN       33792
FEMALE    14483
MALE      13769
OTHER        52
Name: gender, dtype: int64

Quite a lot is missing in age and gender.
Some values are missing in first_affiliate_tracked.

### Age

In [19]:
# Summary statistics of the age column.
# NOTE(review): this cell's execution count (19) is later than that of the
# out-of-range filter cell that follows (18), so the summary shown appears to
# describe ages after values outside 13-95 were already set to NaN.
users['age'].describe()

count    155825.000000
mean         36.012418
std          11.553568
min          15.000000
25%          28.000000
50%          33.000000
75%          42.000000
max          95.000000
Name: age, dtype: float64

In [18]:
# Ages below 13 or above 95 are treated as invalid and blanked out to NaN
users.loc[(users.age < 13) | (users.age > 95), 'age'] = np.nan

### Gender

In [30]:
# Count each gender value; dropna=False includes the missing (NaN) entries.
users['gender'].value_counts(dropna=False)

NaN       129480
FEMALE     77524
MALE       68209
OTHER        334
dtype: int64

### First Affiliate Tracked

In [40]:
# Count each first_affiliate_tracked value; dropna=False includes the
# missing (NaN) entries.
users['first_affiliate_tracked'].value_counts(dropna=False)

untracked        143181
linked            62064
omg               54859
tracked-other      6655
NaN                6085
product            2353
marketing           281
local ops            69
dtype: int64

### Categorical Features

# IMPUTATION / REPLACING VALUES

### Age

In [43]:
# Missing ages get the sentinel -1. The filled column is assigned back rather
# than calling fillna(inplace=True) on a column selection, which is not
# guaranteed to update `users` (it does not under pandas Copy-on-Write).
users['age'] = users['age'].fillna(-1)

### Gender

In [48]:
# Missing genders get the explicit label 'MISSING'.
# - Cast to category here so the cell does not rely on another cell having
#   done it first (a no-op if the column is already categorical).
# - Register the new label only when it is absent, so re-running the cell
#   does not fail on a duplicate category.
# - add_categories returns a new Series; its old `inplace` option is not
#   available in current pandas, so results are assigned back.
users['gender'] = users['gender'].astype('category')
if 'MISSING' not in users['gender'].cat.categories:
    users['gender'] = users['gender'].cat.add_categories('MISSING')
users['gender'] = users['gender'].fillna('MISSING')

### First Affiliate Tracked

In [51]:
# Missing values are folded into the existing 'untracked' label. The filled
# column is assigned back rather than using fillna(inplace=True) on a column
# selection, which is not guaranteed to update `users`.
users['first_affiliate_tracked'] = users['first_affiliate_tracked'].fillna('untracked')

# COMBINED PROCESSING

In [63]:
## AGE

# Out-of-range and missing ages share the sentinel -1. Only the age column is
# filled: a frame-wide fillna(-1) would also write -1 into every other column
# (e.g. date_first_booking) and leave nothing for the fills below to do.
train_users.loc[(train_users['age'] > 95) | (train_users['age'] < 13), 'age'] = -1
train_users['age'] = train_users['age'].fillna(-1)

## GENDER
# Both real NaN values and the raw '-unknown-' marker become 'MISSING'
train_users['gender'] = (
    train_users['gender'].replace('-unknown-', 'MISSING').fillna('MISSING')
)

## FIRST AFFILIATE TRACKED
train_users['first_affiliate_tracked'] = (
    train_users['first_affiliate_tracked'].fillna('untracked')
)

In [60]:
def preprocess_users(df):
    """Impute missing and out-of-range user attributes in place.

    Each column receives its own placeholder, so a fill meant for one column
    cannot leak into unrelated ones (for example ``date_first_booking``
    keeps its NaN values):

    * ``age``: values above 95, below 13, or missing become -1.
    * ``gender``: missing values and the raw ``'-unknown-'`` marker become
      ``'MISSING'``.
    * ``first_affiliate_tracked``: missing values become ``'untracked'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``age``, ``gender`` and ``first_affiliate_tracked``
        columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (not a copy).
    """
    ## AGE
    # NaN compares False on both sides, so missing ages are handled by the
    # fillna on the next line rather than by the range mask.
    df.loc[(df['age'] > 95) | (df['age'] < 13), 'age'] = -1
    df['age'] = df['age'].fillna(-1)

    ## GENDER
    df['gender'] = df['gender'].replace('-unknown-', 'MISSING').fillna('MISSING')

    ## FIRST AFFILIATE TRACKED
    df['first_affiliate_tracked'] = df['first_affiliate_tracked'].fillna('untracked')

    return df


In [62]:
# preprocess_users modifies the frame it is given and returns that same
# object, so A is another reference to train_users rather than a copy.
A = preprocess_users(train_users)

In [64]:
# NOTE(review): A was returned by preprocess_users(train_users), which hands
# back the object it was given, so this compares train_users with itself and
# does not independently validate the preprocessing.
A.equals(train_users)

True

In [53]:
# Preview the first 100 rows of the combined users frame
users.head(100)

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,-1,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,MISSING,en,Web,0,facebook,20090319043255
1,seo,google,38,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,en,Web,0,facebook,20090523174809
2,direct,direct,56,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,en,Web,3,basic,20090609231247
3,direct,direct,42,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,en,Web,0,facebook,20091031060129
4,direct,direct,41,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,MISSING,en,Web,0,basic,20091208061105
5,other,other,-1,US,2010-01-01,2010-01-02,omg,Chrome,Mac Desktop,MISSING,en,Web,0,basic,20100101215619
6,other,craigslist,46,US,2010-01-02,2010-01-05,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100102012558
7,direct,direct,47,US,2010-01-03,2010-01-13,omg,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100103191905
8,other,craigslist,50,US,2010-01-04,2010-07-29,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100104004211
9,other,craigslist,46,US,2010-01-04,2010-01-04,omg,Firefox,Mac Desktop,MISSING,en,Web,0,basic,20100104023758


In [65]:
import preprocess_clean

In [66]:
# Evaluating the bare name only displays the function object; it does not
# call preprocess_users.
preprocess_users

<function __main__.preprocess_users>

In [20]:
## Encode categorical features with the pandas 'category' dtype

categorical_features = [
    # acquisition / marketing
    'affiliate_channel',
    'affiliate_provider',
    # prediction target
    'country_destination',
    'first_affiliate_tracked',
    # client environment
    'first_browser',
    'first_device_type',
    # user profile
    'gender',
    'language',
    # signup details ('signup_flow' is cast to category as well)
    'signup_app',
    'signup_flow',
    'signup_method',
]

# Convert one column at a time, overwriting each column with its
# categorical version.
for categorical_feature in categorical_features:
    users[categorical_feature] = users[categorical_feature].astype('category')

In [21]:
# Bare name as the cell's last expression: displays the combined users frame
users

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,language,signup_app,signup_flow,signup_method,timestamp_first_active
0,direct,direct,,NDF,2010-06-28,,untracked,Chrome,Mac Desktop,,en,Web,0,facebook,20090319043255
1,seo,google,38,NDF,2011-05-25,,untracked,Chrome,Mac Desktop,MALE,en,Web,0,facebook,20090523174809
2,direct,direct,56,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,en,Web,3,basic,20090609231247
3,direct,direct,42,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,en,Web,0,facebook,20091031060129
4,direct,direct,41,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,,en,Web,0,basic,20091208061105
5,other,other,,US,2010-01-01,2010-01-02,omg,Chrome,Mac Desktop,,en,Web,0,basic,20100101215619
6,other,craigslist,46,US,2010-01-02,2010-01-05,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100102012558
7,direct,direct,47,US,2010-01-03,2010-01-13,omg,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100103191905
8,other,craigslist,50,US,2010-01-04,2010-07-29,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,20100104004211
9,other,craigslist,46,US,2010-01-04,2010-01-04,omg,Firefox,Mac Desktop,,en,Web,0,basic,20100104023758
