# Project 3: Predicting new Airbnb users' first destinations
# Part 2: SQL and EDA

## Using SQL to perform some EDA

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
from patsy import dmatrix
import datetime as dt
import numpy as np
import pickle

%matplotlib inline

from sqlalchemy import create_engine
cnx = create_engine('postgresql://emilygeller:p@54.173.47.58:5432/emilygeller')

In [2]:
pd.read_sql('select * from countries;', cnx)

Unnamed: 0,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
1,CA,62.393303,-96.818146,2828.1333,9984670.0,eng,0.0
2,DE,51.165707,10.452764,7879.568,357022.0,deu,72.61
3,ES,39.896027,-2.487694,7730.724,505370.0,spa,92.25
4,FR,46.232193,2.209667,7682.945,643801.0,fra,92.06
5,GB,54.63322,-3.432277,6883.659,243610.0,eng,0.0
6,IT,41.87399,12.564167,8636.631,301340.0,ita,89.4
7,NL,52.133057,5.29525,7524.3203,41543.0,nld,63.22
8,PT,39.553444,-7.839319,7355.2534,92090.0,por,95.45
9,US,36.966427,-95.84403,0.0,9826675.0,eng,0.0


In [3]:
pd.read_sql('select * from sessions limit 5;', cnx)

Unnamed: 0,id,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


## Constructing features from Sessions data
I'll use SQL to build them.

###  Total seconds on site per user

In [4]:
q = '''
select user_id, sum(secs_elapsed)
from sessions
group by user_id
limit 5;
'''

In [5]:
pd.read_sql(q,cnx)

Unnamed: 0,user_id,sum
0,00023iyk9l,867896.0
1,0010k6l0om,586543.0
2,001wyh0pz8,282965.0
3,0028jgx1x1,297010.0
4,002qnbzfs5,6487080.0


### Total sessions per user

In [6]:
q = '''
select user_id, count(1)
from sessions
group by user_id
limit 5;'''

In [7]:
pd.read_sql(q,cnx)

Unnamed: 0,user_id,count
0,00023iyk9l,40
1,0010k6l0om,63
2,001wyh0pz8,90
3,0028jgx1x1,31
4,002qnbzfs5,789


In [51]:
q = '''
select user_id, count(1)
from sessions
group by user_id
;'''

In [52]:
total_sessions = pd.read_sql(q,cnx)

### Total seconds on site per user per device

In [10]:
q = '''
select device_type, count(user_id)
from sessions
group by device_type
order by count desc;'''

In [11]:
pd.read_sql(q,cnx)

Unnamed: 0,device_type,count
0,Mac Desktop,3585886
1,Windows Desktop,2648521
2,iPhone,2096749
3,Android Phone,835991
4,iPad Tablet,681836
5,Android App Unknown Phone/Tablet,272820
6,-unknown-,210059
7,Tablet,139859
8,Linux Desktop,27968
9,Chromebook,22272


Let's only look at the total time per user for the top four devices: `Mac Desktop`, `Windows Desktop`, `iPhone`, `Android Phone`.

In [12]:
q = '''
select sessions.user_id, sessions2.total as Mac_Desktop, sessions3.total as iPhone, sessions4.total as Windows_Desktop, sessions5.total as Android_Phone
from sessions
full join (
    select user_id, device_type, sum(secs_elapsed) total
    from sessions
    where device_type = 'Mac Desktop'
    group by user_id, device_type
    limit 5) sessions2 on sessions.user_id = sessions2.user_id
full join (
    select user_id, device_type, sum(secs_elapsed) total
    from sessions 
    where device_type = 'iPhone'
    group by user_id, device_type
    limit 5) sessions3 on sessions2.user_id = sessions3.user_id
full join (
    select user_id, device_type, sum(secs_elapsed) total
    from sessions 
    where device_type = 'Windows Desktop'
    group by user_id, device_type
    limit 5) sessions4 on sessions3.user_id = sessions4.user_id
full join (
    select user_id, device_type, sum(secs_elapsed) total
    from sessions 
    where device_type = 'Android Phone'
    group by user_id, device_type
    limit 5) sessions5 on sessions4.user_id = sessions5.user_id    
;'''

In [13]:
pd.read_sql(q,cnx)

Unnamed: 0,user_id,mac_desktop,iphone,windows_desktop,android_phone
0,d1mm9tcy42,,,,
1,d1mm9tcy42,,,,
2,d1mm9tcy42,,,,
3,d1mm9tcy42,,,,
4,d1mm9tcy42,,,,
5,d1mm9tcy42,,,,
6,d1mm9tcy42,,,,
7,d1mm9tcy42,,,,
8,d1mm9tcy42,,,,
9,d1mm9tcy42,,,,


Verification:

In [31]:
q = '''
select user_id, device_type, sum(secs_elapsed)
from sessions
where user_id in ('0010k6l0om', '0035hobuyj', '00023iyk9l')
group by user_id, device_type;'''

In [32]:
pd.read_sql(q,cnx)

Unnamed: 0,user_id,device_type,sum
0,00023iyk9l,iPhone,572254.0
1,0010k6l0om,Mac Desktop,586543.0
2,00023iyk9l,Mac Desktop,295642.0
3,0035hobuyj,Mac Desktop,5724670.0


## Feature Engineering

In [53]:
sessions = pd.read_csv('sessions_cleaned.csv')

In [105]:
users = pd.read_csv('users_cleaned.csv')

In [106]:
users['date_account_created'] = pd.to_datetime(users['date_account_created'], infer_datetime_format = True)
users['timestamp_first_active'] = pd.to_datetime(users['timestamp_first_active'], format = '%Y-%m-%d %H:%M:%S')

In [107]:
users_dummied = pd.get_dummies(users[['gender','language','signup_method','signup_app','first_device_type','first_browser','affiliate_channel','affiliate_provider','first_affiliate_tracked']],drop_first = True)

Find the length of time between the account being created and the user being first active.

Positive -> user was active before they made their account

-1 -> First time the user was active was when they made their account

In [108]:
users_dummied['diff_account_to_first_active'] = (users['date_account_created']-users['timestamp_first_active']).dt.days

In [109]:

users_dummied['id'] = users['id']

Turn Target into binary classes (domestic v international)

In [110]:
country_destinations = pd.get_dummies(users['country_destination'])
y = country_destinations['US']

In [111]:
users_dummied

Unnamed: 0,gender_FEMALE,gender_MALE,gender_OTHER,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,...,affiliate_provider_yahoo,affiliate_provider_yandex,first_affiliate_tracked_local ops,first_affiliate_tracked_marketing,first_affiliate_tracked_omg,first_affiliate_tracked_product,first_affiliate_tracked_tracked-other,first_affiliate_tracked_untracked,diff_account_to_first_active,id
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,475,4ft3gnwmtx
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,764,bjjt8pjhuk
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,279,87mebub9p4
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,-1,osr2jwljor
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,-1,lsw9q7uk0j
5,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,-1,0d01nltbrs
6,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,-1,a1vcnhxeij
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,-1,6uh8zyj2gn
8,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,-1,yuuqmid2rp
9,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,-1,k6np330cm1


In [112]:
total_sessions.columns = ['id', 'count']

In [113]:
users_dummied = pd.merge(users_dummied, total_sessions, on='id')
users_dummied['age'] = users['age']

In [114]:
users = users_dummied.join(y)

In [115]:
users.age.isnull().value_counts()

False    21408
True      7366
Name: age, dtype: int64

In [117]:
with open('users.pkl', 'wb') as picklefile: # wb: write, binary
    pickle.dump(users, picklefile) #dump data into pickle file

In [116]:
users

Unnamed: 0,gender_FEMALE,gender_MALE,gender_OTHER,language_cs,language_da,language_de,language_el,language_en,language_es,language_fi,...,first_affiliate_tracked_marketing,first_affiliate_tracked_omg,first_affiliate_tracked_product,first_affiliate_tracked_tracked-other,first_affiliate_tracked_untracked,diff_account_to_first_active,id,count,age,US
0,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,-1,d1mm9tcy42,127,56.0,1
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,-1,4rvqpxoh3h,8,42.0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-1,xwxei6hdk4,7,41.0,1
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,-1,ro2stddszp,43,,1
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,-1,awiurksqr3,8,46.0,1
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,-1,ucgks2fyez,283,47.0,1
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-1,jrqykh9y8x,1169,50.0,1
7,0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,-1,s9xrwtyzsq,73,46.0,1
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-1,11581i5wng,20,36.0,1
9,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-1,oa8oz6sj6s,146,,0
