In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

### 1. Load Data

In [2]:
# Read the raw text file into a list with one entry per line
# (presumably one search URL per line -- each entry is later passed to parse_search).
with open('/Users/ChunyanHao/desktop/github/ds_take_home/data/url_list.txt') as f:
    lines = list(f)

print('Total Records:\t', len(lines))

Total Records:	 77677


In [3]:
def parse_search(lines, prefix_len=50):
    """Parse raw search URLs into a DataFrame with one row per input line.

    Every line is treated as a fixed-length prefix followed by a query string
    made of ``&``-separated ``<namespace>.<field>=<value>`` items. The field
    name (the part after the first dot) selects the output column. A field
    that appears several times on one line has its values joined with
    ``', '``. Fields that do not appear on a line stay NaN.

    Parameters
    ----------
    lines : sequence of str
        Raw lines to parse; must support ``len()``.
    prefix_len : int, default 50
        Number of leading characters dropped from every line before the
        query string is parsed.

    Returns
    -------
    pandas.DataFrame
        One row per input line. ``checkin`` and ``checkout`` are converted
        with ``pd.to_datetime``; the two price-filter columns are renamed to
        ``MinPrice`` and ``MaxPrice``; every other value is kept as the raw
        string found in the line.

    Raises
    ------
    KeyError
        If a query item names a field that is not one of the known columns.
    """
    names = ['checkin', 'checkout', 'customMinimumPriceFilter',
             'customMaximumPriceFilter', 'freeCancellation', 'stars_5',
             'stars_4', 'stars_3', 'stars_2', 'stars_1', 'max_score',
             'min_score', 'couponCode', 'adults', 'city', 'children',
             'amenities', 'search_page']

    # One independent NaN-filled list per column, so absent fields remain NaN.
    maps = {name: [np.nan] * len(lines) for name in names}

    for i, line in enumerate(lines):
        visited = set()
        for item in line[prefix_len:].strip().split('&'):
            item = item.strip()
            if not item:
                # Blank or too-short lines and stray '&' yield empty items;
                # skipping them leaves the row as NaN instead of crashing.
                continue

            # Split on the first '=' only, so a value may itself contain '='.
            key, _, value = item.partition('=')

            # Keep the field part of '<namespace>.<field>'; a key without a
            # namespace is used as-is rather than raising IndexError.
            parts = key.strip().split('.')
            key = parts[1] if len(parts) > 1 else parts[0]

            if key == 'city':
                # Spaces in city names arrive encoded as '+'.
                value = value.strip().replace('+', ' ')

            if key not in visited:
                maps[key][i] = value
                visited.add(key)
            else:
                # Repeated field on the same line: accumulate all values.
                maps[key][i] = maps[key][i] + ', ' + value

    # transform into DataFrame
    df = pd.DataFrame(maps, columns=names)
    df['checkin'] = pd.to_datetime(df['checkin'])
    df['checkout'] = pd.to_datetime(df['checkout'])
    df = df.rename(columns={'customMinimumPriceFilter': 'MinPrice',
                            'customMaximumPriceFilter': 'MaxPrice'})

    return df

In [4]:
# Turn every raw line into one row of search filters, then preview the first rows.
data = parse_search(lines)
data.head(n=3)

Unnamed: 0,checkin,checkout,MinPrice,MaxPrice,freeCancellation,stars_5,stars_4,stars_3,stars_2,stars_1,max_score,min_score,couponCode,adults,city,children,amenities,search_page
0,2015-09-19,2015-09-20,,,,,yes,,,,,4,,3,"New York, NY, United States",,,1
1,2015-09-14,2015-09-15,,,,,,yes,,,,4,,3,"London, United Kingdom",,,1
2,2015-09-26,2015-09-27,,175.0,,,yes,,,,,5,,2,"New York, NY, United States",,,1


### 2. For each search query, how many amenities were selected?

In [5]:
# Frequency of each distinct amenity selection; searches without an amenity
# filter are NaN and are therefore excluded by value_counts().
data['amenities'].value_counts()

internet                272
yes_smoking             170
shuttle                 111
yes_pet                  85
breakfast                39
lounge                   22
yes_smoking, yes_pet      4
breakfast, yes_pet        1
Name: amenities, dtype: int64

In [6]:
# Number of amenities selected per search: a missing selection counts as 0,
# otherwise count the entries of the ', '-joined string built by parse_search.
(
    data['amenities']
    .map(lambda selected: len(selected.split(', ')) if not pd.isnull(selected) else 0)
    .value_counts()
)

0    76973
1      699
2        5
Name: amenities, dtype: int64

### 3. Which city has the worst search algorithm?

Often, to measure the quality of a search algorithm, data scientists use some metric based on how often users click on the second page, third page, and so on. The idea here is that a great search algorithm should return all interesting results on the first page and never force users to visit the other pages (how often do you click on the second page results when you search on Google? Almost never, right?).

Create a metric based on the above idea and find the city with the worst search algorithm.


In [7]:
# Percentage of all searches that landed on each result page.
data['search_page'].value_counts().mul(100).div(len(data))

1     64.369118
2     14.981269
3      7.549210
4      4.679635
5      3.118040
6      2.106158
7      1.434144
8      0.952663
9      0.561299
10     0.248465
Name: search_page, dtype: float64

For each city, we can calculate the share of searches with page = 1 as a simple metric (a higher share means users more often find what they need on the first page):

In [8]:
# Flag searches on the first result page; search_page holds the raw string
# taken from the URL, hence the comparison against '1' rather than 1.
data['page_1'] = data['search_page'].eq('1')

In [9]:
# Mean of the boolean flag = fraction of each city's searches that are on page 1.
data.groupby('city')['page_1'].mean()

city
Hong Kong, Hong Kong                        0.910826
London, United Kingdom                      0.526588
New York, NY, United States                 0.557616
San Francisco, California, United States    0.959285
Name: page_1, dtype: float64

It's clear that 'London, United Kingdom' has the lowest share of searches with page = 1 (about 52.7%), so by this metric it has the worst search algorithm.