In [1]:
"""
Importing required tools for analysis

"""

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from datetime import timedelta
from operator import itemgetter

import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import f_oneway
from s2sphere import CellId, LatLng

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.grid_search import GridSearchCV
import xgboost as xgb


import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns

import tensorflow as tf

%matplotlib inline

# Global seaborn styling applied to every figure drawn after this cell:
# the "white" style with the major y-tick size overridden to 10.0, and the
# "poster" plotting context with fonts scaled by 1.1.
sns.set_style("white", {'ytick.major.size': 10.0})
sns.set_context("poster", font_scale=1.1)

In [2]:
"""
Loading the data and removing anomalies, duplicates etc. 
"""

def get_data(path="TH_data_challenge.tsv"):
    """Load the tab-separated challenge data and drop unusable rows.

    Parameters
    ----------
    path : str, optional
        Location of the TSV file. Defaults to the file name used throughout
        this notebook, so existing ``get_data()`` calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The file contents with exact duplicate rows removed and only rows
        whose ``m_effective_daily_price`` is strictly positive. The original
        row index is kept (it is not reset).
    """
    listings = pd.read_csv(path, sep='\t').drop_duplicates()
    # Rows with a zero/negative price (and rows where the price is missing,
    # since NaN > 0 is False) are filtered out.
    has_positive_price = listings['m_effective_daily_price'] > 0.0
    return listings[has_positive_price]


def get_na_distribution():
    """Print, per column, the percentage of rows in the cleaned data that are null."""
    frame = get_data()
    null_counts = frame.isnull().sum()
    pct_null = (null_counts / frame.shape[0]) * 100
    print(pct_null)

def print_columns():
    """Print the column names of the cleaned data as an array."""
    print(get_data().columns.values)

#print_columns()
#get_na_distribution()

In [3]:
'''
Feature Engineering
'''

def get_label_distribution():
    """Print summary statistics of `dim_is_requested` for each market.

    The markets are reported in the order Los Angeles, Paris, San Francisco.
    """
    frame = get_data()
    for market_name in ('Los Angeles', 'Paris', 'San Francisco'):
        in_market = frame['dim_market'] == market_name
        print(frame[in_market]['dim_is_requested'].describe())

    
def transform_date_to_day(date_col):
    """Return the weekday number (Monday=0 ... Sunday=6) of each date in `date_col`."""
    return pd.DatetimeIndex(date_col).weekday

def find_holiday_on_booking_date(date_col):
    """Flag which dates are US federal holidays in calendar year 2015.

    `date_col` is parsed with `pd.to_datetime`; the result is a boolean
    Series named ``is_holiday`` that keeps the index of the parsed input.
    Only holidays between 2015-01-01 and 2015-12-31 are considered, so dates
    in other years are never flagged.
    """
    year_span = pd.date_range(start='2015-01-01', end='2015-12-31')
    us_holidays = calendar().holidays(start=year_span.min(), end=year_span.max())
    frame = pd.DataFrame({'date': pd.to_datetime(pd.Series(date_col))})
    frame['is_holiday'] = frame['date'].isin(us_holidays)
    return frame['is_holiday']

def find_nearby_holiday_on_booking_date(date_col, window_days=2):
    """Flag dates on, or within `window_days` days of, a 2015 US federal holiday.

    Parameters
    ----------
    date_col : list-like or pandas.Series
        Dates (or date strings) parsed with ``pd.to_datetime``.
    window_days : int, optional
        Days to look both before and after each date (default 2, the window
        this notebook has always used).

    Returns
    -------
    pandas.Series
        Boolean Series named ``is_holiday_or_nearby`` sharing the index of
        the parsed input. Only holidays between 2015-01-01 and 2015-12-31
        are considered.

    Notes
    -----
    The look-back offsets are subtracted from the date. Previously the "pre"
    offsets were added, which made them duplicates of the "post" offsets, so
    a date one or two days *after* a holiday was never flagged.
    """
    year_span = pd.date_range(start='2015-01-01', end='2015-12-31')
    cal = calendar()
    holidays = cal.holidays(start=year_span.min(), end=year_span.max())

    dates = pd.to_datetime(pd.Series(date_col))
    is_nearby = dates.isin(holidays)
    for offset in range(1, window_days + 1):
        shift = timedelta(days=offset)
        # date + shift: a holiday follows the date; date - shift: one precedes it.
        is_nearby = is_nearby | (dates + shift).isin(holidays) | (dates - shift).isin(holidays)

    is_nearby.name = 'is_holiday_or_nearby'
    return is_nearby

def cluster_locality_and_value_data(data, n_clusters=4, random_state=0):
    """One-hot encode a KMeans cluster assignment built from location and value columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``dim_lat``, ``dim_lng``, ``m_effective_daily_price``,
        ``dim_person_capacity``, ``m_total_overall_rating``, ``m_reviews``
        and ``m_professional_pictures``. Missing values are replaced by 0
        before clustering.
    n_clusters : int, optional
        Number of KMeans clusters (default 4, the value used so far).
    random_state : int or None, optional
        Seed passed to KMeans so the cluster columns are reproducible between
        runs. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One indicator column per cluster id that occurs, indexed like `data`.
    """
    cluster_columns = ['dim_lat', 'dim_lng', 'm_effective_daily_price', 'dim_person_capacity',
                       'm_total_overall_rating', 'm_reviews', 'm_professional_pictures']
    cluster_input = data[cluster_columns].fillna(0).values.tolist()

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(cluster_input)
    labels = pd.Series(kmeans.predict(cluster_input), index=data.index)

    # Predicted labels are never missing, so no NaN indicator column is needed.
    return pd.get_dummies(labels)
    
    

def s2_cell_id_spatial_data(lat_lng_data):
    """Map each (lat, lng) pair to its S2 cell id and min-max scale the ids.

    Parameters
    ----------
    lat_lng_data : pandas.DataFrame
        Must contain ``dim_lat`` and ``dim_lng`` columns (passed to
        ``LatLng.from_degrees``).

    Returns
    -------
    pandas.DataFrame
        Single column ``s2_cell_id`` holding the min-max scaled cell ids,
        indexed like `lat_lng_data`.
    """
    latitudes = lat_lng_data['dim_lat'].values.tolist()
    longitudes = lat_lng_data['dim_lng'].values.tolist()
    cell_ids = [CellId.from_lat_lng(LatLng.from_degrees(lat, lng)).id()
                for lat, lng in zip(latitudes, longitudes)]

    # MinMaxScaler expects (n_samples, n_features); a bare 1-D list is
    # deprecated/rejected by newer scikit-learn, so pass an explicit column.
    cell_id_column = np.array(cell_ids, dtype=np.float64).reshape(-1, 1)
    scaled = preprocessing.MinMaxScaler().fit_transform(cell_id_column)
    return pd.DataFrame(scaled, columns=['s2_cell_id'], index=lat_lng_data.index)

def price_and_night_feature(data, markets=('Los Angeles', 'Paris', 'San Francisco')):
    """Build the price and booking-night feature block.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``dim_market``, ``m_effective_daily_price``,
        ``m_pricing_cleaning_fee``, ``price_booked_most_recent``,
        ``ds_night`` and ``ds``.
    markets : sequence of str, optional
        Markets for which the per-market normalized price is computed
        (defaults to the three markets used in this notebook). Rows from any
        other market get NaN in ``market_norm_price``.

    Returns
    -------
    pandas.DataFrame
        Indexed like `data`, with columns, in order:
        ``m_effective_daily_price``, ``m_pricing_cleaning_fee``,
        ``price_booked_most_recent`` (missing values filled with 0),
        ``market_norm_price`` ((price - market mean) / (market max - market min)),
        ``days_on_booking_night`` (weekday of ``ds_night``),
        ``curr_days`` (weekday of ``ds``) and
        ``is_holiday`` (1 if ``ds_night`` is a holiday, else 0).

    Notes
    -----
    A nearby-holiday flag is not part of the output; adding it would change
    the width of the feature matrix.
    """
    index = data.index
    price = data['m_effective_daily_price']

    # Mean-centre and range-scale the price separately inside each market.
    normalized_parts = []
    for market_name in markets:
        market_price = price[data['dim_market'] == market_name]
        price_range = market_price.max() - market_price.min()
        normalized_parts.append((market_price - market_price.mean()) / price_range)
    market_norm_price = pd.concat(normalized_parts)

    # Work on an explicit copy so the assignments below never write into a
    # slice of `data` (this is what raised SettingWithCopyWarning before).
    pn_feat = data[['m_effective_daily_price', 'm_pricing_cleaning_fee',
                    'price_booked_most_recent']].copy()
    pn_feat['price_booked_most_recent'] = pn_feat['price_booked_most_recent'].fillna(0)
    # Aligned on the row index, so the concat order of the markets is irrelevant.
    pn_feat['market_norm_price'] = market_norm_price

    days_on_booking_night = pd.Series(transform_date_to_day(data['ds_night']),
                                      index=index, name='days_on_booking_night')
    curr_days = pd.Series(transform_date_to_day(data['ds']),
                          index=index, name='curr_days')
    is_holiday = pd.Series(find_holiday_on_booking_date(data['ds_night']).astype(int),
                           index=index, name='is_holiday')

    return pd.concat([pn_feat, days_on_booking_night, curr_days, is_holiday], axis=1)
    
    
    

def listing_level_feature(data):
    """Assemble listing-level features.

    Selects the listing attribute columns and fills their missing values with
    the column mean, then appends one-hot `dim_room_type` columns and the
    scaled S2 cell id derived from `dim_lat` / `dim_lng`.
    """
    listing_columns = ['dim_lat', 'dim_lng', 'dim_person_capacity', 'dim_is_instant_bookable',
                       'm_checkouts', 'm_reviews', 'days_since_last_booking', 'cancel_policy',
                       'image_quality_score', 'm_total_overall_rating', 'm_professional_pictures',
                       'dim_has_wireless_internet']
    listing_values = data[listing_columns]
    listing_values = listing_values.fillna(listing_values.mean())

    room_type_dummies = pd.get_dummies(data['dim_room_type'])
    scaled_s2_cell = s2_cell_id_spatial_data(data[['dim_lat', 'dim_lng']])

    return pd.concat([listing_values, room_type_dummies, scaled_s2_cell], axis=1)


def occupancy_level_features(data):
    """Select the occupancy columns and fill missing values with each column's median.

    NOTE(review): `ds_night_day_of_week` is listed twice, so the result
    carries that column twice. It is kept as-is because dropping it would
    change the feature matrix width -- confirm whether a different column
    was intended.
    """
    occupancy_columns = [
        'ds_night_day_of_week', 'ds_night_day_of_week',
        'ds_checkin_gap', 'ds_checkout_gap',
        'occ_occupancy_plus_minus_7_ds_night', 'occ_occupancy_plus_minus_14_ds_night',
        'occ_occupancy_trailing_90_ds',
        'm_minimum_nights', 'm_maximum_nights',
    ]

    def _fill_with_median(column):
        return column.fillna(column.median())

    return data[occupancy_columns].apply(_fill_with_median, axis=0)
    

def demand_level_features(data):
    """Select the demand columns and fill missing values with each column's mean."""
    demand_columns = [
        'price_booked_most_recent',
        'p2_p3_click_through_score',
        'p3_inquiry_score',
        'listing_m_listing_views_2_6_ds_night_decay',
        'general_market_m_unique_searchers_0_6_ds_night',
        'general_market_m_contacts_0_6_ds_night',
        'general_market_m_reservation_requests_0_6_ds_night',
        'm_available_listings_ds_night',
    ]

    def _fill_with_mean(column):
        return column.fillna(column.mean())

    demand_values = data[demand_columns]
    return demand_values.apply(_fill_with_mean, axis=0)

def kdt_features(data):
    """Build the `kdt_*` feature block.

    The selected columns have missing values replaced by their column mean;
    the one-hot cluster columns from `cluster_locality_and_value_data` are
    appended to the right.
    """
    kdt_columns = [
        'kdt_score',
        'r_kdt_listing_views_0_6_avg_n100',
        'r_kdt_n_active_n100',
        'r_kdt_n_available_n100',
        'r_kdt_m_effective_daily_price_n100_p50',
        'r_kdt_m_effective_daily_price_available_n100_p50',
        'r_kdt_m_effective_daily_price_booked_n100_p50',
    ]

    def _fill_with_mean(column):
        return column.fillna(column.mean())

    kdt_values = data[kdt_columns].apply(_fill_with_mean, axis=0)
    cluster_dummies = cluster_locality_and_value_data(data)
    return pd.concat([kdt_values, cluster_dummies], axis=1)





In [8]:
'''
MLP Modelling (One of the models out of all the ones being tested)
'''

def multi_layer_perceptron(X,Y):
    """Train and evaluate a two-hidden-layer TensorFlow perceptron on (X, Y).

    Steps performed:
      * X and Y are converted to numpy arrays; Y is expanded to two columns
        ``[Y, 1 - Y]`` and the data is split 70/30 into train/test
        (no random_state, so the split differs between runs).
      * A network with two ReLU hidden layers of 70 units and a linear
        2-unit output is trained with Adam on softmax cross-entropy for
        500 epochs, printing the average cost after every epoch.
      * Test accuracy is printed (twice), the predicted class indices are
        stored in the module-level global ``result``, and
        ``printing_metrics`` is called on the test labels/predictions.

    Returns None.

    NOTE(review): assumes Y contains only 0/1 labels -- confirm. Under that
    assumption column 0 of the encoded target is the original label, so an
    argmax index of 0 means "label 1". The final ``not`` on column 0 converts
    the test labels to the same flipped encoding as the predictions, so the
    per-class metrics printed here are ordered opposite to the original labels.
    NOTE(review): the TensorFlow calls used here (``tf.placeholder``,
    ``tf.initialize_all_variables``, positional arguments to
    ``softmax_cross_entropy_with_logits``) look like an early TensorFlow
    API -- verify against the installed version.
    """

    X = np.array(X)
    Y = np.array(Y)
    # Two-column target: column 0 is Y itself, column 1 is 1 - Y.
    Y = np.array([Y, -(Y-1)]).T
    # From here on X / Y hold only the training portion.
    X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.30)

    # Parameters
    learning_rate = 0.0001
    training_epochs = 500
    batch_size = 500
    display_step = 1

    # Network Parameters
    n_hidden_1 = 70 # 1st layer number of features
    n_hidden_2 = 70 # 2nd layer number of features
    #n_hidden_3 = 50 # 3rd layer number of features (disabled)
    #n_hidden_4 = 20 # 4th layer number of features (disabled)
    n_input = len(X[0]) # Number of feature
    n_classes = 2 # Number of classes to predict

    # tf Graph input
    x = tf.placeholder("float", [None, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Create model
    # Returns the un-normalized output (logits) for input tensor `x`, using the
    # 'h1'/'h2'/'out' entries of `weights` and 'b1'/'b2'/'out' of `biases`.
    def multilayer_perceptron(x, weights, biases):
        # Hidden layer with RELU activation
        layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
        layer_1 = tf.nn.relu(layer_1)
        # Hidden layer with RELU activation
        layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
        layer_2 = tf.nn.relu(layer_2)

        # Hidden layer with RELU activation
        #layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
        #layer_3 = tf.nn.relu(layer_3)
        # Hidden layer with RELU activation
        #layer_4 = tf.add(tf.matmul(layer_3, weights['h4']), biases['b4'])
        #layer_4 = tf.nn.relu(layer_4)

        # Output layer with linear activation
        out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
        #out_layer = tf.matmul(layer_4, weights['out']) + biases['out']
        return out_layer


    # Store layers weight & bias
    # All weights and biases start from tf.random_normal draws.
    weights = {
        'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
        'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
        #'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
        #'h4': tf.Variable(tf.random_normal([n_hidden_3, n_hidden_4])),
        'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes])),
        #'out': tf.Variable(tf.random_normal([n_hidden_4, n_classes]))
    }
    biases = {
        'b1': tf.Variable(tf.random_normal([n_hidden_1])),
        'b2': tf.Variable(tf.random_normal([n_hidden_2])),
        #'b3': tf.Variable(tf.random_normal([n_hidden_3])),
        #'b4': tf.Variable(tf.random_normal([n_hidden_4])),
        'out': tf.Variable(tf.random_normal([n_classes]))
    }

    # Logits for the placeholder input.
    pred = multilayer_perceptron(x, weights, biases)

    # Mean softmax cross-entropy between the logits and the two-column target.
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y))

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    init = tf.initialize_all_variables()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost = 0.
            # NOTE(review): this is 0 when there are fewer than batch_size
            # training rows, and np.array_split with 0 sections would
            # presumably fail -- confirm the data is always large enough.
            total_batch = int(len(X)/batch_size)
            # The same batches, in the same order, are rebuilt every epoch
            # (no shuffling between epochs).
            X_batches = np.array_split(X, total_batch)
            Y_batches = np.array_split(Y, total_batch)
            # Loop over all batches
            for i in range(total_batch):
                batch_x, batch_y = X_batches[i], Y_batches[i]
                # batch_y.shape = (batch_y.shape[0], 1)
                # Run optimization op (backprop) and cost op (to get loss value)
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                              y: batch_y})
                # Compute average loss
                avg_cost += c / total_batch
            # Display logs per epoch step
            if epoch % display_step == 0:
                print "Epoch:", '%04d' % (epoch+1), "cost=", \
                    "{:.9f}".format(avg_cost)
        print "Optimization Finished!"

        # Fraction of test rows whose predicted class index matches the target's.
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print "Accuracy:", accuracy.eval({x: X_test, y: Y_test})
        # Side effect: predicted class indices are exposed via the global `result`.
        global result 
        result = tf.argmax(pred, 1).eval({x: X_test, y: Y_test})

        # Same accuracy op rebuilt and printed a second time.
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print(sess.run(accuracy, feed_dict={x: X_test, y: Y_test}))

        y_p = tf.argmax(pred, 1)
        val_accuracy, y_pred = sess.run([accuracy, y_p], feed_dict={x:X_test, y:Y_test})

        # Negate column 0 so the test labels use the same encoding as the
        # argmax indices in y_pred (index 1 <=> column 0 is not set).
        Y_test = [not i for i in Y_test[:,0]]    
        y_pred =  y_pred.tolist()

        printing_metrics(Y_test, y_pred)

In [12]:
'''
Model Building
'''

def get_label_and_features():
    """Load the data and build the model inputs.

    Returns a tuple ``(X, Y, features)``: the feature matrix as a numpy
    array, the integer `dim_is_requested` labels as a numpy array, and the
    column labels of the feature matrix. The shapes of X and Y are printed.
    """
    listings = get_data()
    # To model a single market, filter `listings` on 'dim_market' here
    # (e.g. 'Los Angeles', 'Paris' or 'San Francisco').
    market_dummies = pd.get_dummies(listings['dim_market'])

    listing_part = listing_level_feature(listings)
    occupancy_part = occupancy_level_features(listings)
    demand_part = demand_level_features(listings)
    kdt_part = kdt_features(listings)
    price_night_part = price_and_night_feature(listings)

    design = pd.concat(
        [price_night_part, listing_part, occupancy_part, demand_part, kdt_part, market_dummies],
        axis=1)

    features = design.columns
    X = design.values
    Y = listings['dim_is_requested'].astype(int).values
    print(X.shape)
    print(Y.shape)
    return X, Y, features

def printing_metrics(Y_test, Y_pred):
    """Print accuracy, then the per-class precision, recall and F1 arrays.

    With ``average=None`` each of the three arrays has one entry per class.
    """
    per_class_scores = precision_recall_fscore_support(
        Y_test, Y_pred,
        beta=1.0,
        labels=None,
        pos_label=1,
        average=None,
        warn_for=('precision', 'recall', 'f-score'),
        sample_weight=None)
    precision, recall, fscore = per_class_scores[0], per_class_scores[1], per_class_scores[2]

    print(accuracy_score(Y_test, Y_pred))
    for scores in (precision, recall, fscore):
        print(scores)


def gradient_boosting_model():
    """Return an unfitted GridSearchCV wrapping an XGBoost classifier.

    The grid holds a single combination (learning_rate 0.1, subsample 0.9)
    evaluated with 2-fold CV on accuracy; the remaining hyper-parameters are
    fixed on the classifier itself.
    """
    # An earlier, wider search varied max_depth over [1, 3, 5, 7, 9, 11] and
    # min_child_weight over [1, 3, 5]; the values below are the ones kept.
    fixed_params = {
        'n_estimators': 1000,
        'seed': 0,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'min_child_weight': 1,
        'max_depth': 9,
    }
    search_grid = {'learning_rate': [0.1], 'subsample': [0.9]}

    booster = xgb.XGBClassifier(**fixed_params)
    return GridSearchCV(booster, search_grid, scoring = 'accuracy', cv = 2, n_jobs = -1)


def plot_feature_importance(model, features, top_n=10):
    """Print every feature's importance and bar-plot the `top_n` largest.

    Parameters
    ----------
    model : fitted estimator
        Either exposes ``feature_importances_`` directly, or is a fitted
        search wrapper exposing ``best_estimator_`` (as the GridSearchCV
        returned by ``gradient_boosting_model`` does), in which case the best
        estimator's importances are used.
    features : sequence
        Feature labels, in the same order as the importances.
    top_n : int, optional
        Number of bars to draw (default 10).

    Notes
    -----
    Features are ranked as (label, importance) pairs rather than through a
    dict keyed by label, so features that share a label are all reported
    instead of silently overwriting one another. Ties keep their original
    feature order.
    """
    estimator = getattr(model, 'best_estimator_', model)
    ranked = sorted(zip(features, estimator.feature_importances_),
                    key=itemgetter(1), reverse=True)

    for label, importance in ranked:
        print("%s %s" % (label, importance))

    top_ranked = ranked[:top_n]
    positions = list(range(len(top_ranked)))
    pyplot.bar(positions, [importance for _, importance in top_ranked])
    pyplot.xticks(positions, [label for label, _ in top_ranked])
    pyplot.show()

    
def build_classifier():
    """Fit the gradient-boosting model on an 80/20 split and print test metrics."""
    feature_matrix, labels, _feature_labels = get_label_and_features()
    split = train_test_split(feature_matrix, labels, test_size=0.2, random_state=42)
    train_X, test_X, train_y, test_y = split

    # Alternatives tried in place of the model below: multi_layer_perceptron(X, Y),
    # LogisticRegression, SGDClassifier, SVC, LinearSVC, RandomForestClassifier.
    fitted = gradient_boosting_model().fit(train_X, train_y)

    printing_metrics(test_y, fitted.predict(test_X))
    

# Run the whole pipeline: load data, build features, fit the model and print
# the test-set metrics (the output is shown below this cell).
build_classifier()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(184086, 54)
(184086,)
0.889863653648
[ 0.90404258  0.85770084]
[ 0.93511172  0.79758957]
[ 0.91931472  0.82655374]
