In [11]:
# Check your environment: print the path of the Python interpreter running this kernel.
import sys
print(sys.executable)

/Users/abhati/anaconda/envs/krazyglue/bin/python


In [9]:
class Config:
    """Paths and shared constants used by every stage of the notebook."""

    # Input directory holding the raw "part*" files.
    DATA_DIR = "./dataset/testdata/1monthdata"

    # All generated artifacts live under ANALYSIS_PATH.
    ANALYSIS_PATH = "./dataset"
    DESTINATION_PROPENSITY_PATH = ANALYSIS_PATH + "/destination_propensity.csv"
    ORIGIN_DESTINATION_PROPENSITY_PATH = ANALYSIS_PATH + "/origin_destination_propensity.csv"
    FEATURE_VECTOR_PATH = ANALYSIS_PATH + "/feature_vector.csv"
    SERIALIZED_MODEL_PATH = ANALYSIS_PATH + "/model.pmml"
    HOTEL_CONF_TRIPS_PATH = ANALYSIS_PATH + "/hotel-attach.csv"

    # Single-letter codes as they appear in the transaction records.
    HOTEL_LOB = "H"
    CAR_LOB = "C"
    FLIGHT_LOB = "F"
    PHONE = "P"

    @staticmethod
    def read_data_idx():
        """Return a new dict mapping each transaction field name to its position.

        Returns:
            dict: field name -> zero-based index inside a transaction record.
        """
        # Order matters: the position in this tuple is the field's index.
        field_names = (
            "pst_dtm",
            "begin_use_date_key",
            "end_use_date_key",
            "trl",
            "lob",
            "international",
            "site_platform",
            "purch_lodg_property_key",
            "origin_TLA",
            "destination_TLA",
            "air_trip_type",
            "adults",
            "local_date",
            "expuserid",
            "hotel_TLA",
        )
        return dict((name, position) for position, name in enumerate(field_names))

In [10]:
import csv
import json
import os

'''
Print basic statistics about the data such as
how many users
how many transactions
users with trips
users with flight attaches
avg transactions per user
avg hotel transactions per user
avg car transactions per user
'''


class DataAnalysis:
    """Collect and print basic booking statistics from the raw user records.

    Every method is a classmethod and the running totals are stored on the
    class itself, so the totals are shared by all instances.
    `analyze_data` zeroes them before it starts so that re-running the cell
    does not double count.

    Each input line is a JSON object with a "history" list and a "trips" list;
    a trip is itself a list of transactions, and a transaction is indexed by
    position using `data_idx_dict`.
    """

    __users__ = 0
    __trans__ = 0
    __trips__ = 0
    __flight_trans__ = 0
    __hotel_trans__ = 0
    __car_trans__ = 0
    __flight_attach_trips__ = 0
    __flight_hotel_trips__ = 0
    __flight_car_trips__ = 0
    __hotel_hotel_trips__ = 0
    __hotel_car_trips__ = 0
    __hotel_attach_trips__ = 0

    # Names of every running total above; used to zero them before a new run.
    _COUNTER_NAMES = (
        "__users__",
        "__trans__",
        "__trips__",
        "__flight_trans__",
        "__hotel_trans__",
        "__car_trans__",
        "__flight_attach_trips__",
        "__flight_hotel_trips__",
        "__flight_car_trips__",
        "__hotel_hotel_trips__",
        "__hotel_car_trips__",
        "__hotel_attach_trips__",
    )

    HOTEL_LOB = "H"
    FLIGHT_LOB = "F"
    CAR_LOB = "C"
    data_idx_dict = Config.read_data_idx()

    def __init__(self):
        pass

    @classmethod
    def _reset_counts(cls):
        """Set every running total back to zero."""
        for counter_name in cls._COUNTER_NAMES:
            setattr(cls, counter_name, 0)

    @classmethod
    def get_transaction_cnt(cls, history, lob):
        """Return how many transactions in `history` belong to line of business `lob`."""
        lob_idx = cls.data_idx_dict["lob"]
        return sum(1 for transaction in history if transaction[lob_idx] == lob)

    @classmethod
    def get_flight_attach_trip_cnt(cls, trips):
        """Update the flight-attach totals from `trips`.

        Only trips with more than one transaction are considered. A trip with
        at least one flight counts as a flight attach trip, and additionally
        as F-H and/or F-C when it also holds a hotel and/or a car.
        """
        for trip in trips:
            if len(trip) <= 1:
                continue
            if cls.get_transaction_cnt(trip, cls.FLIGHT_LOB) == 0:
                continue
            cls.__flight_attach_trips__ += 1
            if cls.get_transaction_cnt(trip, cls.HOTEL_LOB) > 0:
                cls.__flight_hotel_trips__ += 1
            if cls.get_transaction_cnt(trip, cls.CAR_LOB) > 0:
                cls.__flight_car_trips__ += 1

    @classmethod
    def get_hotel_attach_trip_cnt(cls, trips, writer):
        """Update the hotel-attach totals from `trips` and dump matching trips.

        Args:
            trips: list of trips (each a list of transactions).
            writer: csv writer receiving one row per H-H match and one row per
                H-C match. A trip that is both H-H and H-C is therefore
                written twice, which keeps the row count equal to
                H-H trips + H-C trips.
        """
        for trip in trips:
            if len(trip) <= 1:
                continue
            hotel_cnt = cls.get_transaction_cnt(trip, cls.HOTEL_LOB)
            if hotel_cnt == 0:
                continue
            cls.__hotel_attach_trips__ += 1
            if hotel_cnt > 1:
                cls.__hotel_hotel_trips__ += 1
                writer.writerow(trip)
            if cls.get_transaction_cnt(trip, cls.CAR_LOB) > 0:
                cls.__hotel_car_trips__ += 1
                writer.writerow(trip)

    @classmethod
    def print_counts(cls):
        """Print each total as a label line followed by a value line."""
        # NOTE: __hotel_attach_trips__ is tracked but is not part of the report.
        report = (
            ("Users", cls.__users__),
            ("Transactions", cls.__trans__),
            ("Trips", cls.__trips__),
            ("flight transactions", cls.__flight_trans__),
            ("hotel transactions", cls.__hotel_trans__),
            ("car transactions", cls.__car_trans__),
            ("flight attach trips", cls.__flight_attach_trips__),
            ("F-H trips", cls.__flight_hotel_trips__),
            ("F-C trips", cls.__flight_car_trips__),
            ("H-H trips", cls.__hotel_hotel_trips__),
            ("H-C trips", cls.__hotel_car_trips__),
        )
        for label, value in report:
            print(label)
            print(value)

    @classmethod
    def analyze_data(cls, data_dir):
        """Walk `data_dir`, accumulate statistics from every "part*" file and print them.

        Hotel attach trips are written to Config.HOTEL_CONF_TRIPS_PATH; the
        number of rows written is printed before the totals.

        Args:
            data_dir: directory searched recursively for files whose name
                starts with "part", each holding one JSON user record per line.
        """
        # Start from zero so a second run in the same kernel is not cumulative.
        cls._reset_counts()
        path = Config.HOTEL_CONF_TRIPS_PATH

        # 'wb' is the mode the Python 2 csv module expects for writing.
        with open(path, 'wb') as csv_file:
            writer = csv.writer(csv_file)

            for root, subdirs, filenames in os.walk(data_dir):
                for filename in filenames:
                    if not filename.startswith("part"):
                        continue
                    print(filename)
                    # Join with `root`, not `data_dir`: os.walk also yields
                    # files that live in sub-directories.
                    with open(os.path.join(root, filename), 'r') as handle:
                        for line in handle:
                            user_record = json.loads(line)
                            history = user_record["history"]
                            trips = user_record["trips"]

                            cls.__users__ += 1
                            cls.__trans__ += len(history)
                            cls.__trips__ += len(trips)
                            cls.__flight_trans__ += cls.get_transaction_cnt(history, cls.FLIGHT_LOB)
                            cls.__hotel_trans__ += cls.get_transaction_cnt(history, cls.HOTEL_LOB)
                            cls.__car_trans__ += cls.get_transaction_cnt(history, cls.CAR_LOB)
                            cls.get_flight_attach_trip_cnt(trips)
                            cls.get_hotel_attach_trip_cnt(trips, writer)

        # Count the rows that were just written without keeping them in memory.
        with open(path, "r") as f:
            row_count = sum(1 for _ in csv.reader(f, delimiter=","))
        print(row_count)
        cls.print_counts()

In [11]:
import csv
from datetime import datetime

'''
This class generates feature vectors from user data.
It uses previously calculated files for some features and uses flight
transaction data to predict the next one.
'''


class FeatureCalculator:
    """Helpers that append individual features to a feature vector (a list).

    All methods are classmethods. `destination_bookings` is a class-level
    dict filled by `load_destination_propensity` and read by
    `add_destination_propensity`. Transactions are indexed by position using
    `data_idx_dict`.
    """
    data_idx_dict = Config.read_data_idx()
    destination_bookings = {}
    origin_destination_bookings = {}
    HOTEL_LOB = "H"
    CAR_LOB = "C"
    FLIGHT_LOB = "F"
    PHONE = "P"

    def __init__(self):
        pass

    @classmethod
    def load_destination_propensity(cls, destination_counts_file):
        """Load per-destination hotel and car propensities into `destination_bookings`.

        Each row is expected to be: destination TLA, hotel propensity, car
        propensity, followed by any further columns (ignored).

        The file written by OfflineData has no header row, so the first row
        is only skipped when its propensity columns are not numeric; skipping
        it unconditionally would drop the first destination. Blank rows are
        ignored.

        Args:
            destination_counts_file: path of the CSV file to read.

        Raises:
            ValueError: if a row other than the first holds a non-numeric propensity.
        """
        # 'rb' is the mode the Python 2 csv module expects for reading.
        with open(destination_counts_file, 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',')
            for row_number, row in enumerate(reader):
                if not row:
                    continue
                try:
                    hotel_propensity = float(row[1])
                    car_propensity = float(row[2])
                except ValueError:
                    if row_number == 0:
                        # Non-numeric first row: treat it as a header.
                        continue
                    raise

                cls.destination_bookings[row[0]] = {cls.HOTEL_LOB: hotel_propensity,
                                                    cls.CAR_LOB: car_propensity}

    @classmethod
    def add_destination_propensity(cls, feature_vector, destination_tla):
        """Append [car propensity, hotel propensity] for `destination_tla`.

        Unknown destinations contribute [0.0, 0.0].
        """
        propensity = cls.destination_bookings.get(destination_tla)
        if propensity is None:
            feature_vector.extend([0.0, 0.0])
        else:
            feature_vector.extend([propensity[cls.CAR_LOB], propensity[cls.HOTEL_LOB]])

    @staticmethod
    def _days_or_minus_one(start, end):
        """Return the whole days from `start` to `end`, or -1 when `end` is earlier."""
        difference = (end - start).days
        # A negative span means a wrong entry: the end date precedes the start date.
        return difference if difference >= 0 else -1

    @classmethod
    def advanced_purchase_window(cls, purchase_date, stay_start_date):
        """Return the days between purchase and start of stay, or -1 if negative.

        Args:
            purchase_date: timestamp string in "%Y-%m-%d %H:%M:%S.%f" format.
            stay_start_date: date string in "%Y-%m-%d" format.
        """
        # Only the calendar date of the purchase matters, not the time of day.
        start = datetime.strptime(purchase_date, "%Y-%m-%d %H:%M:%S.%f").date()
        end = datetime.strptime(stay_start_date, "%Y-%m-%d").date()
        return cls._days_or_minus_one(start, end)

    @classmethod
    def trip_length(cls, beginDate, endDate):
        """Return the days between two "%Y-%m-%d" date strings, or -1 if negative."""
        start = datetime.strptime(beginDate, "%Y-%m-%d")
        end = datetime.strptime(endDate, "%Y-%m-%d")
        return cls._days_or_minus_one(start, end)

    @classmethod
    def add_los(cls, feature_vector, transaction):
        """Append the length of stay (in days) of `transaction`."""
        los = cls.trip_length(transaction[cls.data_idx_dict["begin_use_date_key"]],
                              transaction[cls.data_idx_dict["end_use_date_key"]])
        #TODO: convert to one hot vector
        feature_vector.extend([los])

    @classmethod
    def add_advanced_purchase_window(cls, feature_vector, transaction):
        """Append the advanced purchase window (in days) of `transaction`."""
        apw = cls.advanced_purchase_window(transaction[cls.data_idx_dict["pst_dtm"]],
                                           transaction[cls.data_idx_dict["begin_use_date_key"]])
        #TODO: convert to one hot vector
        feature_vector.extend([apw])

    @classmethod
    def add_site_platform(cls, feature_vector, transaction):
        """Append 0 when the transaction's site platform is PHONE, otherwise 1."""
        site_platform = transaction[cls.data_idx_dict["site_platform"]]
        feature_vector.extend([0 if site_platform == cls.PHONE else 1])

    @classmethod
    def add_isinternational(cls, feature_vector, transaction):
        """Append 1 when the transaction's international indicator is "I", otherwise 0."""
        international_ind = transaction[cls.data_idx_dict["international"]]
        feature_vector.extend([1 if international_ind == "I" else 0])

    @classmethod
    def add_trip_lob_booked(cls, feature_vector, trip, transaction_index, lob):
        """Append 1 if any transaction before `transaction_index` in `trip` has `lob`, else 0."""
        lob_idx = cls.data_idx_dict["lob"]
        booked_earlier = any(trip[i][lob_idx] == lob for i in range(0, transaction_index))
        feature_vector.extend([1 if booked_earlier else 0])

    @classmethod
    def get_transaction_index_in_history(cls, history, find_transaction):
        """Return the index of the first transaction in `history` matching `find_transaction`.

        Two transactions match when their lob, begin date, end date and trl
        fields are all equal. Returns -1 when nothing matches.
        """
        match_fields = [cls.data_idx_dict[name]
                        for name in ("lob", "begin_use_date_key", "end_use_date_key", "trl")]
        for i, transaction in enumerate(history):
            if all(transaction[field] == find_transaction[field] for field in match_fields):
                return i

        return -1

    @classmethod
    def _count_prior_bookings(cls, history, transaction, lob):
        """Count `lob` transactions that precede `transaction` in `history`.

        Returns 0 when `transaction` cannot be found in `history`.
        """
        index = cls.get_transaction_index_in_history(history, transaction)
        if index <= -1:
            return 0
        lob_idx = cls.data_idx_dict["lob"]
        return sum(1 for i in range(0, index) if history[i][lob_idx] == lob)

    @classmethod
    def add_total_car_bookings(cls, feature_vector, history, transaction):
        """Append the number of car bookings made before `transaction`."""
        feature_vector.extend([cls._count_prior_bookings(history, transaction, cls.CAR_LOB)])

    @classmethod
    def add_total_hotel_bookings(cls, feature_vector, history, transaction):
        """Append the number of hotel bookings made before `transaction`."""
        feature_vector.extend([cls._count_prior_bookings(history, transaction, cls.HOTEL_LOB)])

    @classmethod
    def add_total_bookings(cls, feature_vector, history, transaction):
        """Append the position of `transaction` in `history` (-1 when it is not found)."""
        index = cls.get_transaction_index_in_history(history, transaction)
        feature_vector.extend([index])

In [12]:
class OfflineData:
    """Accumulate per-destination booking counts and write them out as CSV.

    `destination_bookings` is a class-level dict mapping a three-letter
    destination code to a {"H": n, "C": n, "F": n} count dict. Transactions
    are indexed by position using `data_idx_dict`.
    """
    destination_bookings = {}
    data_idx_dict = Config.read_data_idx()
    HOTEL_LOB = "H"
    CAR_LOB = "C"
    FLIGHT_LOB = "F"

    def __init__(self):
        pass

    @classmethod
    def __validate_TLA__(cls, tla):
        """Return True when `tla` consists of exactly three alphabetic characters."""
        return tla.isalpha() and len(tla) == 3

    @classmethod
    def __build_map__(cls, lob_propensity_map, key, lob):
        """Increment the `lob` count for `key`, creating a zeroed entry on first sight."""
        if key not in lob_propensity_map:
            lob_propensity_map[key] = {cls.HOTEL_LOB: 0, cls.CAR_LOB: 0, cls.FLIGHT_LOB: 0}
        lob_propensity_map[key][lob] += 1

    @classmethod
    def __record_destination_booking__(cls, history):
        """Add every transaction in `history` to `destination_bookings`.

        Hotel transactions are keyed by their hotel_TLA field, all others by
        destination_TLA. Transactions whose key fails validation are ignored.
        """
        lob_idx = cls.data_idx_dict["lob"]
        for transaction in history:
            lob = transaction[lob_idx]
            key_field = "hotel_TLA" if lob == cls.HOTEL_LOB else "destination_TLA"
            key = transaction[cls.data_idx_dict[key_field]]

            if cls.__validate_TLA__(key):
                cls.__build_map__(cls.destination_bookings, key, lob)

    @classmethod
    def _propensity_columns(cls, value):
        """Return the trailing CSV columns for one count dict.

        Columns are: hotel probability, car probability (both formatted with
        four decimals), then the raw hotel, car and flight counts. Both
        probabilities are 0.0 when all three counts are zero, so an empty
        entry does not raise ZeroDivisionError.
        """
        hotel_cnt = value["H"]
        car_cnt = value["C"]
        flight_cnt = value["F"]
        total = float(hotel_cnt + car_cnt + flight_cnt)
        hotel_probability = float(hotel_cnt) / total if total else 0.0
        car_probability = float(car_cnt) / total if total else 0.0
        return ["{:.4f}".format(hotel_probability), "{:.4f}".format(car_probability),
                hotel_cnt, car_cnt, flight_cnt]

    @classmethod
    def __write_destination_propensity__(cls, path, map):
        """Write one row per entry of `map` to `path`, keyed by the first part of the key.

        Args:
            path: output CSV path.
            map: dict of key -> {"H": n, "C": n, "F": n}; only the part of
                the key before the first "-" is written.
        """
        # 'wb' is the mode the Python 2 csv module expects for writing.
        with open(path, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for key, value in map.items():
                keys = key.split("-")
                writer.writerow([keys[0]] + cls._propensity_columns(value))

    @classmethod
    def __write_as_csv__(cls, path, map):
        """Write one row per entry of `map` to `path`. No header row is written.

        Args:
            path: output CSV path.
            map: dict of key -> {"H": n, "C": n, "F": n}. A key without "-"
                produces one leading key column; otherwise the first two
                "-"-separated parts are written as two leading columns.
        """
        with open(path, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for key, value in map.items():
                keys = key.split("-")
                leading = [keys[0]] if len(keys) == 1 else [keys[0], keys[1]]
                writer.writerow(leading + cls._propensity_columns(value))



In [13]:
import json
import os

'''
Calculate offline data for Hotel confirmation page model
'''

class HotelOfflineData(OfflineData):
    """Builds the destination propensity file used by the hotel confirmation model."""

    def __init__(self):
        OfflineData.__init__(self)

    @classmethod
    def calculate_offline_data(cls, data_dir):
        """Aggregate destination bookings from `data_dir` and write them to disk.

        Every file whose name starts with "part" is read as one JSON user
        record per line; each record's "history" is added to the destination
        booking counts. The result is written to
        Config.DESTINATION_PROPENSITY_PATH.

        Args:
            data_dir: directory searched recursively for "part*" files.
        """
        for root, subdirs, filenames in os.walk(data_dir):
            for filename in filenames:
                if not filename.startswith("part"):
                    continue
                print(filename)
                # Join with `root`, not `data_dir`: os.walk also yields files
                # that live in sub-directories.
                with open(os.path.join(root, filename), 'r') as handle:
                    for line in handle:
                        user_record = json.loads(line)
                        cls.__record_destination_booking__(user_record["history"])

        cls.__write_as_csv__(Config.DESTINATION_PROPENSITY_PATH, cls.destination_bookings)
        print("Wrote to ")
        print(Config.DESTINATION_PROPENSITY_PATH)

In [14]:
import csv
import json
import os

class HotelConfFeatureCalculator(FeatureCalculator):
    """Produces the feature vector file for the hotel confirmation page model."""

    def __init__(self):
        FeatureCalculator.__init__(self)

    @classmethod
    def load_offline_data(cls):
        """Load the destination propensities from Config.DESTINATION_PROPENSITY_PATH."""
        cls.load_destination_propensity(Config.DESTINATION_PROPENSITY_PATH)

    # for each hotel attach transaction create feature vector
    @classmethod
    def get_feature_vector(cls, trip, user, history, hotel_transaction_index):
        """Build the feature vector for the hotel transaction at `hotel_transaction_index`.

        Args:
            trip: list of transactions forming one trip.
            user: user identifier, used as the first half of the row id.
            history: the user's transaction history (currently unused).
            hotel_transaction_index: position of the hotel transaction in `trip`.

        Returns:
            list: id ("<user>-<trl>"), car and hotel destination propensity,
            length of stay, advanced purchase window, platform flag,
            international flag, then three flags telling whether a car,
            flight or hotel was booked earlier in the trip.
        """
        hotel_transaction = trip[hotel_transaction_index]

        trl = hotel_transaction[cls.data_idx_dict["trl"]]
        feature_vector = [str(user) + "-" + str(trl)]

        destination_TLA = hotel_transaction[cls.data_idx_dict["hotel_TLA"]]
        cls.add_destination_propensity(feature_vector, destination_TLA)

        cls.add_los(feature_vector, hotel_transaction)
        cls.add_advanced_purchase_window(feature_vector, hotel_transaction)
        cls.add_site_platform(feature_vector, hotel_transaction)
        cls.add_isinternational(feature_vector, hotel_transaction)

        # Order must match the header written by calculate_feature_vector.
        for lob in (cls.CAR_LOB, cls.FLIGHT_LOB, cls.HOTEL_LOB):
            cls.add_trip_lob_booked(feature_vector, trip, hotel_transaction_index, lob)

        return feature_vector

    @classmethod
    def _write_user_feature_vectors(cls, user_record, feature_writer):
        """Write one labelled row per hotel transaction that is followed by a non-flight one.

        The label ("dv") is 0 when the next transaction in the trip is a
        hotel and 1 otherwise.
        """
        lob_idx = cls.data_idx_dict["lob"]
        user = user_record["user"]
        history = user_record["history"]

        for trip in user_record["trips"]:
            for i in range(0, len(trip) - 1):
                if trip[i][lob_idx] != cls.HOTEL_LOB:
                    continue
                next_lob = trip[i + 1][lob_idx]
                if next_lob == cls.FLIGHT_LOB:
                    continue

                feature_vector = cls.get_feature_vector(trip, user, history, i)
                feature_vector.extend([0 if next_lob == cls.HOTEL_LOB else 1])
                feature_writer.writerow(feature_vector)

    '''
    Process data to produce feature vectors. Write them to disk.
    '''
    @classmethod
    def calculate_feature_vector(cls, data_dir, files_to_process):
        """Write feature vectors for up to `files_to_process` "part*" files.

        Output goes to Config.FEATURE_VECTOR_PATH with a header row.

        Args:
            data_dir: directory searched recursively for "part*" files. The
                walk now starts here instead of at a hard-coded absolute path.
            files_to_process: maximum number of "part*" files to read.
        """
        cls.load_offline_data()

        file_count = 0

        # 'wb' is the mode the Python 2 csv module expects for writing.
        with open(Config.FEATURE_VECTOR_PATH, 'wb') as feature_file:
            feature_writer = csv.writer(feature_file)
            header = ["id",
                      "car_destination_probability",
                      "hotel_destination_probability",
                      "length_of_stay_days",
                      "advanced_purchase_window_days",
                      "booking_platform",
                      "is_international",
                      "is_car_booked_for_trip",
                      "is_flight_booked_for_trip",
                      "is_hotel_booked_for_trip",
                      "dv"]
            feature_writer.writerow(header)
            for root, subdirs, filenames in os.walk(data_dir):
                for filename in filenames:
                    if not filename.startswith("part"):
                        continue
                    if file_count >= files_to_process:
                        return
                    file_count += 1
                    # Join with `root` so files found in sub-directories open correctly.
                    with open(os.path.join(root, filename), 'r') as handle:
                        for line in handle:
                            cls._write_user_feature_vectors(json.loads(line), feature_writer)

In [15]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn2pmml import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

class GenerateModel:
    """Train, evaluate and serialize a logistic regression on the feature vector file."""

    # Columns of the feature vector file that are used as model inputs.
    FEATURE_COLUMNS = (
        'car_destination_probability',
        'hotel_destination_probability',
        'length_of_stay_days',
        'advanced_purchase_window_days',
        'booking_platform',
        'is_international',
        'is_car_booked_for_trip',
        'is_flight_booked_for_trip',
        'is_hotel_booked_for_trip',
    )

    # predicts all hotels
    @classmethod
    def base_rate_model(cls, X):
        """Return an all-zero prediction vector with one entry per row of `X`."""
        return np.zeros(X.shape[0])

    @classmethod
    def _build_logistic_regression_model_(cls, X_train, y_train):
        """Fit and return a PMMLPipeline wrapping an L2 logistic regression (C=1)."""
        pmml_pipeline = PMMLPipeline([
            ("classifier", LogisticRegression(penalty='l2', C=1))
        ])

        pmml_pipeline.fit(X_train, y_train)

        return pmml_pipeline

    @classmethod
    def _compute_f1_score(cls, classifier, X_val, y_val):
        """Print micro-averaged precision/recall/F1, then the per-class values."""
        y_pred = classifier.predict(X_val)

        # NOTE(review): with average='micro' on a single-label problem the
        # three values appear to coincide with accuracy; the per-class tuple
        # printed below is the more informative one. Confirm this is intended.
        precision, recall, f1, dummy = precision_recall_fscore_support(y_val, y_pred, average='micro')

        print("Precision is %2.6f" % precision)
        print("Recall is %2.6f" % recall)
        print("F1 score is %2.6f" % f1)

        print(precision_recall_fscore_support(y_val, y_pred, average=None))

    @classmethod
    def _confusion_matrix(cls, classifier, X_val, y_val):
        """Print the row-normalized confusion matrix followed by the raw counts."""
        y_pred = classifier.predict(X_val)
        # Compute confusion matrix
        cm = confusion_matrix(y_val, y_pred)
        np.set_printoptions(precision=2, suppress=True)
        # Divide every row by its total so each row sums to 1.
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized confusion matrix')
        print(cm_normalized)
        print ('Actual confusion matrix counts')
        print(cm)

    @classmethod
    def _serialize_model_(cls, pmml_model, path):
        """Export `pmml_model` to `path` with sklearn2pmml."""
        sklearn2pmml(pmml_model, path, with_repr=True)

    @classmethod
    def _evaluate_model_(cls, model, X_test, y_test):
        """Print the accuracy of `model` on the test split."""
        print("Logistic accuracy is %2.2f" % accuracy_score(y_test, model.predict(X_test)))

    @classmethod
    def generate_model(cls, feature_vector_path, serialized_model_path):
        """Train on the feature vector CSV, print metrics and serialize the model.

        Args:
            feature_vector_path: CSV holding the FEATURE_COLUMNS and a 'dv' label column.
            serialized_model_path: destination path of the exported model.

        Returns:
            The fitted pipeline.
        """
        df = pd.read_csv(feature_vector_path)
        X = df[list(cls.FEATURE_COLUMNS)]
        y = df['dv']

        # 10% hold-out with a fixed seed so the split is reproducible.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

        model = cls._build_logistic_regression_model_(X_train, y_train)
        cls._evaluate_model_(model, X_test, y_test)
        cls._compute_f1_score(model, X_test, y_test)
        cls._confusion_matrix(model, X_test, y_test)
        cls._serialize_model_(model, serialized_model_path)

        return model


In [16]:
def main(files_to_process=10):
    """Run the whole pipeline: statistics, offline data, feature vectors, model.

    Args:
        files_to_process: maximum number of "part*" files used when building
            the feature vectors. Defaults to 10, the value that was
            previously hard-coded.
    """
    analyze(Config.DATA_DIR)
    generate_hotel_offline_data(Config.DATA_DIR)
    hotel_feature_calculator(Config.DATA_DIR, files_to_process)
    generate_model(Config.FEATURE_VECTOR_PATH, Config.SERIALIZED_MODEL_PATH)


def generate_hotel_offline_data(data_dir):
    """Compute the hotel offline data from the files under `data_dir`."""
    HotelOfflineData().calculate_offline_data(data_dir)


def analyze(data_dir):
    """Print the basic statistics of the files under `data_dir`."""
    DataAnalysis().analyze_data(data_dir)


def hotel_feature_calculator(data_dir, files_to_process):
    """Generate feature vectors from at most `files_to_process` files of `data_dir`."""
    HotelConfFeatureCalculator().calculate_feature_vector(data_dir, files_to_process)


def generate_model(feature_vector_path, serialized_model_path):
    """Train the model on `feature_vector_path` and export it to `serialized_model_path`."""
    # The fitted model returned by GenerateModel.generate_model is discarded here.
    GenerateModel().generate_model(feature_vector_path, serialized_model_path)

# Run the whole pipeline when this cell (or the exported script) is executed directly.
if __name__ == '__main__':
    main()

part-00001
part-00002
part-00003
part-00004
part-00005
part-00006
part-00007
part-00008
part-00009
part-00010
17406
Users
184706
Transactions
272138
Trips
44083
flight transactions
113162
hotel transactions
127267
car transactions
31709
flight attach trips
27419
F-H trips
5716
F-C trips
3673
H-H trips
13967
H-C trips
3439
part-00001
part-00002
part-00003
part-00004
part-00005
part-00006
part-00007
part-00008
part-00009
part-00010
Wrote to 
./dataset/destination_propensity.csv
Logistic accuracy is 0.90
Precision is 0.903335
Recall is 0.903335
F1 score is 0.903335
(array([0.90597205, 0.125     ]), array([0.99673812, 0.0044843 ]), array([0.94919015, 0.00865801]), array([2146,  223]))
Normalized confusion matrix
[[1. 0.]
 [1. 0.]]
Actual confusion matrix counts
[[2139    7]
 [ 222    1]]
