# Data Science Project - Task 4

### Spark with Clustering (task 3 in spark)

In [1]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession

In [2]:
# Obtain the SparkSession for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName('clustering_in_spark')
    .getOrCreate()
)

#### Read data from csv

In [3]:
# Load the hotel snapshots: the first CSV row is the header and column types are inferred.
df = spark.read.csv(
    'Hotels_data_Changed.csv',
    header=True,
    inferSchema=True,
)

In [4]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Snapshot ID: integer (nullable = true)
 |-- Snapshot Date: timestamp (nullable = true)
 |-- Checkin Date: timestamp (nullable = true)
 |-- Days: integer (nullable = true)
 |-- Original Price: integer (nullable = true)
 |-- Discount Price: integer (nullable = true)
 |-- Discount Code: integer (nullable = true)
 |-- Available Rooms: integer (nullable = true)
 |-- Hotel Name: string (nullable = true)
 |-- Hotel Stars: integer (nullable = true)
 |-- DayDiff: integer (nullable = true)
 |-- WeekDay: string (nullable = true)
 |-- DiscountDiff: integer (nullable = true)
 |-- DiscountPerc: double (nullable = true)



## 4.a

In [5]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DateType, IntegerType
from pyspark.sql.functions import desc

#### Get the 150 hotels with maximum records

In [6]:
# Count the records per hotel, sort by that count (largest first) and keep the top 150.
hotel_names = (
    df.groupBy('Hotel Name')
    .count()
    .orderBy(col('count').desc())
    .limit(150)
)

In [7]:
hotel_names

DataFrame[Hotel Name: string, count: bigint]

#### Get list of the first (max) 150 hotels

In [None]:
# Bring the selected hotel names to the driver as a plain Python list.
maxHotel_names = [hotel_row['Hotel Name'] for hotel_row in hotel_names.collect()]

In [None]:
maxHotel_names

['Newark Liberty International Airport Marriott',
 'Hilton Garden Inn Times Square',
 'Residence Inn Newark Elizabeth Liberty International Airport',
 'Westin New York at Times Square',
 'Loews Regency New York Hotel',
 'Viceroy New York',
 'Four Seasons Hotel New York',
 'Langham Place New York Fifth Avenue',
 'The Carlyle A Rosewood Hotel',
 'DoubleTree by Hilton Metropolitan - New York City',
 'Magnuson Convention Center Hotel',
 'Hilton Garden Inn New York West 35th Street',
 'Hilton Garden Inn New York-Times Square Central',
 'Conrad New York',
 'Wyndham Garden Brooklyn Sunset Park',
 'Hilton Newark Airport',
 'Omni Berkshire Place',
 'Hilton Times Square',
 'Park Hyatt New York',
 'Homewood Suites by Hilton NY Midtown Manhattan Times Square',
 'Grand Hyatt New York',
 'The Plaza Hotel',
 'Quality Inn Woodside',
 'Hyatt Union Square New York',
 'Le Parker Meridien New York',
 'The New York EDITION',
 'W New York - Union Square',
 'Renaissance Newark Airport Hotel',
 'Hampton Inn N

#### Adding and filtering the other features

In [None]:
# Keep only the rows of the selected hotels and drop exact duplicate rows.
hotels_data = (
    df.where(col('Hotel Name').isin(maxHotel_names))
    .dropDuplicates()
)

#### Check

In [None]:
hotels_data

DataFrame[_c0: int, Snapshot ID: int, Snapshot Date: timestamp, Checkin Date: timestamp, Days: int, Original Price: int, Discount Price: int, Discount Code: int, Available Rooms: int, Hotel Name: string, Hotel Stars: int, DayDiff: int, WeekDay: string, DiscountDiff: int, DiscountPerc: double]

In [None]:
hotels_data.show()

+----+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+---------+------------+------------------+
| _c0|Snapshot ID|      Snapshot Date|       Checkin Date|Days|Original Price|Discount Price|Discount Code|Available Rooms|          Hotel Name|Hotel Stars|DayDiff|  WeekDay|DiscountDiff|      DiscountPerc|
+----+-----------+-------------------+-------------------+----+--------------+--------------+-------------+---------------+--------------------+-----------+-------+---------+------------+------------------+
| 168|          1|2015-07-17 00:00:00|2015-08-10 00:00:00|   5|          1193|          1102|            1|             -1|Hampton Inn Manha...|          3|     24|   Monday|          91| 7.627829002514669|
| 179|          1|2015-07-17 00:00:00|2015-08-19 00:00:00|   5|          1500|          1415|            1|             -1|Courtyard by Marr...|          4|     33|Wednesda

## 4.b

#### Get the 40 check-in dates with the most records

In [None]:
# Count the records per check-in date, sort by that count (largest first) and keep the top 40.
checkin = (
    hotels_data.groupBy('Checkin Date')
    .count()
    .orderBy(col('count').desc())
    .limit(40)
)

In [None]:
checkin.show()

+-------------------+-----+
|       Checkin Date|count|
+-------------------+-----+
|2015-11-11 00:00:00| 2302|
|2015-10-14 00:00:00| 1887|
|2015-11-04 00:00:00| 1885|
|2015-08-19 00:00:00| 1883|
|2015-10-28 00:00:00| 1861|
|2015-10-21 00:00:00| 1817|
|2015-11-06 00:00:00| 1808|
|2015-08-12 00:00:00| 1765|
|2015-11-05 00:00:00| 1684|
|2015-10-22 00:00:00| 1662|
|2015-11-12 00:00:00| 1649|
|2015-10-29 00:00:00| 1623|
|2015-09-10 00:00:00| 1623|
|2015-09-09 00:00:00| 1616|
|2015-11-18 00:00:00| 1582|
|2015-08-26 00:00:00| 1559|
|2015-11-10 00:00:00| 1548|
|2015-11-13 00:00:00| 1547|
|2015-10-15 00:00:00| 1473|
|2015-11-21 00:00:00| 1469|
+-------------------+-----+
only showing top 20 rows



#### get list of the first (max) 40 checkin dates

In [None]:
# Collect the selected check-in dates to the driver as a Python list.
# Going through pandas means the elements are pandas Timestamp objects.
maxCheckin = checkin.toPandas()['Checkin Date'].tolist()

In [None]:
maxCheckin

[Timestamp('2015-11-11 00:00:00'),
 Timestamp('2015-10-14 00:00:00'),
 Timestamp('2015-11-04 00:00:00'),
 Timestamp('2015-08-19 00:00:00'),
 Timestamp('2015-10-28 00:00:00'),
 Timestamp('2015-10-21 00:00:00'),
 Timestamp('2015-11-06 00:00:00'),
 Timestamp('2015-08-12 00:00:00'),
 Timestamp('2015-11-05 00:00:00'),
 Timestamp('2015-10-22 00:00:00'),
 Timestamp('2015-11-12 00:00:00'),
 Timestamp('2015-10-29 00:00:00'),
 Timestamp('2015-09-10 00:00:00'),
 Timestamp('2015-09-09 00:00:00'),
 Timestamp('2015-11-18 00:00:00'),
 Timestamp('2015-08-26 00:00:00'),
 Timestamp('2015-11-10 00:00:00'),
 Timestamp('2015-11-13 00:00:00'),
 Timestamp('2015-10-15 00:00:00'),
 Timestamp('2015-11-21 00:00:00'),
 Timestamp('2015-09-30 00:00:00'),
 Timestamp('2015-10-30 00:00:00'),
 Timestamp('2015-09-16 00:00:00'),
 Timestamp('2015-09-17 00:00:00'),
 Timestamp('2015-11-28 00:00:00'),
 Timestamp('2015-10-01 00:00:00'),
 Timestamp('2015-11-26 00:00:00'),
 Timestamp('2015-09-11 00:00:00'),
 Timestamp('2015-09-

#### Adding and filtering the other features

In [None]:
# Restrict the selected hotels to the selected check-in dates and drop exact duplicate rows.
hotelsCheckin = (
    hotels_data.where(col('Checkin Date').isin(maxCheckin))
    .dropDuplicates()
)

#### Check

In [None]:
hotelsCheckin

DataFrame[_c0: int, Snapshot ID: int, Snapshot Date: timestamp, Checkin Date: timestamp, Days: int, Original Price: int, Discount Price: int, Discount Code: int, Available Rooms: int, Hotel Name: string, Hotel Stars: int, DayDiff: int, WeekDay: string, DiscountDiff: int, DiscountPerc: double]

In [None]:
# Cheapest discount price per (hotel, check-in date, discount code).
# The grouping key is spelled 'Hotel Name', exactly as in the schema, so the cell does
# not depend on Spark's case-insensitive column resolution.
# NOTE: this rebinds hotelsCheckin to the aggregated frame, which no longer has a
# 'Discount Price' column; re-running this cell requires re-running the filter cell first.
hotelsCheckin = (
    hotelsCheckin
    .groupBy('Hotel Name', 'Checkin Date', 'Discount Code')
    .min('Discount Price')
)


In [None]:
hotelsCheckin.show()

## 4.c

#### For each check-in date we need the 4 prices of the 4 discount codes. To get them, we build a helper table containing every combination and combine it (via a union) with the table we already have

In [None]:
# Distinct hotel names present in hotelsCheckin, collected as Row objects.
names = (
    hotelsCheckin.select('Hotel Name')
    .dropDuplicates()
    .collect()
)

In [None]:
# Unwrap the collected rows into a plain list of hotel names.
namesList = [hotel_row['Hotel Name'] for hotel_row in names]

In [None]:
namesList

In [None]:
# Distinct check-in dates present in hotelsCheckin, collected as Row objects.
checkins = (
    hotelsCheckin.select('Checkin Date')
    .dropDuplicates()
    .collect()
)

In [None]:
# Unwrap the collected rows into a plain list of check-in dates.
checkinsList = [date_row['Checkin Date'] for date_row in checkins]

In [None]:
checkinsList

In [None]:
# Discount codes 1 through 4, used to build the placeholder grid.
uniqueCode = list(range(1, 5))

#### Build the table that will help us insert the 4 codes for each checkin

In [None]:
# One placeholder row [hotel, check-in date, discount code, -1] for every combination,
# generated in the same nesting order: hotels, then dates, then codes.
helpTable = [
    [hotel, checkin_date, code, -1]
    for hotel in namesList
    for checkin_date in checkinsList
    for code in uniqueCode
]

In [None]:
len(checkinsList)

#### Making the schema for dataframe

In [None]:
from pyspark.sql.types import *
import numpy as np

In [None]:
# Schema of the placeholder table. It mirrors, column by column, the aggregated
# hotelsCheckin frame it is later unioned with (union matches columns by position).
# 'Checkin Date' is a timestamp in the source data and the placeholder values are the
# collected timestamps, so it is declared as TimestampType rather than DateType; this
# avoids truncating the values to dates and relying on an implicit cast in the union.
dfSchema = StructType([
    StructField("Hotel Name", StringType()),
    StructField("Checkin Date", TimestampType()),
    StructField("Discount Code", IntegerType()),
    StructField("min(Discount Price)", LongType()),
])

#### Creating spark dataframe

In [None]:
# Turn the placeholder rows into a Spark DataFrame with the explicit schema.
s_df = spark.createDataFrame(
    data=helpTable,
    schema=dfSchema,
)

In [None]:
s_df.groupby(['Hotel name','Checkin Date','Discount Code']).count().show()

In [None]:
s_df.count()

#### Filter from dataframe that we already have only :Hotel Name, Checkin Date, Discount Code, Discount Price

In [None]:
#hotels_df=hotelsCheckin.select('Hotel Name', 'Checkin Date','Discount Code', 'Discount Price')

In [None]:
#hotels_df.groupby(['Hotel Name', 'Checkin Date','Discount Code']).count().show()

#### Union the dataframe with the table that we build

In [None]:
# Append the placeholder rows (price -1) to the aggregated prices.
# union() matches columns by position, not by name, so both frames must list
# hotel, check-in date, discount code and price in the same order.
all_df=hotelsCheckin.union(s_df)

In [None]:
all_df.show()

In [None]:
# Per (hotel, check-in date, discount code) keep the largest price value, so the -1
# placeholder only survives for combinations that have no real price.
alldf = (
    all_df
    .groupBy('Hotel Name', 'Checkin Date', 'Discount Code')
    .max('min(Discount Price)')
)

In [None]:
#alldf.show(n=1000)

In [None]:
alldf

#### Group by to order the details

In [None]:
#all_df = all_df.groupBy('Hotel name','Checkin Date', 'Discount Code').min('Discount Price')

In [None]:
#all_df.show()

#### Partition data by hotel name

In [None]:
#all_dfparts=alldf.repartition("Hotel name")

## 4.d

#### Split into two groups: rows with price -1, and all the other rows

In [None]:
# Rows whose price is the -1 placeholder ...
groupMinus = alldf.where(col('max(min(Discount Price))') == -1)
# ... and rows whose price is greater than -1.
groupWithout = alldf.where(col('max(min(Discount Price))') > -1)

### Normalization

In [None]:
# Register the rows that have a real price as the temporary view "normalization"
# (an existing view with that name is replaced).
groupWithout.createOrReplaceTempView("normalization")

#### Creating SQLContext for SQL converting

In [None]:
from pyspark.sql import SQLContext
from pyspark.sql.window import Window
import pyspark.sql.functions as func

In [None]:
# SQLContext expects the SparkContext as its first argument; the SparkSession is passed
# separately so the context is bound to the session that owns the temporary view.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

#### Preparing data for normalization

In [None]:
# Read the "normalization" temporary view back as a DataFrame.
dataFrame = spark.table("normalization")

In [None]:
# Partition the rows by hotel so that window functions are evaluated per hotel.
windowSpec = Window.partitionBy('Hotel Name')

### Normalize function

In [None]:
# Min-max normalization of the price to the range 0..100, computed per hotel
# (windowSpec partitions by hotel).
price = dataFrame['max(min(Discount Price))']
lowest_price = func.min(price).over(windowSpec)
highest_price = func.max(price).over(windowSpec)
price_range = highest_price - lowest_price

# A hotel with a single distinct price has a zero range. Dividing by it would yield
# null instead of a number, and such nulls are dropped when the values are later
# gathered with collect_list, shifting that hotel's features. Map that case to 0.
normalize = (
    func.when(price_range == 0, func.lit(0.0))
    .otherwise((price - lowest_price) / price_range * 100)
)

In [None]:
# Keep the three key columns and add the normalized price as "Normal".
normalized = dataFrame.select(
    'Hotel Name',
    'Checkin Date',
    'Discount Code',
    normalize.alias("Normal"),
)

#### Getting back the values -1

In [None]:
# Rename the price column of the placeholder rows to 'Normal' so that this frame has
# the same column names as the normalized frame it is unioned with next.
groupMinus = groupMinus.withColumnRenamed('max(min(Discount Price))','Normal')

#### Union all data frames with sorting

In [None]:
# Add the -1 placeholder rows back and sort by hotel, check-in date and discount code.
normalized = (
    normalized
    .union(groupMinus)
    .orderBy('Hotel name', 'Checkin Date', 'Discount Code')
)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.ml.linalg import Vectors,VectorUDT
from pyspark.ml.linalg import SparseVector, DenseVector

In [None]:
def _as_dense_vector(values):
    """Wrap a sequence of numbers in a dense ML vector."""
    return Vectors.dense(values)


# UDF that converts an array column into a vector column.
to_vector = udf(_as_dense_vector, VectorUDT())

In [None]:
# Build one feature list per hotel.
# collect_list gives no guarantee about element order after the groupBy shuffle, so a
# prior orderBy is not enough to align the features across hotels. Each value is
# therefore collected together with its (check-in date, discount code) key, the structs
# are sorted by that key, and only then is the normalized price extracted. Collecting
# structs also keeps entries whose 'Normal' is null instead of silently dropping them.
normalList = (
    normalized
    .groupBy('Hotel Name')
    .agg(
        F.sort_array(
            F.collect_list(F.struct('Checkin Date', 'Discount Code', 'Normal'))
        ).alias('ordered')
    )
    .select('Hotel Name', F.col('ordered.Normal').alias('features'))
)

In [None]:
#normal_toList.collect()

In [None]:
normalList.rdd.map(lambda x: len(x[1])).collect()

In [None]:
import pandas as pd

# One label for the hotel name plus one integer label (1..N) per normalized price.
# N is derived from the grid that was built (check-in dates x discount codes) instead
# of being hard-coded as 160.
n_features = len(checkinsList) * len(uniqueCode)
columns = ['Hotel Name'] + list(range(1, n_features + 1))

# normalList has one row per hotel, so all rows are collected rather than a fixed 150.
# NOTE: this rebinds `df` from the Spark DataFrame read from the CSV to a pandas
# DataFrame; the Spark cells above cannot be re-run afterwards without reloading the CSV.
df = pd.DataFrame(
    [[hotel_row[0]] + list(hotel_row[1]) for hotel_row in normalList.collect()],
    columns=columns,
)


## 4.e

#### The resulting DataFrame has one row per hotel and 161 columns: one for the hotel name and 160 for the normalized prices (40 check-in dates × 4 discount codes)

In [None]:
df.head()

In [None]:
df.columns

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [None]:
#featuresCols = [str(i) for i in range(1,161)]

In [None]:
#featuresCols

In [None]:
#vec_assembler= VectorAssembler(inputCols=featuresCols,outputCol='features')

In [None]:
df['Hotel Name'].count()

## 4.f

#### Drawing the dendrogram

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from itertools import islice 
from matplotlib import pyplot as plt


#### If the DataFrame contains any NaN values, replace them with -1 (as a check)

In [None]:
# Replace every missing value with -1, the same marker used for "no price".
df=df.fillna(-1)

#### Compute the linkage matrix `dist` from the numeric values only (without the hotel names), using the Ward variance minimization algorithm (`'ward'`)

In [None]:
# Ward linkage over every column except the first one (the hotel name).
dist = linkage(df.iloc[:, 1:], method='ward')
# Display the shape of the linkage matrix.
dist.shape

#### Drawing -dendrogram

In [None]:
# Draw the dendrogram on an explicit axes object, labelling each leaf with its hotel name.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Hierarchical Clustering')
ax.set_ylabel('Distance')
dendrogram(
    dist,
    labels=df['Hotel Name'].tolist(),
    leaf_rotation=90,
    leaf_font_size=8,
    ax=ax,
)
plt.show()