In [None]:
# A használt könyvtárak beolvasása
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="whitegrid")
from collections import Counter

In [None]:
# Load the training and test datasets (train first, then test)
train_data, test_data = (
    pd.read_csv(csv_path, delimiter=',') for csv_path in ('train.csv', 'test.csv')
)

In [None]:
# Summary statistics of the training data
train_data.describe()

In [None]:
# Summary statistics of the test data
test_data.describe()

In [None]:
# Number of rows in the test data
len(test_data)

In [None]:
# Number of rows in the training data
len(train_data)

In [None]:
# Heatmap of the missing-value mask of the training data, saved to disk
missing_mask = train_data.isnull()
sns.heatmap(missing_mask)
plt.savefig("nan_columns.jpg")

# Exact number of missing values in selected columns
for column in ('orig_destination_distance', 'srch_ci', 'srch_co'):
    print(train_data[column].isnull().sum())

In [None]:
# Row counts split by the is_booking flag
print("Foglalások száma:\t", len(train_data[train_data['is_booking'].isin([1])]))  # a booking happened
print("Keresések száma:\t", len(train_data[train_data['is_booking'].isin([0])]))  # search only, no booking

# Pass the column as x= explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data['is_booking'])
plt.savefig('is_booking.jpg')

In [None]:
# Row counts split by the is_mobile flag
print("Telefonos keresések:\t", len(train_data[train_data['is_mobile'].isin([1])]))  # search made from a phone
print("Egyéb keresések:\t", len(train_data[train_data['is_mobile'].isin([0])]))  # search not made from a phone

# Pass the column as x= explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data['is_mobile'])
plt.savefig('is_mobile.jpg')

In [None]:
# Row counts split by the is_package flag
print("Csomagban történő foglalás/keresés:\t", len(train_data[train_data['is_package'].isin([1])]))  # booking/search as part of a package
print("Nem csomagban történő foglalás/keresés:\t", len(train_data[train_data['is_package'].isin([0])]))  # booking/search not part of a package

# Pass the column as x= explicitly: recent seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=train_data['is_package'])
plt.savefig('is_package.jpg')

In [None]:
# Break the search timestamp and the check-in date into year / month / day columns
search_date_parts = train_data["date_time"].str.split("-", expand=True)
train_data[["search_year", "search_mm", "day-time"]] = search_date_parts

# The third piece is split once more, on the space, into the day and the time
day_and_clock = train_data["day-time"].str.split(" ", expand=True)
train_data[["search_day", "time"]] = day_and_clock

checkin_parts = train_data["srch_ci"].str.split("-", expand=True)
train_data[["srch_ci_year", "srch_ci_mm", "srch_ci_day"]] = checkin_parts

# The two intermediate columns are not kept
train_data = train_data.drop(['day-time', 'time'], axis=1)

In [None]:
# Derive the season from a month number
def get_season(mm=""):
    """Map a month number to the name of its season.

    Parameters
    ----------
    mm : str or int, optional
        Month number (1-12), as an integer or as a string accepted by
        ``int()`` such as ``"07"``. Missing values (``None``, any float
        including NaN, or an empty/blank string, which is also the
        default) are treated as "no month".

    Returns
    -------
    str or None
        ``"winter"`` (12, 1, 2), ``"spring"`` (3-5), ``"summer"`` (6-8) or
        ``"autumn"`` (9-11); ``None`` for a missing value or a number
        outside 1-12.

    Raises
    ------
    ValueError
        If ``mm`` is a non-blank string that ``int()`` cannot parse.
    """
    # Floats are how a missing month shows up after the string split (NaN);
    # isinstance also catches numpy float subclasses, which type(...) == float missed.
    if mm is None or isinstance(mm, float):
        return None
    # The declared default "" used to crash in int(""); treat it as missing.
    if isinstance(mm, str) and not mm.strip():
        return None

    month = int(mm)
    season_by_month = {
        12: "winter", 1: "winter", 2: "winter",
        3: "spring", 4: "spring", 5: "spring",
        6: "summer", 7: "summer", 8: "summer",
        9: "autumn", 10: "autumn", 11: "autumn",
    }
    # Out-of-range month numbers fall through to None, as before.
    return season_by_month.get(month)

# Season of the check-in month, one value per row
checkin_month = train_data['srch_ci_mm']
train_data['srch_ci_season'] = checkin_month.apply(get_season)

In [None]:
def calc_duration(fromDate, toDate):
    """Return the number of calendar days by which ``fromDate`` follows ``toDate``.

    Note the argument order: the result is ``fromDate - toDate``, so the
    later date goes first to get a positive duration.

    Parameters
    ----------
    fromDate, toDate : str or datetime-like
        Values accepted by ``pd.Timestamp``. Either may carry a time of day;
        it is ignored, only the calendar date counts.

    Returns
    -------
    int or None
        Whole-day difference between the two calendar dates, or ``None`` if
        either argument is missing (``None`` / NaN / NaT).
    """
    if pd.isna(fromDate) or pd.isna(toDate):
        return None
    # Strip the time of day first. Timedelta.days rounds toward negative
    # infinity, so "same date at midnight" minus "same date at 07:46" would
    # otherwise come out as -1 instead of 0.
    later_day = pd.Timestamp(fromDate).normalize()
    earlier_day = pd.Timestamp(toDate).normalize()
    return (later_day - earlier_day).days

# Length of stay: check-out minus check-in, computed row by row
train_data['stay_duration_day'] = train_data.apply(
    lambda row: calc_duration(row['srch_co'], row['srch_ci']),
    axis=1,
)
# Lead time: check-in minus the moment of the search, computed row by row
train_data['search_duration_day'] = train_data.apply(
    lambda row: calc_duration(row['srch_ci'], row['date_time']),
    axis=1,
)

In [None]:
# Blank out rows with a non-positive stay length or a negative search lead time
# (original comment: "delete null data").
# NOTE(review): boolean-mask assignment overwrites every column of the matching
# rows with a missing value; the rows themselves remain in the frame. If the
# intent is to remove them, a filter/drop would be needed - confirm.
train_data[train_data['stay_duration_day'] <= 0] = None
train_data[train_data['search_duration_day'] < 0] = None

In [None]:
# Tally of stay lengths: number of rows for each duration from 1 to 366 days
# Count once up front; rebuilding the Counter inside the loop rescanned the
# whole column on every one of the 366 iterations.
stay_counts = Counter(train_data['stay_duration_day'])
for nights in range(1, 367):
    print(nights, ": ", stay_counts[nights])
# Number of rows without a stay length
print(train_data['stay_duration_day'].isnull().sum())

In [None]:
# Columns left out of the per-column count plots
skipped_columns = ['date_time', 'orig_destination_distance', 'srch_ci', 'srch_co',
                   'is_package', 'is_mobile', 'is_booking']

for column in train_data.columns:
    if column in skipped_columns:
        continue
    print(column + '\n')
    # One fresh figure per column: drawing every plot onto a single shared
    # figure stacked the bars of all earlier columns into each saved image.
    fig, ax = plt.subplots(figsize=(50, 20))
    # x= is passed by keyword because recent seaborn releases treat the first
    # positional argument as `data`.
    sns.countplot(x=train_data[column], palette="Set2", ax=ax)
    fig.savefig("{name}.jpg".format(name=column))
    plt.show()
    # Release the figure so dozens of large canvases do not pile up in memory
    plt.close(fig)

In [None]:
# Count of rows per posa_continent, broken down by hotel_continent.
# x= must be a keyword: with `data=` also given, a positional first argument
# collides with `data` on recent seaborn releases and raises TypeError.
sns.countplot(x='posa_continent', hue='hotel_continent', data=train_data, palette="Set3")
plt.savefig("posa_continent X hotel_continent.jpg")

In [None]:
# Count of rows per hotel_continent, broken down by posa_continent.
# x= must be a keyword: with `data=` also given, a positional first argument
# collides with `data` on recent seaborn releases and raises TypeError.
sns.countplot(x='hotel_continent', hue='posa_continent', data=train_data, palette="Set3")
plt.savefig("hotel_continent X posa_continent.jpg")

In [None]:
# Count of rows per check-in month, broken down by hotel_cluster (very wide canvas)
plt.figure(figsize=(100, 20))
# x= must be a keyword: with `data=` also given, a positional first argument
# collides with `data` on recent seaborn releases and raises TypeError.
sns.countplot(x='srch_ci_mm', hue='hotel_cluster', data=train_data)
plt.savefig("srch_ci_mm X hotel_cluster.jpg")

In [None]:
# Count of rows per hotel_cluster, broken down by check-in month (very wide canvas)
plt.figure(figsize=(100, 20))
# x= must be a keyword: with `data=` also given, a positional first argument
# collides with `data` on recent seaborn releases and raises TypeError.
sns.countplot(x='hotel_cluster', hue='srch_ci_mm', data=train_data)
plt.savefig("hotel_cluster X srch_ci_mm.jpg")

In [None]:
# user_location_country (x) against posa_continent (y).
# The vectors are passed as x=/y= keywords; recent seaborn releases accept only
# `data` positionally and reject two positional vectors with a TypeError.
sns.scatterplot(x=train_data['user_location_country'], y=train_data['posa_continent'])

In [None]:
# hotel_continent (x) against user_location_country (y).
# The vectors are passed as x=/y= keywords; recent seaborn releases accept only
# `data` positionally and reject two positional vectors with a TypeError.
sns.scatterplot(x=train_data['hotel_continent'], y=train_data['user_location_country'])

In [None]:
# posa_continent (x) against site_name (y).
# The vectors are passed as x=/y= keywords; recent seaborn releases accept only
# `data` positionally and reject two positional vectors with a TypeError.
sns.scatterplot(x=train_data['posa_continent'], y=train_data['site_name'])