In [1]:
import numpy as np
import pandas as pd

## Remove rows with missing values

In [2]:
# Stream the raw CSV line by line, so it never has to be held in memory as a
# whole, and keep only the rows in which every comma-separated field is filled in.
with open('data/train.csv', 'r') as rf, open('data/train_clean.csv', 'w') as wf:
    for line in rf:
        # Splitting on the delimiter finds an empty field in ANY position: at the
        # start of the line, in the middle, or at the end (also on a final line
        # that has no trailing newline). Assumes no field is quoted and contains
        # a comma itself.
        fields = line.rstrip('\r\n').split(',')
        if '' not in fields:
            wf.write(line)

train_clean = pd.read_csv('data/train_clean.csv', delimiter=',')
print("Length of the cleaned train data: {}".format(len(train_clean)))

Length of the cleaned train data: 24117894


## Modification functions

In [3]:
# Compute the season from the month
def get_season(mm=""):
    """Map a month to its season code.

    Args:
        mm: Month number 1-12, given as an int or as a numeric string such as
            ``"07"``. ``None``, any float (pandas marks missing values with the
            float ``NaN``) and empty or non-numeric strings count as missing.

    Returns:
        The season code as a string - ``"1"`` winter (Dec-Feb), ``"2"`` spring
        (Mar-May), ``"3"`` summer (Jun-Aug), ``"4"`` autumn (Sep-Nov) - or
        ``None`` when the month is missing, unparseable or outside 1-12.
    """
    # isinstance (rather than an exact type comparison) also recognises float
    # subclasses such as numpy.float64 as "missing".
    if mm is None or isinstance(mm, float):
        return None
    try:
        month = int(mm)
    except (TypeError, ValueError):
        # e.g. the default "" or any other text that is not a number
        return None

    season_by_month = {
        12: "1", 1: "1", 2: "1",    # winter
        3: "2", 4: "2", 5: "2",     # spring
        6: "3", 7: "3", 8: "3",     # summer
        9: "4", 10: "4", 11: "4",   # autumn
    }
    return season_by_month.get(month)

def mod_df(data = pd.DataFrame()):
    """Turn a raw search-log frame into an all-integer feature frame.

    Two features are derived and the raw columns listed in ``columns`` below
    are dropped:

    * ``srch_ci_season`` - season code of the check-in month (via ``get_season``)
    * ``srch_ppl_cnt``   - ``srch_adults_cnt + srch_children_cnt``

    Every other column is kept in its original order, followed by the two new
    ones. Missing values are replaced by 0 (so a season of 0 means "check-in
    month unknown") and the whole frame is cast to ``int``.

    Args:
        data: Frame containing ``srch_ci`` (dash-separated date strings whose
            second part is the month) and every column named in ``columns``.
            The frame passed in is left untouched.

    Returns:
        A new DataFrame holding only integer columns.

    Raises:
        KeyError: if a required column is missing - this includes calling the
            function with its default, empty frame.
        ValueError: if a remaining column cannot be cast to ``int``.
    """
    # Only the month of the check-in date is needed. Picking the second part of
    # the split yields NaN for dates that are missing or have no second part.
    ci_month = data["srch_ci"].str.split("-").str[1]

    columns = ['date_time','user_id','user_location_country','srch_adults_cnt',
               'channel','site_name','srch_children_cnt','user_location_region',
               'srch_co','posa_continent','hotel_continent','srch_destination_type_id',
               'srch_ci','orig_destination_distance']

    # drop() returns a new frame, so the caller's frame is never modified.
    features = data.drop(columns, axis=1).assign(
        srch_ci_season=ci_month.map(get_season),
        # plain column arithmetic instead of a Python-level call per row
        srch_ppl_cnt=data['srch_adults_cnt'] + data['srch_children_cnt'],
    )

    features = features.fillna(0)
    features = features.astype('int')

    return features

## Create train data

In [4]:
# Sample every `read_every`-th line of the cleaned file until `max_rows` data
# rows have been collected. Line 0 is the header and is written in addition.
read_every = 11
max_rows = 100000
with open('data/train_clean.csv', 'r') as rf, open('data/train_clean_100e_line.csv', 'w') as wf:
    rows = 0
    for counter, line in enumerate(rf):
        if counter % read_every == 0:
            wf.write(line)
            rows += 1
            # header + max_rows data rows written: stop instead of reading
            # the rest of the file for nothing
            if rows > max_rows:
                break

train_data = pd.read_csv('data/train_clean_100e_line.csv', delimiter=',')
print("Length of the train data: {}".format(len(train_data)))

Length of the train data: 100000


In [5]:
# Column names of the raw train sample, before mod_df is applied.
train_data.columns

Index(['date_time', 'site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'is_booking', 'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster'],
      dtype='object')

In [6]:
# Replace the raw sample with its engineered, all-integer feature frame.
# NOTE: this cell is not idempotent - mod_df reads the raw 'srch_ci' column,
# which the returned frame no longer has, so re-create train_data from the CSV
# before running this cell a second time.
train_data = mod_df(train_data)

In [7]:
# Columns that remain after feature engineering.
train_data.columns

Index(['user_location_city', 'is_mobile', 'is_package', 'srch_rm_cnt',
       'srch_destination_id', 'is_booking', 'cnt', 'hotel_country',
       'hotel_market', 'hotel_cluster', 'srch_ci_season', 'srch_ppl_cnt'],
      dtype='object')

In [8]:
# Persist the engineered train features.
# NOTE(review): the file name says "2m", but train_data holds the 100000-row
# sample created earlier in this notebook. The name is left as is because other
# code may read this path - confirm before renaming.
train_data.to_csv('data/train_clean_2m_line_modified.csv', sep=',', index=False)

## Create validation data

In [9]:
# Sample every `read_every`-th line of the cleaned file until `max_rows` data
# rows have been collected. Line 0 is the header and is written in addition.
read_every = 19
max_rows = 1000000

# The train split is drawn from this same file: every 11th line, up to line
# 11 * 100000. A line whose index is a multiple of both 11 and 19 would land in
# BOTH splits and leak training rows into the validation set, so those lines
# are skipped here. Keep these two values in sync with the train sampling cell.
train_read_every = 11
train_last_line = train_read_every * 100000

with open('data/train_clean.csv', 'r') as rf, open('data/validation_1m_line.csv', 'w') as wf:
    rows = 0
    for counter, line in enumerate(rf):
        if counter % read_every != 0:
            continue
        # counter 0 is the header line and must always be written
        used_for_training = (0 < counter <= train_last_line
                             and counter % train_read_every == 0)
        if used_for_training:
            continue
        wf.write(line)
        rows += 1
        # header + max_rows data rows written: stop instead of reading
        # the rest of the file for nothing
        if rows > max_rows:
            break

validation_data = pd.read_csv('data/validation_1m_line.csv', delimiter=',')
print("Length of the validation data: {}".format(len(validation_data)))

Length of the validation data: 1000000


In [10]:
# Column names of the raw validation sample, before mod_df is applied.
validation_data.columns

Index(['date_time', 'site_name', 'posa_continent', 'user_location_country',
       'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'is_booking', 'cnt', 'hotel_continent', 'hotel_country', 'hotel_market',
       'hotel_cluster'],
      dtype='object')

In [11]:
# Replace the raw sample with its engineered, all-integer feature frame.
# NOTE: this cell is not idempotent - mod_df reads the raw 'srch_ci' column,
# which the returned frame no longer has, so re-create validation_data from the
# CSV before running this cell a second time.
validation_data = mod_df(validation_data)

In [12]:
# Check the column dtypes after mod_df, which casts the whole frame to int.
validation_data.dtypes

user_location_city     int64
is_mobile              int64
is_package             int64
srch_rm_cnt            int64
srch_destination_id    int64
is_booking             int64
cnt                    int64
hotel_country          int64
hotel_market           int64
hotel_cluster          int64
srch_ci_season         int64
srch_ppl_cnt           int64
dtype: object

In [13]:
# Columns that remain after feature engineering.
validation_data.columns

Index(['user_location_city', 'is_mobile', 'is_package', 'srch_rm_cnt',
       'srch_destination_id', 'is_booking', 'cnt', 'hotel_country',
       'hotel_market', 'hotel_cluster', 'srch_ci_season', 'srch_ppl_cnt'],
      dtype='object')

In [14]:
# Persist the engineered validation features (no index column is written).
validation_data.to_csv('data/validation_clean_1m_line_modified.csv', sep=',', index=False)

## Create test data

In [15]:
# Load the test set in full. Unlike the train file it is neither filtered for
# empty fields nor sub-sampled in this notebook.
test_data = pd.read_csv('data/test.csv', delimiter = ',')

In [16]:
# Column names of the raw test set, before mod_df is applied.
test_data.columns

Index(['id', 'date_time', 'site_name', 'posa_continent',
       'user_location_country', 'user_location_region', 'user_location_city',
       'orig_destination_distance', 'user_id', 'is_mobile', 'is_package',
       'channel', 'srch_ci', 'srch_co', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_id', 'srch_destination_type_id',
       'hotel_continent', 'hotel_country', 'hotel_market'],
      dtype='object')

In [17]:
# Apply the same feature engineering as for the train and validation samples.
# Missing values in the test set end up as 0 (mod_df fills them before casting).
# NOTE: this cell is not idempotent - mod_df reads the raw 'srch_ci' column,
# which the returned frame no longer has, so reload test_data before running
# this cell a second time.
test_data = mod_df(test_data)

In [18]:
# Columns that remain after feature engineering.
test_data.columns

Index(['id', 'user_location_city', 'is_mobile', 'is_package', 'srch_rm_cnt',
       'srch_destination_id', 'hotel_country', 'hotel_market',
       'srch_ci_season', 'srch_ppl_cnt'],
      dtype='object')

In [19]:
# Persist the engineered test features (no index column is written).
test_data.to_csv('data/test_modified.csv', sep=',', index=False)