# Data Preprocessing

Train/Dev/Test splits, outlier trimming, filling in missing values, etc. should be done here. Save the new DataFrame to a file when done.

In [12]:
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Train/Dev Splits

In [13]:
# Read the tour records with Tour_ID as the index, then order rows by that index.
all_data = (
    pd.read_csv("data/Train.csv", index_col="Tour_ID")
    .sort_index()
)

# Table of column names and their definitions; shown as this cell's output.
descriptions = pd.read_csv("data/VariableDefinitions.csv")
descriptions

Unnamed: 0,Column Name,Definition
0,id,Unique identifier for each tourist
1,country,The country a tourist coming from.
2,age_group,The age group of a tourist.
3,travel_with,The relation of people a tourist travel with t...
4,total_female,Total number of females
5,total_male,Total number of males
6,purpose,The purpose of visiting Tanzania
7,main_activity,The main activity of tourism in Tanzania
8,infor_source,The source of information about tourism in Tan...
9,tour_arrangment,The arrangment of visiting Tanzania


In [14]:
# Display the (rows, columns) dimensions of the loaded frame.
all_data.shape

(18506, 20)

In [16]:
# Hold out 30% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    all_data.iloc[:, :-1],  # features: every column except the last
    all_data.iloc[:, -1],   # label: the last column
    test_size=0.3,
    random_state=13,
)
y_test

Tour_ID
tour_idoz4s8ore     Higher Cost
tour_idmrpgckx9      Lower Cost
tour_idb9tw1xhe     Higher Cost
tour_id15p5ryu5       High Cost
tour_idbt8hghl6      Lower Cost
                       ...     
tour_id669wplzi        Low Cost
tour_idkky33aii     Higher Cost
tour_idgiqq4jp5    Highest Cost
tour_id5pfwrcvu        Low Cost
tour_idoh2wy0j4       High Cost
Name: cost_category, Length: 5552, dtype: object

The training features and labels are now in `X_train` and `y_train`, while the held-out dev set (30% of the rows) is in `X_test` and `y_test`.

### One-Hot Encoding

One-hot encoding is a way to represent our categorical variables as numbers, and hence make them compatible with an ML model. In this convention, each distinct value ("type") in a column is made into a column of its own; rows that have this value are assigned a 1 in that value's column and 0 otherwise.

In [19]:
# Categorical feature columns that get expanded into 0/1 indicator columns.
# Kept in one place so the train and test encodings cannot drift apart.
CATEGORICAL_COLUMNS = [
    'country', 'age_group', 'travel_with', 'purpose', 'main_activity',
    'info_source', 'tour_arrangement', 'package_transport_int',
    'package_accomodation', 'package_food', 'package_transport_tz',
    'package_sightseeing', 'package_guided_tour', 'package_insurance',
    'first_trip_tz',
]

# The training split defines the feature space.
train_encoded = pd.get_dummies(X_train, columns=CATEGORICAL_COLUMNS)

# Encoding the test split on its own can yield a different set of indicator
# columns (a category may occur in only one split). Reindexing to the training
# columns adds the missing indicators as all-zero columns, drops indicators for
# categories never seen in training, and puts the columns in the same order.
test_encoded = (
    pd.get_dummies(X_test, columns=CATEGORICAL_COLUMNS)
    .reindex(columns=train_encoded.columns, fill_value=0)
)

# One indicator column per label value. `columns=` only applies to DataFrame
# input, so it is omitted for these Series.
train_y_encoded = pd.get_dummies(y_train)

# Previously a chained assignment here overwrote `train_y_encoded` with the
# test labels; the two are now assigned independently.
test_y_encoded = (
    pd.get_dummies(y_test)
    .reindex(columns=train_y_encoded.columns, fill_value=0)
)

# Guard the invariants that downstream modelling cells rely on.
assert list(test_encoded.columns) == list(train_encoded.columns)
assert len(train_y_encoded) == len(train_encoded)
assert len(test_y_encoded) == len(test_encoded)

In [23]:
# Preview the first rows of the one-hot encoded test features.
test_encoded.head()

Unnamed: 0_level_0,total_female,total_male,night_mainland,night_zanzibar,country_ALGERIA,country_ARGENTINA,country_AUSTRALIA,country_AUSTRIA,country_BAHRAIN,country_BANGLADESH,...,package_transport_tz_No,package_transport_tz_Yes,package_sightseeing_No,package_sightseeing_Yes,package_guided_tour_No,package_guided_tour_Yes,package_insurance_No,package_insurance_Yes,first_trip_tz_No,first_trip_tz_Yes
Tour_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tour_idoz4s8ore,2.0,1.0,6,10,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,0,1
tour_idmrpgckx9,0.0,1.0,2,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,1,0
tour_idb9tw1xhe,2.0,3.0,6,6,0,0,0,0,0,0,...,0,1,0,1,0,1,0,1,0,1
tour_id15p5ryu5,2.0,1.0,0,8,0,0,0,0,0,0,...,1,0,1,0,1,0,0,1,0,1
tour_idbt8hghl6,0.0,2.0,4,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,0,1
