In [1]:
#! pip install category_encoders kmodes
#! pip install --upgrade pip

In [1]:
import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from kmodes.kprototypes import KPrototypes

  (fname, cnt))
  (fname, cnt))


In [45]:
# Render every column of wide frames instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

In [4]:
# Load the ride sample; the CSV's unnamed first column is a saved row index
# and is discarded straight away.
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,START_RENTAL_ZONE_HAL_ID,END_RENTAL_ZONE,END_RENTAL_ZONE_HAL_ID,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,213687.0,Bebelallee/Meenkwiese,213855.0,Hamburg,Android,28.8,1
1,108371,Stadthausbrücke / Neuer Wall,138379.0,Goldbekplatz / Semperstraße,140796.0,Hamburg,iPhone,27.05,1
2,119829,Jungfernstieg / Ballindamm,131879.0,Hauptbahnhof Ost / Hachmannplatz,131873.0,Hamburg,iPhone,11.216667,1
3,116810,Hudtwalckerstraße / Bebelallee,138378.0,Kellinghusenstraße / Loogeplatz,140795.0,Hamburg,others,75.383333,1
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,131883.0,Feldstraße / Marktstraße,244093.0,Hamburg,Android,8.983333,1


In [6]:
# Number of rides per month; months are still numeric codes at this point.
cluster_2016['Month'].value_counts()

5    649252
4    495376
3    310640
2    262906
1    229704
6     52122
Name: Month, dtype: int64

In [7]:
# Preprocessing to get back the original (textual) data: month numbers become
# month names, and the numeric zone-ID columns are removed because the zone
# name columns are kept instead.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    cluster_2016
    .replace(cleanup_months)
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


### Number of distinct classes per categorical feature

In [8]:
# Distinct vehicle IDs (a missing value, if present, counts as one class, as with `unique()`).
cluster_2016['VEHICLE_HAL_ID'].nunique(dropna=False)

10001

In [9]:
# Distinct start rental zones (a missing value, if present, counts as one class).
cluster_2016['START_RENTAL_ZONE'].nunique(dropna=False)

797

In [10]:
# Distinct end rental zones (a missing value, if present, counts as one class).
cluster_2016['END_RENTAL_ZONE'].nunique(dropna=False)

798

In [11]:
# Distinct cities (a missing value, if present, counts as one class).
cluster_2016['CITY_RENTAL_ZONE'].nunique(dropna=False)

45

In [12]:
# Distinct booking channels (a missing value, if present, counts as one class).
cluster_2016['TECHNICAL_INCOME_CHANNEL'].nunique(dropna=False)

6

In [13]:
# Distinct months (a missing value, if present, counts as one class).
cluster_2016['Month'].nunique(dropna=False)

6

##  Approach #1 - Find and Replace

* Pandas makes it easy to directly replace text values with their numeric equivalents by using `replace`.
* We are going to create a mapping dictionary that contains each column to process, together with a dictionary of the values to translate.
* This only works for categorical columns with a small number of classes, because every value has to be written out by hand in the dictionary.
* While this approach may only work in certain scenarios, it is a very useful demonstration of how to convert text values to numeric ones when there is an “easy” human interpretation of the data. The concept is also useful for more general data cleanup.


In [14]:
# List the month labels and their frequencies; these are the values the
# find-and-replace mapping has to cover.
cluster_2016["Month"].value_counts()

May         649252
April       495376
March       310640
February    262906
January     229704
June         52122
Name: Month, dtype: int64

In [15]:
# List the booking-channel labels and their frequencies; these are the values
# the find-and-replace mapping has to cover.
cluster_2016["TECHNICAL_INCOME_CHANNEL"].value_counts()

iPhone        692143
Android       530832
others        467302
IVR           284900
Windows        24607
BlackBerry       216
Name: TECHNICAL_INCOME_CHANNEL, dtype: int64

In [16]:
# Column -> {label: code} mapping consumed by `DataFrame.replace`.
# Months are numbered in calendar order; channels are numbered 1-6 in the
# order they are listed here.
cleanup_nums = {
    "Month": dict(
        zip(["January", "February", "March", "April", "May", "June"], range(1, 7))
    ),
    "TECHNICAL_INCOME_CHANNEL": dict(
        zip(["iPhone", "Android", "others", "IVR", "Windows", "BlackBerry"], range(1, 7))
    ),
}



In [17]:
# Apply the hand-written mapping to the two columns named in `cleanup_nums`.
cluster_2016 = cluster_2016.replace(cleanup_nums)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,2,28.8,1
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,1,27.05,1
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,1,11.216667,1
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,3,75.383333,1
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,2,8.983333,1


## Approach #2 - Label Encoding

* Another approach to encoding categorical values is to use a technique called label encoding.
* Label encoding is simply converting each value in a column to a number.
* For example, the TECHNICAL_INCOME_CHANNEL column contains 6 different values. We could choose to encode it like this:
    * iPhone -> 0
    * Android -> 1
    * others -> 2
    * IVR -> 3
    * Windows -> 4
    * BlackBerry -> 5
* Label encoding has the advantage that it is straightforward, but it has the disadvantage that the numeric values can be “misinterpreted” by the algorithms. For example, the value 0 is obviously less than the value 4, but does that ordering correspond to anything in the data set in real life? Does a Windows booking (4) carry more weight in our calculation than an iPhone booking (0)? In this example, I don’t think so.

In [18]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [19]:
# Cast all six categorical features to the pandas `category` dtype in one call;
# `Ride_Duration_new` is not listed and keeps its dtype.
cluster_2016 = cluster_2016.astype(
    dict.fromkeys(
        [
            "START_RENTAL_ZONE",
            "END_RENTAL_ZONE",
            "CITY_RENTAL_ZONE",
            "Month",
            "VEHICLE_HAL_ID",
            "TECHNICAL_INCOME_CHANNEL",
        ],
        'category',
    )
)

In [20]:
# Verify the casts made in the previous cell (which already covers `Month`):
# the summary should list six `category` columns plus the float duration.
cluster_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
VEHICLE_HAL_ID              category
START_RENTAL_ZONE           category
END_RENTAL_ZONE             category
CITY_RENTAL_ZONE            category
TECHNICAL_INCOME_CHANNEL    category
Ride_Duration_new           float64
Month                       category
dtypes: category(6), float64(1)
memory usage: 32.9 MB


In [21]:
# Replace each categorical column by its integer category code. The codes follow
# the category order, not a domain order (January is encoded as 2 in the output).
# NOTE: run once per reload -- the `.cat` accessor is gone once a column holds integers.
cluster_2016 = cluster_2016.assign(
    **{
        column: cluster_2016[column].cat.codes
        for column in (
            "START_RENTAL_ZONE",
            "END_RENTAL_ZONE",
            "CITY_RENTAL_ZONE",
            "Month",
            "VEHICLE_HAL_ID",
            "TECHNICAL_INCOME_CHANNEL",
        )
    }
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,1794,796,115,18,0,28.8,2
1,1574,683,318,18,4,27.05,2
2,6517,422,377,18,4,11.216667,2
3,5127,401,437,18,5,75.383333,2
4,6984,94,266,18,0,8.983333,2


## Approach #3 - One Hot Encoding

* A common alternative approach is called one hot encoding. The basic strategy is to convert each category value into a new column and assigns a 1 or 0 (True/False) value to the column. This has the benefit of not weighting a value improperly but does have the downside of adding more columns to the data set.


* Pandas supports this feature using get_dummies. This function is named this way because it creates dummy/indicator variables (aka 1 or 0).


* Hopefully a simple example will make this clearer. We can look at the column "TECHNICAL_INCOME_CHANNEL", which holds values such as iPhone, Android and so on. By using get_dummies we can convert it into six columns, each containing a 1 or a 0.


* Again, this is impractical for categorical columns with a large number of classes, because every class becomes its own column.


* So it is not applied to the other, high-cardinality columns here.

In [22]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [23]:
# One indicator column per booking channel; only a preview is shown, nothing is stored.
cluster_2016.pipe(pd.get_dummies, columns=["TECHNICAL_INCOME_CHANNEL"]).head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,Ride_Duration_new,Month,TECHNICAL_INCOME_CHANNEL_Android,TECHNICAL_INCOME_CHANNEL_BlackBerry,TECHNICAL_INCOME_CHANNEL_IVR,TECHNICAL_INCOME_CHANNEL_Windows,TECHNICAL_INCOME_CHANNEL_iPhone,TECHNICAL_INCOME_CHANNEL_others
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8,January,1,0,0,0,0,0
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05,January,0,0,0,0,1,0
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667,January,0,0,0,0,1,0
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333,January,0,0,0,0,0,1
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333,January,1,0,0,0,0,0


In [24]:
# Same one-hot preview, with the shorter "channel" prefix for the new column names.
cluster_2016.pipe(
    pd.get_dummies,
    columns=["TECHNICAL_INCOME_CHANNEL"],
    prefix=["channel"],
).head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,Ride_Duration_new,Month,channel_Android,channel_BlackBerry,channel_IVR,channel_Windows,channel_iPhone,channel_others
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8,January,1,0,0,0,0,0
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05,January,0,0,0,0,1,0
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667,January,0,0,0,0,1,0
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333,January,0,0,0,0,0,1
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333,January,1,0,0,0,0,0


## Approach #4 - Custom Binary Encoding

* Depending on the data set, you may be able to use some combination of label encoding and one hot encoding to create a binary column that meets your needs for further analysis.

* In this particular data set, there is a column called "TECHNICAL_INCOME_CHANNEL" that contains several different values.


* For the sake of discussion, maybe all we care about is whether or not the channel is an iPhone. We could use the str accessor plus np.where to create a new column that indicates whether or not the channel is an iPhone.

* Whether singling out one class of a categorical feature like this is useful depends on the data set.

In [25]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [26]:
# Flag rides booked through the iPhone channel (1) versus everything else (0).
# `regex=False` makes the search a literal substring match, and `na=False`
# makes missing channels evaluate to False: without it `.str.contains`
# yields NaN for them, which `np.where` treats as truthy and would flag as iPhone.
is_iphone = cluster_2016["TECHNICAL_INCOME_CHANNEL"].str.contains(
    "iPhone", regex=False, na=False
)
cluster_2016["CHANNEL_iPhone"] = np.where(is_iphone, 1, 0)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month,CHANNEL_iPhone
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January,0
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January,1
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January,1
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January,0
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January,0


## Approach #5 - Backward Difference Coding


In [50]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [28]:
# Check column dtypes and memory use of the freshly loaded frame before encoding.
cluster_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
VEHICLE_HAL_ID              int64
START_RENTAL_ZONE           object
END_RENTAL_ZONE             object
CITY_RENTAL_ZONE            object
TECHNICAL_INCOME_CHANNEL    object
Ride_Duration_new           float64
Month                       object
dtypes: float64(1), int64(1), object(5)
memory usage: 106.8+ MB


In [52]:
# Backward-difference coding of the three low-cardinality categorical columns.
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.BackwardDifferenceEncoder(
    cols=["CITY_RENTAL_ZONE", "TECHNICAL_INCOME_CHANNEL", "Month"],
    verbose=1,
)

# Fit on the whole frame and encode all rows; the columns not listed in `cols`
# are carried over alongside the new contrast columns.
backd_df = encoder.fit_transform(cluster_2016)

In [62]:
# Preview the first rows of the backward-difference-encoded frame.
backd_df.head()

Unnamed: 0,col_CITY_RENTAL_ZONE_0,col_CITY_RENTAL_ZONE_1,col_CITY_RENTAL_ZONE_2,col_CITY_RENTAL_ZONE_3,col_CITY_RENTAL_ZONE_4,col_CITY_RENTAL_ZONE_5,col_CITY_RENTAL_ZONE_6,col_CITY_RENTAL_ZONE_7,col_CITY_RENTAL_ZONE_8,col_CITY_RENTAL_ZONE_9,col_CITY_RENTAL_ZONE_10,col_CITY_RENTAL_ZONE_11,col_CITY_RENTAL_ZONE_12,col_CITY_RENTAL_ZONE_13,col_CITY_RENTAL_ZONE_14,col_CITY_RENTAL_ZONE_15,col_CITY_RENTAL_ZONE_16,col_CITY_RENTAL_ZONE_17,col_CITY_RENTAL_ZONE_18,col_CITY_RENTAL_ZONE_19,col_CITY_RENTAL_ZONE_20,col_CITY_RENTAL_ZONE_21,col_CITY_RENTAL_ZONE_22,col_CITY_RENTAL_ZONE_23,col_CITY_RENTAL_ZONE_24,col_CITY_RENTAL_ZONE_25,col_CITY_RENTAL_ZONE_26,col_CITY_RENTAL_ZONE_27,col_CITY_RENTAL_ZONE_28,col_CITY_RENTAL_ZONE_29,col_CITY_RENTAL_ZONE_30,col_CITY_RENTAL_ZONE_31,col_CITY_RENTAL_ZONE_32,col_CITY_RENTAL_ZONE_33,col_CITY_RENTAL_ZONE_34,col_CITY_RENTAL_ZONE_35,col_CITY_RENTAL_ZONE_36,col_CITY_RENTAL_ZONE_37,col_CITY_RENTAL_ZONE_38,col_CITY_RENTAL_ZONE_39,col_CITY_RENTAL_ZONE_40,col_CITY_RENTAL_ZONE_41,col_CITY_RENTAL_ZONE_42,col_CITY_RENTAL_ZONE_43,col_CITY_RENTAL_ZONE_44,col_TECHNICAL_INCOME_CHANNEL_0,col_TECHNICAL_INCOME_CHANNEL_1,col_TECHNICAL_INCOME_CHANNEL_2,col_TECHNICAL_INCOME_CHANNEL_3,col_TECHNICAL_INCOME_CHANNEL_4,col_TECHNICAL_INCOME_CHANNEL_5,col_Month_0,col_Month_1,col_Month_2,col_Month_3,col_Month_4,col_Month_5,col_VEHICLE_HAL_ID,col_START_RENTAL_ZONE,col_END_RENTAL_ZONE,col_Ride_Duration_new
0,1.0,-0.977778,-0.955556,-0.933333,-0.911111,-0.888889,-0.866667,-0.844444,-0.822222,-0.8,-0.777778,-0.755556,-0.733333,-0.711111,-0.688889,-0.666667,-0.644444,-0.622222,-0.6,-0.577778,-0.555556,-0.533333,-0.511111,-0.488889,-0.466667,-0.444444,-0.422222,-0.4,-0.377778,-0.355556,-0.333333,-0.311111,-0.288889,-0.266667,-0.244444,-0.222222,-0.2,-0.177778,-0.155556,-0.133333,-0.111111,-0.088889,-0.066667,-0.044444,-0.022222,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,28.8
1,1.0,-0.977778,-0.955556,-0.933333,-0.911111,-0.888889,-0.866667,-0.844444,-0.822222,-0.8,-0.777778,-0.755556,-0.733333,-0.711111,-0.688889,-0.666667,-0.644444,-0.622222,-0.6,-0.577778,-0.555556,-0.533333,-0.511111,-0.488889,-0.466667,-0.444444,-0.422222,-0.4,-0.377778,-0.355556,-0.333333,-0.311111,-0.288889,-0.266667,-0.244444,-0.222222,-0.2,-0.177778,-0.155556,-0.133333,-0.111111,-0.088889,-0.066667,-0.044444,-0.022222,1.0,0.166667,-0.666667,-0.5,-0.333333,-0.166667,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,27.05
2,1.0,-0.977778,-0.955556,-0.933333,-0.911111,-0.888889,-0.866667,-0.844444,-0.822222,-0.8,-0.777778,-0.755556,-0.733333,-0.711111,-0.688889,-0.666667,-0.644444,-0.622222,-0.6,-0.577778,-0.555556,-0.533333,-0.511111,-0.488889,-0.466667,-0.444444,-0.422222,-0.4,-0.377778,-0.355556,-0.333333,-0.311111,-0.288889,-0.266667,-0.244444,-0.222222,-0.2,-0.177778,-0.155556,-0.133333,-0.111111,-0.088889,-0.066667,-0.044444,-0.022222,1.0,0.166667,-0.666667,-0.5,-0.333333,-0.166667,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,11.216667
3,1.0,-0.977778,-0.955556,-0.933333,-0.911111,-0.888889,-0.866667,-0.844444,-0.822222,-0.8,-0.777778,-0.755556,-0.733333,-0.711111,-0.688889,-0.666667,-0.644444,-0.622222,-0.6,-0.577778,-0.555556,-0.533333,-0.511111,-0.488889,-0.466667,-0.444444,-0.422222,-0.4,-0.377778,-0.355556,-0.333333,-0.311111,-0.288889,-0.266667,-0.244444,-0.222222,-0.2,-0.177778,-0.155556,-0.133333,-0.111111,-0.088889,-0.066667,-0.044444,-0.022222,1.0,0.166667,0.333333,-0.5,-0.333333,-0.166667,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,75.383333
4,1.0,-0.977778,-0.955556,-0.933333,-0.911111,-0.888889,-0.866667,-0.844444,-0.822222,-0.8,-0.777778,-0.755556,-0.733333,-0.711111,-0.688889,-0.666667,-0.644444,-0.622222,-0.6,-0.577778,-0.555556,-0.533333,-0.511111,-0.488889,-0.466667,-0.444444,-0.422222,-0.4,-0.377778,-0.355556,-0.333333,-0.311111,-0.288889,-0.266667,-0.244444,-0.222222,-0.2,-0.177778,-0.155556,-0.133333,-0.111111,-0.088889,-0.066667,-0.044444,-0.022222,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,1.0,-0.833333,-0.666667,-0.5,-0.333333,-0.166667,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,8.983333


In [59]:
# (rows, columns) after encoding, to see how many columns the encoding produced.
backd_df.shape

(2000000, 61)

## Approach #6 - Polynomial Coding

In [54]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [55]:
# Polynomial contrast coding of the three low-cardinality categorical columns.
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.PolynomialEncoder(
    cols=["CITY_RENTAL_ZONE", "TECHNICAL_INCOME_CHANNEL", "Month"],
    verbose=1,
)
# Fit on the whole frame and encode all rows in one step.
poly_df = encoder.fit_transform(cluster_2016)

In [56]:
# Preview the first rows of the polynomial-encoded frame.
poly_df.head()

Unnamed: 0,col_CITY_RENTAL_ZONE_0,col_CITY_RENTAL_ZONE_1,col_CITY_RENTAL_ZONE_2,col_CITY_RENTAL_ZONE_3,col_CITY_RENTAL_ZONE_4,col_CITY_RENTAL_ZONE_5,col_CITY_RENTAL_ZONE_6,col_CITY_RENTAL_ZONE_7,col_CITY_RENTAL_ZONE_8,col_CITY_RENTAL_ZONE_9,col_CITY_RENTAL_ZONE_10,col_CITY_RENTAL_ZONE_11,col_CITY_RENTAL_ZONE_12,col_CITY_RENTAL_ZONE_13,col_CITY_RENTAL_ZONE_14,col_CITY_RENTAL_ZONE_15,col_CITY_RENTAL_ZONE_16,col_CITY_RENTAL_ZONE_17,col_CITY_RENTAL_ZONE_18,col_CITY_RENTAL_ZONE_19,col_CITY_RENTAL_ZONE_20,col_CITY_RENTAL_ZONE_21,col_CITY_RENTAL_ZONE_22,col_CITY_RENTAL_ZONE_23,col_CITY_RENTAL_ZONE_24,col_CITY_RENTAL_ZONE_25,col_CITY_RENTAL_ZONE_26,col_CITY_RENTAL_ZONE_27,col_CITY_RENTAL_ZONE_28,col_CITY_RENTAL_ZONE_29,col_CITY_RENTAL_ZONE_30,col_CITY_RENTAL_ZONE_31,col_CITY_RENTAL_ZONE_32,col_CITY_RENTAL_ZONE_33,col_CITY_RENTAL_ZONE_34,col_CITY_RENTAL_ZONE_35,col_CITY_RENTAL_ZONE_36,col_CITY_RENTAL_ZONE_37,col_CITY_RENTAL_ZONE_38,col_CITY_RENTAL_ZONE_39,col_CITY_RENTAL_ZONE_40,col_CITY_RENTAL_ZONE_41,col_CITY_RENTAL_ZONE_42,col_CITY_RENTAL_ZONE_43,col_CITY_RENTAL_ZONE_44,col_TECHNICAL_INCOME_CHANNEL_0,col_TECHNICAL_INCOME_CHANNEL_1,col_TECHNICAL_INCOME_CHANNEL_2,col_TECHNICAL_INCOME_CHANNEL_3,col_TECHNICAL_INCOME_CHANNEL_4,col_TECHNICAL_INCOME_CHANNEL_5,col_Month_0,col_Month_1,col_Month_2,col_Month_3,col_Month_4,col_Month_5,col_VEHICLE_HAL_ID,col_START_RENTAL_ZONE,col_END_RENTAL_ZONE,col_Ride_Duration_new
0,1.0,-0.252523,0.311825,-0.345127,0.357969,-0.353969,0.336502,-0.308996,0.274849,-0.237247,0.198969,-0.162251,0.12871,-0.099354,0.074637,-0.054566,0.038818,-0.026865,0.018083,-0.011833,0.007524,-0.004647,0.002785,-0.001619,0.000912,-0.000497,0.000262,-0.000134,6.6e-05,-3.1e-05,1.4e-05,-6e-06,3e-06,-1e-06,3.875912e-07,-1.389398e-07,4.705947e-08,-1.487207e-08,4.326767e-09,-1.137028e-09,2.039489e-10,-1.916536e-10,1.269234e-10,7.651156e-13,2.336223e-11,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,28.8
1,1.0,-0.252523,0.311825,-0.345127,0.357969,-0.353969,0.336502,-0.308996,0.274849,-0.237247,0.198969,-0.162251,0.12871,-0.099354,0.074637,-0.054566,0.038818,-0.026865,0.018083,-0.011833,0.007524,-0.004647,0.002785,-0.001619,0.000912,-0.000497,0.000262,-0.000134,6.6e-05,-3.1e-05,1.4e-05,-6e-06,3e-06,-1e-06,3.875912e-07,-1.389398e-07,4.705947e-08,-1.487207e-08,4.326767e-09,-1.137028e-09,2.039489e-10,-1.916536e-10,1.269234e-10,7.651156e-13,2.336223e-11,1.0,-0.358569,-0.109109,0.521749,-0.566947,0.31497,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,27.05
2,1.0,-0.252523,0.311825,-0.345127,0.357969,-0.353969,0.336502,-0.308996,0.274849,-0.237247,0.198969,-0.162251,0.12871,-0.099354,0.074637,-0.054566,0.038818,-0.026865,0.018083,-0.011833,0.007524,-0.004647,0.002785,-0.001619,0.000912,-0.000497,0.000262,-0.000134,6.6e-05,-3.1e-05,1.4e-05,-6e-06,3e-06,-1e-06,3.875912e-07,-1.389398e-07,4.705947e-08,-1.487207e-08,4.326767e-09,-1.137028e-09,2.039489e-10,-1.916536e-10,1.269234e-10,7.651156e-13,2.336223e-11,1.0,-0.358569,-0.109109,0.521749,-0.566947,0.31497,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,11.216667
3,1.0,-0.252523,0.311825,-0.345127,0.357969,-0.353969,0.336502,-0.308996,0.274849,-0.237247,0.198969,-0.162251,0.12871,-0.099354,0.074637,-0.054566,0.038818,-0.026865,0.018083,-0.011833,0.007524,-0.004647,0.002785,-0.001619,0.000912,-0.000497,0.000262,-0.000134,6.6e-05,-3.1e-05,1.4e-05,-6e-06,3e-06,-1e-06,3.875912e-07,-1.389398e-07,4.705947e-08,-1.487207e-08,4.326767e-09,-1.137028e-09,2.039489e-10,-1.916536e-10,1.269234e-10,7.651156e-13,2.336223e-11,1.0,-0.119523,-0.436436,0.298142,0.377964,-0.629941,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,75.383333
4,1.0,-0.252523,0.311825,-0.345127,0.357969,-0.353969,0.336502,-0.308996,0.274849,-0.237247,0.198969,-0.162251,0.12871,-0.099354,0.074637,-0.054566,0.038818,-0.026865,0.018083,-0.011833,0.007524,-0.004647,0.002785,-0.001619,0.000912,-0.000497,0.000262,-0.000134,6.6e-05,-3.1e-05,1.4e-05,-6e-06,3e-06,-1e-06,3.875912e-07,-1.389398e-07,4.705947e-08,-1.487207e-08,4.326767e-09,-1.137028e-09,2.039489e-10,-1.916536e-10,1.269234e-10,7.651156e-13,2.336223e-11,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,1.0,-0.597614,0.545545,-0.372678,0.188982,-0.062994,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,8.983333


In [57]:
# (rows, columns) after encoding, to see how many columns the encoding produced.
poly_df.shape

(2000000, 61)

## Approach #7 - Hashing

In [74]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [81]:
# Hash the channel and month columns; both are folded into one shared set of
# hash columns (`col_0` ... `col_7` in the output) that replaces the originals.
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.HashingEncoder(
    cols=['TECHNICAL_INCOME_CHANNEL', 'Month'],
    verbose=1,
)
# Fit on the whole frame and encode all rows in one step.
hash_df = encoder.fit_transform(cluster_2016)

In [82]:
# Preview the first rows of the hash-encoded frame.
hash_df.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,Ride_Duration_new
0,0,0,0,0,0,1,1,0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8
1,0,1,0,0,0,0,1,0,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05
2,0,1,0,0,0,0,1,0,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667
3,0,0,0,1,0,0,1,0,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333
4,0,0,0,0,0,1,1,0,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333


## Approach #8 - Helmert Coding

In [85]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [86]:
# Helmert contrast coding of the channel and month columns.
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.HelmertEncoder(
    cols=["TECHNICAL_INCOME_CHANNEL", "Month"],
    verbose=1,
)
# Fit on the whole frame and encode all rows in one step.
helm_df = encoder.fit_transform(cluster_2016)

In [87]:
# Preview the first rows of the Helmert-encoded frame.
helm_df.head()

Unnamed: 0,col_TECHNICAL_INCOME_CHANNEL_0,col_TECHNICAL_INCOME_CHANNEL_1,col_TECHNICAL_INCOME_CHANNEL_2,col_TECHNICAL_INCOME_CHANNEL_3,col_TECHNICAL_INCOME_CHANNEL_4,col_TECHNICAL_INCOME_CHANNEL_5,col_Month_0,col_Month_1,col_Month_2,col_Month_3,col_Month_4,col_Month_5,col_VEHICLE_HAL_ID,col_START_RENTAL_ZONE,col_END_RENTAL_ZONE,col_CITY_RENTAL_ZONE,col_Ride_Duration_new
0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8
1,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05
2,1.0,1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667
3,1.0,0.0,2.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333
4,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333


## Approach #9 - Binary Encoder

In [93]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [94]:
# Binary encoding of the channel and month columns; each feature is spread
# over a few 0/1 columns (three per feature in the output).
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.BinaryEncoder(
    cols=["TECHNICAL_INCOME_CHANNEL", "Month"],
    verbose=1,
)
# Fit on the whole frame and encode all rows in one step.
binary_df = encoder.fit_transform(cluster_2016)

In [95]:
# Preview the first rows of the binary-encoded frame.
binary_df.head()

Unnamed: 0,TECHNICAL_INCOME_CHANNEL_0,TECHNICAL_INCOME_CHANNEL_1,TECHNICAL_INCOME_CHANNEL_2,Month_0,Month_1,Month_2,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,Ride_Duration_new
0,0,0,0,0,0,0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8
1,0,0,1,0,0,0,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05
2,0,0,1,0,0,0,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667
3,0,1,0,0,0,0,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333
4,0,0,0,0,0,0,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333


## Approach #10 - Sum Coding

In [2]:
# Reload from the CSV so this approach starts from un-encoded data:
# drop the saved index and the numeric zone IDs, then restore month names.
cleanup_months = {
    "Month": {1: "January", 2: "February", 3: 'March', 4: "April", 5: 'May', 6: 'June'}
}
cluster_2016 = (
    pd.read_csv('cluster_data_less.csv', index_col=False)
    .drop(columns=['Unnamed: 0'])
    .drop(columns=['START_RENTAL_ZONE_HAL_ID', 'END_RENTAL_ZONE_HAL_ID'])
    .replace(cleanup_months)
)
cluster_2016.head()

Unnamed: 0,VEHICLE_HAL_ID,START_RENTAL_ZONE,END_RENTAL_ZONE,CITY_RENTAL_ZONE,TECHNICAL_INCOME_CHANNEL,Ride_Duration_new,Month
0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,Android,28.8,January
1,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,iPhone,27.05,January
2,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,iPhone,11.216667,January
3,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,others,75.383333,January
4,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,Android,8.983333,January


In [3]:
# Sum (deviation) contrast coding of the channel and month columns.
# `verbose` is an option of the encoder constructor; `fit` does not act on it
# (it would only be absorbed by **kwargs), so it is set on the encoder itself.
encoder = ce.SumEncoder(
    cols=["TECHNICAL_INCOME_CHANNEL", "Month"],
    verbose=1,
)
# Fit on the whole frame and encode all rows in one step.
sum_df = encoder.fit_transform(cluster_2016)

In [4]:
# Preview the first rows of the sum-encoded frame.
sum_df.head()

Unnamed: 0,col_TECHNICAL_INCOME_CHANNEL_0,col_TECHNICAL_INCOME_CHANNEL_1,col_TECHNICAL_INCOME_CHANNEL_2,col_TECHNICAL_INCOME_CHANNEL_3,col_TECHNICAL_INCOME_CHANNEL_4,col_TECHNICAL_INCOME_CHANNEL_5,col_Month_0,col_Month_1,col_Month_2,col_Month_3,col_Month_4,col_Month_5,col_VEHICLE_HAL_ID,col_START_RENTAL_ZONE,col_END_RENTAL_ZONE,col_CITY_RENTAL_ZONE,col_Ride_Duration_new
0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,108647,Überseering/Mexikoring,Bebelallee/Meenkwiese,Hamburg,28.8
1,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,108371,Stadthausbrücke / Neuer Wall,Goldbekplatz / Semperstraße,Hamburg,27.05
2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,119829,Jungfernstieg / Ballindamm,Hauptbahnhof Ost / Hachmannplatz,Hamburg,11.216667
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,116810,Hudtwalckerstraße / Bebelallee,Kellinghusenstraße / Loogeplatz,Hamburg,75.383333
4,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,120345,Bahnhof Dammtor Nord // Theodor-Heuss-Platz,Feldstraße / Marktstraße,Hamburg,8.983333


In [6]:
# (rows, columns) after encoding, to see how many columns the encoding produced.
sum_df.shape

(2000000, 17)