Imports

In [71]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn import linear_model

# Silence pandas' chained-assignment warning: later cells assign columns on
# DataFrames that were produced by filtering other DataFrames.
pd.set_option('mode.chained_assignment', None)

Load data

In [72]:
# Input files, read from the current working directory
TRAIN_CSV = 'bicikelj_train.csv'
TEST_CSV = 'bicikelj_test.csv'

# Both frames carry a "timestamp" column that the following cells parse
train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

Feature creation

In [73]:
def _to_naive_timestamp(raw):
    """Parse one raw timestamp value and strip its timezone information."""
    return pd.to_datetime(raw).tz_localize(None)


# Replace the raw timestamp column of both frames with timezone-naive pandas
# timestamps. Values are parsed one at a time, exactly as they appear in the file.
for frame in (train, test):
    frame["timestamp"] = [_to_naive_timestamp(raw) for raw in frame["timestamp"].values]

# Columns to predict: every train column after the first one
# (assumes the first column is "timestamp" - TODO confirm against the CSV)
classes = train.columns[1:]

# Array of train timestamps used by the nearest-time lookups below
times = train["timestamp"].values

In [74]:
def get_closest_time(t, reference_times=None):
    """Return the position of the reference timestamp closest to ``t``.

    Parameters
    ----------
    t : pandas.Timestamp or numpy.datetime64
        The moment to look up.
    reference_times : array-like of datetime64, optional
        Timestamps to search. When omitted, the notebook-level ``times`` array
        is used, which keeps existing single-argument calls working. Passing it
        explicitly avoids depending on whichever value ``times`` currently has.

    Returns
    -------
    int
        Zero-based position of the nearest timestamp in ``reference_times``.
        On a tie the first (lowest) position wins, as with ``numpy.argmin``.

    Raises
    ------
    ValueError
        If ``reference_times`` is empty.
    """
    if reference_times is None:
        reference_times = times
    reference_times = np.asarray(reference_times)

    if reference_times.size == 0:
        raise ValueError("reference_times is empty; there is no closest timestamp")

    # Absolute distance from every reference timestamp to t
    distances = np.abs(reference_times - np.full_like(reference_times, t))
    return int(np.argmin(distances))

Create features for timestamp difference

In [75]:
# Largest tolerated distance, in minutes, between a lagged target time and the
# nearest timestamp recorded in the train set
MAX_GAP_MINUTES = 15


def _gap_to_nearest_minutes(t, minutes_back):
    """Minutes between ``t - minutes_back`` and the nearest timestamp in ``times``."""
    target = t - pd.Timedelta(minutes=minutes_back)
    nearest = times[get_closest_time(target)]
    return np.abs(target - nearest).total_seconds() / 60


# Remove train samples that have a break that is too big: a row is kept only
# when the train set holds a measurement within MAX_GAP_MINUTES of both the
# 60 minute and the 120 minute lag. The mask is built in one pass and applied
# once instead of re-filtering the whole frame for every dropped timestamp.
keep_row = np.array(
    [
        not (
            _gap_to_nearest_minutes(t, 60) > MAX_GAP_MINUTES
            or _gap_to_nearest_minutes(t, 120) > MAX_GAP_MINUTES
        )
        for t in train['timestamp']
    ],
    dtype=bool,
)
train = train[keep_row]

# Tag test samples that have a 2 hour break: no train measurement lies within
# MAX_GAP_MINUTES of the 60 minute lag, so only the 120 minute lag is usable.
# `times` still refers to the unfiltered train timestamps at this point.
test["is_two_hour_break"] = [
    int(_gap_to_nearest_minutes(t, 60) > MAX_GAP_MINUTES)
    for t in test['timestamp']
]

In [76]:
# Rebuild the lookup array from the filtered train set so that the positions
# returned by get_closest_time line up with positional access into train
times = train["timestamp"].to_numpy()

In [77]:

# Lags, in minutes, for which the position of the nearest train row is stored
LAG_MINUTES = (60, 90, 120)


def add_time_features(frame, label):
    """Add hour, weekday and lagged-position columns to ``frame`` in place.

    The columns are appended in this order: ``hour``, ``weekday``,
    ``bikes_before_60``, ``bikes_before_90``, ``bikes_before_120``. Later cells
    select features by position, so the order matters.

    Each ``bikes_before_<lag>`` column holds the position (as returned by
    ``get_closest_time``) of the train timestamp nearest to
    ``timestamp - lag minutes``. It does not hold a bike count yet; the counts
    are looked up per station during training.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a parsed ``timestamp`` column; it is modified in place.
    label : str
        Name of the data set, used only in the progress message.
    """
    print("Creating features for " + label + " set...")

    hours = []
    weekdays = []
    lag_positions = {lag: [] for lag in LAG_MINUTES}
    total = len(frame['timestamp'])

    # Single pass over the timestamps; values are collected in lists and
    # written as whole columns rather than one masked row at a time
    for cnt, t in enumerate(frame['timestamp'], start=1):
        print(cnt, '/', str(total), end='\r')

        hours.append(t.hour)
        weekdays.append(t.weekday())
        for lag in LAG_MINUTES:
            lag_positions[lag].append(get_closest_time(t - pd.Timedelta(minutes=lag)))

    frame['hour'] = hours
    frame['weekday'] = weekdays
    for lag in LAG_MINUTES:
        frame['bikes_before_' + str(lag)] = lag_positions[lag]


add_time_features(train, "train")
add_time_features(test, "test")

print("Done!")

Creating features for train set...
Creating features for test set...
Done!40


One-hot-encoding

In [78]:
# Use one-hot-encoding for hour and weekday. drop_first=True drops the first
# category found in each frame, and categories absent from a frame produce no
# column at all, so train and test can end up with different dummy columns.
train = pd.get_dummies(train, columns=['hour', 'weekday'], drop_first=True)
test = pd.get_dummies(test, columns=['hour', 'weekday'], drop_first=True)


def test_insert_column_after(name, after):
    """Add an all-zero column ``name`` to ``test`` directly after column ``after``.

    ``test`` is modified in place. If ``name`` already exists its values are
    overwritten with zeros, so call this only for columns that are missing.
    """
    test[name] = 0
    insert_at = test.columns.get_loc(after) + 1
    test.insert(insert_at, name, test.pop(name))


# Insert missing columns to test set: walk train's dummy columns (everything
# after 'bikes_before_120') in order and add each one test lacks right after
# its predecessor. Columns test already has are left untouched, and no hour or
# weekday is hard-coded.
first_dummy_pos = train.columns.get_loc('bikes_before_120') + 1
previous_col = 'bikes_before_120'
for dummy_col in train.columns[first_dummy_pos:]:
    if dummy_col not in test.columns:
        test_insert_column_after(dummy_col, previous_col)
    previous_col = dummy_col


Training and predicting

In [79]:
# Number of trailing columns used as model input: the 3 lag columns followed by
# the hour and weekday dummy columns (expected to be 23 + 6 after encoding)
N_FEATURES = 32

LAG_COLUMNS = ['bikes_before_60', 'bikes_before_90', 'bikes_before_120']
# Lags that cannot be used when a test sample follows a two hour break
SHORT_LAG_COLUMNS = ['bikes_before_60', 'bikes_before_90']

# Feature names are taken from train once and reused for test, so both
# matrices are guaranteed to have the same columns in the same order
feature_cols = list(train.columns[-N_FEATURES:])

# The lag columns hold positions into train, not counts. They are identical
# for every station, so they are extracted once outside the loop.
train_lag_pos = {col: train[col].to_numpy().astype(int) for col in LAG_COLUMNS}
test_lag_pos = {col: test[col].to_numpy().astype(int) for col in LAG_COLUMNS}

# Test rows that must be predicted by the 120-minute-only model
use_120_only = (test['is_two_hour_break'] == 1).to_numpy()

cnt = 0

# Create a new model for each Bicikelj station
for c in classes:
    cnt += 1

    print('                                                                                                                              ', end='\r')
    print(cnt, '/', str(len(classes)), ' ', c, end='\r')

    # Target variable and the same values as a plain array for positional lookup
    y = train[c]
    station_counts = train[c].to_numpy()

    # Build the input matrices; copies keep the positions stored in train and
    # test intact for the next station
    X = train[feature_cols].copy()
    X_test = test[feature_cols].copy()

    # Replace the stored positions with this station's bike counts at those
    # positions; test rows also look their counts up in train
    for col in LAG_COLUMNS:
        X[col] = station_counts[train_lag_pos[col]]
        X_test[col] = station_counts[test_lag_pos[col]]

    # Model using all three lags
    model = linear_model.LinearRegression()
    model.fit(X, y)

    # Second model without the 60 and 90 minute lags
    model_120 = linear_model.LinearRegression()
    model_120.fit(X.drop(columns=SHORT_LAG_COLUMNS), y)

    # Predict the test set with both models
    y_pred = model.predict(X_test)
    y_pred_120 = model_120.predict(X_test.drop(columns=SHORT_LAG_COLUMNS))

    # Put in y_pred_120 where "is_two_hour_break" is 1
    y_pred_final = np.where(use_120_only, y_pred_120, y_pred)

    # Set negative values to 0
    y_pred_final = np.maximum(y_pred_final, 0)

    test[c] = y_pred_final



83 / 83   SUPERNOVA LJUBLJANA - RUDNIK                                                                                        

Remove unnecessary columns from the test set

In [80]:
# Everything that was added to the test frame only to build model inputs
helper_cols = ['bikes_before_60', 'bikes_before_90', 'bikes_before_120', 'is_two_hour_break']
hour_cols = ['hour_' + str(hour) for hour in range(1, 24)]
weekday_cols = ['weekday_' + str(day) for day in range(1, 7)]

# Drop all of them in a single call; the remaining columns form the output
test_out = test.drop(columns=helper_cols + hour_cols + weekday_cols)


Save the result

In [81]:
# The output file name carries the current day_month-hour-minute-second
now = datetime.now()
out_name = 'bicikelj_out_{}.csv'.format(now.strftime("%d_%m-%H-%M-%S"))

# Write the predictions without the DataFrame index
test_out.to_csv(out_name, index=False)