In [2]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
# Create a Kaggle API client and authenticate it before any competition calls are made.
# NOTE(review): presumably picks up credentials from the local Kaggle config
# (kaggle.json or environment variables) — confirm; never hardcode them in the notebook.
api = KaggleApi()
api.authenticate()

In [3]:
import pandas as pd
from autogluon.tabular import TabularPredictor

### Data

datetime - hourly date + timestamp
season -  1 = spring, 2 = summer, 3 = fall, 4 = winter
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather - 1: Clear, Few clouds, Partly cloudy
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals

In [41]:
# Load the training set, converting `datetime` to real timestamps while reading
# so that pandas' `.dt` accessors can be used on the column later.
train = pd.read_csv("./Data/train.csv", parse_dates=["datetime"])
train.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [42]:
# Summary statistics of the training frame (value ranges, spread, row count).
train.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
count,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0,10886.0
mean,2.506614,0.028569,0.680875,1.418427,20.23086,23.655084,61.88646,12.799395,36.021955,155.552177,191.574132
std,1.116174,0.166599,0.466159,0.633839,7.79159,8.474601,19.245033,8.164537,49.960477,151.039033,181.144454
min,1.0,0.0,0.0,1.0,0.82,0.76,0.0,0.0,0.0,0.0,1.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,47.0,7.0015,4.0,36.0,42.0
50%,3.0,0.0,1.0,1.0,20.5,24.24,62.0,12.998,17.0,118.0,145.0
75%,4.0,0.0,1.0,2.0,26.24,31.06,77.0,16.9979,49.0,222.0,284.0
max,4.0,1.0,1.0,4.0,41.0,45.455,100.0,56.9969,367.0,886.0,977.0


In [43]:
# Load the test set and parse `datetime` exactly as is done for the training
# set, so both frames carry the same datetime64 dtype when handed to the model
# (previously this column was left as plain strings despite the reminder).
test = pd.read_csv("./Data/test.csv")
test["datetime"] = pd.to_datetime(test["datetime"])
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [63]:
# Summary statistics of the test frame, for comparison against the training set.
test.describe()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
count,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0,6493.0
mean,2.4933,0.029108,0.685815,1.436778,20.620607,24.012865,64.125212,12.631157
std,1.091258,0.168123,0.464226,0.64839,8.059583,8.782741,19.293391,8.250151
min,1.0,0.0,0.0,1.0,0.82,0.0,16.0,0.0
25%,2.0,0.0,0.0,1.0,13.94,16.665,49.0,7.0015
50%,3.0,0.0,1.0,1.0,21.32,25.0,65.0,11.0014
75%,3.0,0.0,1.0,2.0,27.06,31.06,81.0,16.9979
max,4.0,1.0,1.0,4.0,40.18,50.0,100.0,55.9986


In [44]:
# Load Kaggle's sample submission; its `count` column is overwritten with our
# predictions later and the frame is written back out as the submission file.
# NOTE(review): `datetime` is not parsed here (kept as read from the CSV) — confirm that is intended.
submission = pd.read_csv("./Data/sampleSubmission.csv")
submission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,0
1,2011-01-20 01:00:00,0
2,2011-01-20 02:00:00,0
3,2011-01-20 03:00:00,0
4,2011-01-20 04:00:00,0


In [64]:
# Summary statistics of the submission frame's `count` column.
submission.describe()

Unnamed: 0,count
count,6493.0
mean,100.389969
std,89.943405
min,3.319188
25%,19.963657
50%,63.284554
75%,169.917923
max,364.556915


### Model

Requirements:
- We are predicting count, so it is the label we are setting.
- Ignore the casual and registered columns, as they are not present in the test dataset.
- Use root_mean_squared_error as the evaluation metric.
- Set a time limit of 10 minutes (600 seconds).
- Use the preset best_quality to focus on creating the best model.

In [45]:
# `casual` and `registered` are absent from the test set (and together make up
# the `count` label), so they must not be used as features.
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
train = train.drop(columns=["casual", "registered"], errors="ignore")

In [69]:
# Inspect the first 100 rows of the training frame after the column drop.
train.head(100)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,1
...,...,...,...,...,...,...,...,...,...,...
95,2011-01-05 04:00:00,1,0,1,1,9.84,11.365,48,15.0013,2
96,2011-01-05 05:00:00,1,0,1,1,9.02,11.365,47,11.0014,3
97,2011-01-05 06:00:00,1,0,1,1,8.20,9.850,47,15.0013,33
98,2011-01-05 07:00:00,1,0,1,1,7.38,9.090,43,12.9980,88


In [47]:
# Configure the AutoGluon predictor: regress on `count`, scored by RMSE.
predictor = TabularPredictor(
    label="count",
    problem_type="regression",
    eval_metric="root_mean_squared_error",
)

# Train with the `best_quality` preset under a 10-minute (600 s) budget.
# The result of fit() is bound back to `predictor`, as in a chained call.
predictor = predictor.fit(
    train_data=train,
    time_limit=600,
    presets="best_quality",
)

No path specified. Models will be saved in: "AutogluonModels/ag-20220718_210448\"
Presets specified: ['best_quality']
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels/ag-20220718_210448\"
AutoGluon Version:  0.5.0
Python Version:     3.9.12
Operating System:   Windows
Train Data Rows:    10886
Train Data Columns: 9
Label Column: count
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3483.84 MB
	Train Data (Original)  Memory Usage: 0.78 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting Identit

In [51]:
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                     model   score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3  -52.660090      26.574171  393.039987                0.000000           0.419692            3       True         16
1   RandomForestMSE_BAG_L2  -53.261897      25.019058  375.809003                1.656839          18.757193            2       True         12
2     ExtraTreesMSE_BAG_L2  -53.715353      24.336632  360.576727                0.974414           3.524917            2       True         14
3          LightGBM_BAG_L2  -54.775500      23.942918  370.338185                0.580699          13.286374            2       True         11
4           XGBoost_BAG_L2  -55.155336      23.520211  365.161487                0.157993           8.109677            2       True         15
5          CatBoost_BAG_L2  -55.426486      23.586525  401.432197         

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L1': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L1': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L1': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L1': 'StackerEnsembleModel_XT',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'LightGBMXT_BAG_L2': 'StackerEnsembleModel_LGB',
  'LightGBM_BAG_L2': 'StackerEnsembleModel_LGB',
  'RandomForestMSE_BAG_L2': 'StackerEnsembleModel_RF',
  'CatBoost_BAG_L2': 'StackerEnsembleModel_CatBoost',
  'ExtraTreesMSE_BAG_L2': 'StackerEnsembleModel_XT',
  'XGBoost_BAG_L2': 'StackerEnsembleModel_XGBoost',
  'WeightedEnsemble_L3': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif_BAG_L1': -101.58817625927213,
  'KNeighborsDist_BAG_L1': -84.14642264302962,
  'LightGBMXT_

In [60]:
# Predict hourly rental counts for the test set. Rentals cannot be negative and
# Kaggle scores this competition with RMSLE, so floor any negative regression
# output at zero before it reaches the submission file.
predictions = predictor.predict(test).clip(lower=0)

AttributeError: 'Series' object has no attribute 'length'

In [62]:
# Peek at the first few predicted counts.
predictions.head()

0    24.014580
1    41.035740
2    46.139664
3    48.773586
4    50.845512
Name: count, dtype: float32

In [56]:
# Distribution of the predicted counts; check `min` to confirm nothing is negative.
predictions.describe()

count    6493.000000
mean      100.389969
std        89.943405
min         3.319188
25%        19.963657
50%        63.284554
75%       169.917923
max       364.556915
Name: count, dtype: float64

In [65]:
# Swap the placeholder `count` column for the model's predictions (matched on
# the row index), then write the frame to disk without the index column.
submission = submission.assign(count=predictions)
submission.to_csv("submission.csv", index=False)

In [67]:
# Inspect the first 100 rows of the submission frame after filling in the predictions.
submission.head(100)

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,24.014580
1,2011-01-20 01:00:00,41.035740
2,2011-01-20 02:00:00,46.139664
3,2011-01-20 03:00:00,48.773586
4,2011-01-20 04:00:00,50.845512
...,...,...
95,2011-01-24 01:00:00,90.008713
96,2011-01-24 03:00:00,91.875603
97,2011-01-24 04:00:00,72.840492
98,2011-01-24 05:00:00,72.786636


In [70]:
# Upload submission.csv to the bike-sharing-demand competition through the Kaggle CLI.
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "first raw submission attempt: 2"

Successfully submitted to Bike Sharing Demand



  0%|          | 0.00/195k [00:00<?, ?B/s]
100%|##########| 195k/195k [00:00<00:00, 417kB/s]


In [59]:
# first raw score of

'tail' is not recognized as an internal or external command,
operable program or batch file.
