In [1]:
# important Dependencies
import feather
import pandas as pd
import numpy as np
import os  # Add Indirect Path to file (not needed in Direct Path)

In [2]:
# Build a relative path to the raw August 2019 Citi Bike trip CSV, which is
# expected inside the "Datasources" folder.
citibike_data_to_load = os.path.join(
    "Datasources",
    "201908-citibike-tripdata.csv",
)

In [3]:
# 1. Load the 201908-citibike-tripdata CSV into a pandas DataFrame.
citibike_data_df = pd.read_csv(citibike_data_to_load)

# Sanity check: display two randomly chosen rows of the loaded frame.
citibike_data_df.sample(2)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
1380821,1140,2019-08-19 01:30:43.0240,2019-08-19 01:49:43.7800,3086.0,Graham Ave & Conselyea St,40.715143,-73.944507,3777.0,Stockholm St & Wilson Ave,40.699304,-73.923044,19909,Customer,1997,2
1116229,1042,2019-08-15 14:21:39.4220,2019-08-15 14:39:02.3210,3557.0,40 Ave & 9 St,40.75742,-73.945133,3602.0,31 Ave & 34 St,40.763154,-73.920827,15062,Subscriber,1985,2


In [4]:
# 2. Check the datatypes of the columns as loaded from the CSV.
# At this point 'tripduration' is int64 and 'starttime'/'stoptime' are
# plain object (string) columns.
citibike_data_df.dtypes

tripduration                 int64
starttime                   object
stoptime                    object
start station id           float64
start station name          object
start station latitude     float64
start station longitude    float64
end station id             float64
end station name            object
end station latitude       float64
end station longitude      float64
bikeid                       int64
usertype                    object
birth year                   int64
gender                       int64
dtype: object

In [5]:
# 3. Convert the 'tripduration' column to a datetime datatype.
# The integer values are interpreted as a number of seconds (unit="s"), so
# the resulting timestamps are offsets from 1970-01-01 00:00:00.
tripduration_datetime = pd.to_datetime(
    citibike_data_df["tripduration"],
    unit="s",
)

In [6]:
# Wrap the converted Series in a single-column DataFrame and preview the
# first two rows.
tripduration_dt = tripduration_datetime.to_frame()
tripduration_dt.head(n=2)

Unnamed: 0,tripduration
0,1970-01-01 00:06:33
1,1970-01-01 00:10:27


In [7]:
# Overwrite the original integer 'tripduration' column with the converted
# datetime values.
citibike_data_df["tripduration"] = tripduration_dt["tripduration"]

# Spot-check three random rows to confirm that the 'tripduration' column now
# holds the converted values.
citibike_data_df.sample(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
75581,1970-01-01 00:12:07,2019-08-01 20:11:02.2870,2019-08-01 20:23:10.0490,304.0,Broadway & Battery Pl,40.704633,-74.013617,328.0,Watts St & Greenwich St,40.724055,-74.00966,18036,Subscriber,1992,1
506452,1970-01-01 00:09:24,2019-08-07 11:44:07.8200,2019-08-07 11:53:32.6320,3156.0,E 72 St & York Ave,40.766638,-73.953483,3137.0,5 Ave & E 73 St,40.772828,-73.966853,29402,Subscriber,1969,1
1577362,1970-01-01 00:31:49,2019-08-21 16:02:46.1110,2019-08-21 16:34:35.7580,3158.0,W 63 St & Broadway,40.771639,-73.982614,3336.0,E 97 St & Madison Ave,40.787801,-73.953559,26538,Customer,1995,2


In [8]:
# 4. Check the datatypes of the columns again after the conversion.
# 'tripduration' should now be reported as datetime64[ns]; the other columns
# are unchanged.
citibike_data_df.dtypes

tripduration               datetime64[ns]
starttime                          object
stoptime                           object
start station id                  float64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                    float64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                              int64
dtype: object

In [9]:
# 5. Export the DataFrame as a new CSV file without the index.
# The destination is built as a relative path into the same "Datasources"
# folder the input CSV is read from, so the notebook does not depend on one
# user's absolute Windows path.
# NOTE(review): assumes the kernel's working directory is the folder that
# contains "Datasources" (the same assumption the load cell makes) - confirm.
citibike_data_to_save = os.path.join(
    "Datasources",
    "201908-citibike-tripdata-update.csv",
)
citibike_data_df.to_csv(citibike_data_to_save, header=True, index=False)

In [10]:
# Also write the updated DataFrame in Feather format, tried here because
# it's faster than CSV.
# The file is written to the current working directory, not "Datasources".
# NOTE(review): the standalone `feather` package appears to be superseded by
# pyarrow / pandas' DataFrame.to_feather - confirm before relying on it.
feather.write_dataframe(citibike_data_df, '201908-citibike-tripdata.feather')