- This Jupyter Notebook contains the <span style="color:blue">*cleaning_stationdata*</span> function that cleans the stations.csv file in order to do proper dock station analysis

In [10]:
# Import libraries
import pandas as pd
import numpy as np
import re

In [11]:
# Read the merged stations.csv file. Note stationdata_process.ipynb for more information
stations = pd.read_csv('../data/stationdata/stations.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
stations_cleaning = stations.copy()

In [5]:
stations_cleaning.head(10)

Unnamed: 0,dock_id,dock_name,date,hour,minute,pm,avail_bikes,avail_docks,tot_docks,_lat,_long,in_service,status_key
0,72,"""W 52 St & 11 Ave""","""15-03-01""",1,6,0,8,31,39,40.7673,-73.9939,1,1
1,72,"""W 52 St & 11 Ave""","""15-03-01""",1,16,0,8,31,39,40.7673,-73.9939,1,1
2,72,"""W 52 St & 11 Ave""","""15-03-01""",1,25,0,8,30,38,40.7673,-73.9939,1,1
3,72,"""W 52 St & 11 Ave""","""15-03-01""",1,35,0,8,30,38,40.7673,-73.9939,1,1
4,72,"""W 52 St & 11 Ave""","""15-03-01""",1,45,0,8,30,38,40.7673,-73.9939,1,1
5,72,"""W 52 St & 11 Ave""","""15-03-01""",1,54,0,7,31,38,40.7673,-73.9939,1,1
6,72,"""W 52 St & 11 Ave""","""15-03-01""",2,2,0,8,31,39,40.7673,-73.9939,1,1
7,72,"""W 52 St & 11 Ave""","""15-03-01""",2,13,0,8,31,39,40.7673,-73.9939,1,1
8,72,"""W 52 St & 11 Ave""","""15-03-01""",2,22,0,8,31,39,40.7673,-73.9939,1,1
9,72,"""W 52 St & 11 Ave""","""15-03-01""",2,32,0,8,31,39,40.7673,-73.9939,1,1


In [6]:
stations_cleaning.isnull().sum()

dock_id        0
dock_name      0
date           0
hour           0
minute         0
pm             0
avail_bikes    3
avail_docks    0
tot_docks      1
_lat           2
_long          2
in_service     0
status_key     2
dtype: int64

In [12]:
def cleaning_stationdata(df):
    """This function will clean the station data from that was merged in Linux command lines.
    The data cleaning function contains several new features including day of week, hour, and dock status.
    Dock status is calculated as available bikes / total docks. Running this function will ultimately provide
    a combined csv file called stations_cleaned.csv
    """
    
    
    df.dropna(inplace = True)
    df.drop(df[df['dock_id'].apply(lambda x: isinstance(x, str))].index, inplace = True)
    df = df[df['tot_docks'] < 500]
    
    
    mask = ~df['avail_bikes'].astype(str).str.contains('[A-z]')
    df = df[mask]
    
    
    # Remove quotation marks
    df['avail_bikes'] = df['avail_bikes'].apply(lambda x: re.sub("\"", "", str(x)))
    # Drop empty values
    df = df[df['avail_bikes'] != ""]
    # Convert strings to integers
    df['avail_bikes'] = df['avail_bikes'].astype(float).astype(int)
    # Remove any row with an impossible number of bikes
    df = df[df['avail_bikes'] <= 200]
    
    mask = ~df['avail_docks'].astype(str).str.contains('[A-z]')
    df = df[mask]

    
    # Remove quotation marks 
    df['avail_docks'] = df['avail_docks'].apply(lambda x: re.sub("\"", "", str(x)))
    # Drop empty values
    df = df[df['avail_docks'] != ""]
    # Convert strings to integers
    df['avail_docks'] = df['avail_docks'].astype(float).astype(int)
    # Remove number of available docks that are higher than 200
    df = df[df['avail_docks'] <= 200]
    
    
    # Parse date column into datetime format
    df['date'] = pd.to_datetime(df['date'], format = '"%y-%m-%d"')

    
    # Convert numeric columns from strings to integers
    df['dock_id'] = df['dock_id'].astype(int)
    df['tot_docks'] = df['tot_docks'].astype(int)
    df['minute'] = df['minute'].astype(int)
    
    # Clean up latitude and longitude columns
    df['_lat'] = df['_lat'].apply(lambda x: float(re.sub('\"', "", str(x))))
    df['_long'] = df['_long'].apply(lambda x: re.sub('[^-^.0-9]', "", str(x))).apply(lambda x: re.sub("-{2}", "-", str(x)))
    df = df[df['_long'] != ""]
    df['_long'].astype(float)
    
    
    # Clean up hours column
    df['hour'] = df['hour'].apply(lambda x: re.sub('[^0-9]', "", str(x))).astype(int)
    
    # Convert hours to 24-hour time
    df['hour'].loc[df['pm'] == 1] = df['hour'].loc[df['pm'] == 1] + 12

    # Remove quotations from dock name
    df['dock_name'] = df['dock_name'].apply(lambda x: str(re.sub('\"', "", x)))


    # Create a dock status column
    df['dock_status'] = (df['avail_bikes']/df['tot_docks']).apply(lambda x: "Full Alert" if x > 7/10 else "Empty Alert" if x < 3/10 else "Healthy")


    # Drop unnecessary columns
    df.drop(['pm'], axis = 1, inplace = True)


    # Create new variables -> time, day of the week, and season
    
    # time variable
    df = df.assign(time = lambda x: x['hour'].astype(str) + ":" + x['minute'].astype(str))
    # day of the week variable
    
    df = df.assign(dayofweek = lambda x: x['date'].dt.weekday)
    
    df['dayofweek2'] = df['dayofweek'].apply(lambda x: 'Monday' if x == 0 else 'Tuesday' if x == 1\
    else 'Wednesday' if x == 2 else 'Thursday' if x == 3 else 'Friday' if x == 4 else 'Saturday' if x == 5 else\
    'Sunday')
    
    # season variable
    df = df.assign(\
        season = lambda x: x['date'].dt.month.apply(\
        lambda y: 'winter' if y <= 2 else 'spring' if y <= 5 else 'summer' if y <= 8 else 'fall' if y <= 11 else 'winter'))
    
    # Create a new csv file
    df.to_csv("../data/stations_cleaned.csv", index = False)

In [13]:
cleaning_stationdata(stations)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [4]:
stations_df.head(10)

Unnamed: 0,dock_id,dock_name,date,hour,minute,avail_bikes,avail_docks,tot_docks,_lat,_long,in_service,status_key,dock_status,time,dayofweek,season
0,72,W 52 St & 11 Ave,2015-03-01,1,6,8,31,39,40.767272,-73.993929,1,1,Empty Alert,1:6,6,spring
1,72,W 52 St & 11 Ave,2015-03-01,1,16,8,31,39,40.767272,-73.993929,1,1,Empty Alert,1:16,6,spring
2,72,W 52 St & 11 Ave,2015-03-01,1,25,8,30,38,40.767272,-73.993929,1,1,Empty Alert,1:25,6,spring
3,72,W 52 St & 11 Ave,2015-03-01,1,35,8,30,38,40.767272,-73.993929,1,1,Empty Alert,1:35,6,spring
4,72,W 52 St & 11 Ave,2015-03-01,1,45,8,30,38,40.767272,-73.993929,1,1,Empty Alert,1:45,6,spring
5,72,W 52 St & 11 Ave,2015-03-01,1,54,7,31,38,40.767272,-73.993929,1,1,Empty Alert,1:54,6,spring
6,72,W 52 St & 11 Ave,2015-03-01,2,2,8,31,39,40.767272,-73.993929,1,1,Empty Alert,2:2,6,spring
7,72,W 52 St & 11 Ave,2015-03-01,2,13,8,31,39,40.767272,-73.993929,1,1,Empty Alert,2:13,6,spring
8,72,W 52 St & 11 Ave,2015-03-01,2,22,8,31,39,40.767272,-73.993929,1,1,Empty Alert,2:22,6,spring
9,72,W 52 St & 11 Ave,2015-03-01,2,32,8,31,39,40.767272,-73.993929,1,1,Empty Alert,2:32,6,spring
