In [11]:
import glob
import os

import pandas as pd

In [12]:
# Glob pattern selecting every .csv file under ../Raw_Files (a relative path,
# so it is resolved against the current working directory).
path_pattern = '../Raw_Files/*.csv'

In [13]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# makes "the first file" and the row order of the combined frame reproducible
# from one run (or machine) to the next.
csv_files = sorted(glob.glob(path_pattern))

In [14]:
# Load one file on its own to inspect the schema before processing all of them.
# Fail with an explicit message when the glob matched nothing, instead of
# passing None to pd.read_csv and getting an unrelated-looking error.
if not csv_files:
    raise FileNotFoundError(
        "No CSV files were found; check the glob pattern and the working directory."
    )
first_file = csv_files[0]
# low_memory=False makes pandas parse each column in one pass rather than in chunks.
df_first = pd.read_csv(first_file, low_memory=False)
df_first.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,58F2CA262B50E256,classic_bike,2024-01-25 20:39:09,2024-01-25 20:44:07,Broadway & E 14 St,5905.12,Ave A & E 11 St,5703.13,40.734546,-73.990741,40.728547,-73.981759,member
1,AA7AB6D6E9F8D21B,classic_bike,2024-01-15 18:44:36,2024-01-15 19:19:46,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
2,1830A6C4BA1E1A9D,classic_bike,2024-01-03 19:27:58,2024-01-03 19:58:42,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
3,3995B084A51A1038,classic_bike,2024-01-22 18:29:46,2024-01-22 18:59:57,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
4,23EE6A8979C333B1,classic_bike,2024-01-27 09:55:39,2024-01-27 10:00:48,E 16 St & Irving Pl,5938.11,E 14 St & 1 Ave,5779.1,40.735367,-73.987974,40.731393,-73.982867,member


In [15]:
# Print the column dtypes, non-null counts and memory usage of the sample file.
df_first.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   ride_id             1000000 non-null  object 
 1   rideable_type       1000000 non-null  object 
 2   started_at          1000000 non-null  object 
 3   ended_at            1000000 non-null  object 
 4   start_station_name  999606 non-null   object 
 5   start_station_id    999606 non-null   object 
 6   end_station_name    1000000 non-null  object 
 7   end_station_id      1000000 non-null  object 
 8   start_lat           1000000 non-null  float64
 9   start_lng           1000000 non-null  float64
 10  end_lat             1000000 non-null  float64
 11  end_lng             1000000 non-null  float64
 12  member_casual       1000000 non-null  object 
dtypes: float64(4), object(9)
memory usage: 99.2+ MB


In [16]:
# Accumulates one cleaned DataFrame per input CSV file.
cleaned_dfs = []

In [17]:
def _ids_as_text(ids):
    """Return a copy of ``ids`` with every present value cast to ``str``.

    A plain ``Series.astype(str)`` also converts missing values into the
    literal text ``'nan'``, after which ``dropna`` no longer treats them as
    missing. Here missing entries are left as they are.
    """
    as_text = ids.astype(object)  # new Series; the caller's column is not modified
    present = ids.notna()
    as_text[present] = ids[present].astype(str)
    return as_text


def _load_cleaned_rides(csv_path):
    """Read one CSV file and return its cleaned DataFrame.

    Steps:
      1. Parse 'started_at' and 'ended_at' as datetimes; values that cannot be
         parsed become NaT (errors='coerce') and the row is kept.
      2. Drop rows whose 'start_station_name' or 'start_station_id' is missing.
         This must happen before the IDs are converted to text.
      3. Convert both station-ID columns to strings, keeping missing
         'end_station_id' values missing rather than the text 'nan'.
    """
    rides = pd.read_csv(csv_path, low_memory=False)

    rides['started_at'] = pd.to_datetime(rides['started_at'], errors='coerce')
    rides['ended_at'] = pd.to_datetime(rides['ended_at'], errors='coerce')

    rides = rides.dropna(subset=['start_station_name', 'start_station_id'])

    # assign() builds a new frame, so no chained-assignment warning is raised
    # on the result of dropna; existing columns keep their positions.
    return rides.assign(
        start_station_id=_ids_as_text(rides['start_station_id']),
        end_station_id=_ids_as_text(rides['end_station_id']),
    )


# Start from an empty list every time so that re-running this cell cannot
# append a second copy of every file to the combined frame.
cleaned_dfs = []
for file in csv_files:
    df = _load_cleaned_rides(file)
    cleaned_dfs.append(df)

# Concatenate all the cleaned dataframes into one frame with a fresh 0..n-1 index
master_df = pd.concat(cleaned_dfs, ignore_index=True)

In [18]:
# Preview the first rows of the concatenated frame.
master_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,58F2CA262B50E256,classic_bike,2024-01-25 20:39:09,2024-01-25 20:44:07,Broadway & E 14 St,5905.12,Ave A & E 11 St,5703.13,40.734546,-73.990741,40.728547,-73.981759,member
1,AA7AB6D6E9F8D21B,classic_bike,2024-01-15 18:44:36,2024-01-15 19:19:46,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
2,1830A6C4BA1E1A9D,classic_bike,2024-01-03 19:27:58,2024-01-03 19:58:42,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
3,3995B084A51A1038,classic_bike,2024-01-22 18:29:46,2024-01-22 18:59:57,E 16 St & Irving Pl,5938.11,Clermont Ave & Park Ave,4692.01,40.735367,-73.987974,40.695734,-73.971297,member
4,23EE6A8979C333B1,classic_bike,2024-01-27 09:55:39,2024-01-27 10:00:48,E 16 St & Irving Pl,5938.11,E 14 St & 1 Ave,5779.1,40.735367,-73.987974,40.731393,-73.982867,member


In [19]:
# (rows, columns) of the concatenated frame.
master_df.shape

(6862721, 13)

In [20]:
# Print the column dtypes and memory usage of the concatenated frame.
master_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6862721 entries, 0 to 6862720
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
dtypes: datetime64[ns](2), float64(4), object(7)
memory usage: 680.7+ MB


In [21]:
# Destination of the combined, cleaned data set.
master_csv_path = '../Cleaned_Files/csw1nk_master.csv'

# to_csv does not create missing parent directories, so make sure the output
# folder exists first (a no-op when it is already there).
os.makedirs(os.path.dirname(master_csv_path), exist_ok=True)

# index=False: the RangeIndex carries no information, so it is not written out.
master_df.to_csv(master_csv_path, index=False)

print(f"Master CSV created at {master_csv_path}")

Master CSV created at ../Cleaned_Files/csw1nk_master.csv
