In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [None]:
# from pymongo import MongoClient
# import pandas as pd

# def fetch_data_from_mongo(db_name, collection_name, uri="mongodb://localhost:27017/"):
#     """
#     Connect to a local MongoDB instance, retrieve data from the specified collection,
#     and convert it to a DataFrame.

#     Parameters:
#         db_name (str): Name of the MongoDB database.
#         collection_name (str): Name of the collection within the database.
#         uri (str): MongoDB connection URI, default is "mongodb://localhost:27017/".

#     Returns:
#         DataFrame: Data from the collection in pandas DataFrame format, or None if no data is available.
#     """
#     # Connect to MongoDB
#     client = MongoClient(uri)
#     db = client[db_name]
#     collection = db[collection_name]

#     # Check if any data is available
#     if collection.count_documents({}) > 0:
#         print("Data Available")
#         # Read data from the collection
#         documents = collection.find()

#         # Convert to DataFrame
#         df = pd.DataFrame(documents)
#         print("Converted to DataFrame Successfully")
#         return df
#     else:
#         print("No Data Available")
#         return None

# # Example usage
# df = fetch_data_from_mongo(db_name="metro_data", collection_name="7_211ObjEvent")

# if df is not None:
#     print(df.head())  # Display the first few rows of the DataFrame


In [2]:
import pandas as pd

def load_and_process_data(path):

    # Load the data
    df = pd.read_csv(path)
    print("Data has been loaded successfully")

    # Ensure 'Dt' is in datetime format with timezone handling
    df['Dt'] = pd.to_datetime(df['Dt'], utc=True, errors='coerce')

    # Extract desired time components
    df['hour'] = df['Dt'].dt.hour
    df['day'] = df['Dt'].dt.day
    df['month'] = df['Dt'].dt.month
    df['year'] = df['Dt'].dt.year
    df['day_of_week'] = df['Dt'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

    print("Date converted successfully")
    return df
path = r"/content/drive/MyDrive/python_script/7_211_objEvent.csv"
df = load_and_process_data(path)



Data has been loaded successfully
Date converted successfully


In [3]:
import pandas as pd

def data_preprocessing(df):

    # Check for NaN values
    missing_data = df.isnull().sum()
    print("Missing values in each column before dropping nulls:\n", missing_data)

    # Drop rows with any NaN values
    df_cleaned = df.dropna()

    # Confirm removal of null values
    print("Data after dropping null values.")
    print("Missing values in each column after dropping nulls:\n", df_cleaned.isnull().sum())

    return df_cleaned

df_cleaned = data_preprocessing(df)

Missing values in each column before dropping nulls:
 BD             0
DSM            0
DSM2           0
Dt             0
ESN            0
EqN            0
EqT            0
EsT            0
Line           0
Lvl            0
Reference      0
St             0
Sta            0
Tag            0
_id            0
hour           0
day            0
month          0
year           0
day_of_week    0
is_weekend     0
dtype: int64
Data after dropping null values.
Missing values in each column after dropping nulls:
 BD             0
DSM            0
DSM2           0
Dt             0
ESN            0
EqN            0
EqT            0
EsT            0
Line           0
Lvl            0
Reference      0
St             0
Sta            0
Tag            0
_id            0
hour           0
day            0
month          0
year           0
day_of_week    0
is_weekend     0
dtype: int64


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from statsmodels.tsa.seasonal import seasonal_decompose

# 1. Function to load dataset in chunks with the given time format
def load_data_in_chunks(file_path, chunk_size=100000, time_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    chunks = pd.read_csv(file_path, chunksize=chunk_size, parse_dates=['Dt'], date_parser=lambda x: datetime.strptime(x, time_format))
    return chunks

# 2. Filter out data till yesterday
def filter_data_up_to_yesterday(df):
    yesterday = datetime.now() - timedelta(1)  # Yesterday's date
    return df[df['Dt'] < yesterday]

# 3. Group by 'Sta' and count maximum entries
def get_max_sta(df):
    sta_count = df.groupby('Sta').size()
    max_sta = sta_count.idxmax()  # Station with the max entries
    return max_sta, sta_count[max_sta]

# 4. Save CSV based on the station with maximum entries
def save_csv_for_max_sta(df, max_sta):
    max_sta_data = df[df['Sta'] == max_sta]
    file_name = f"{max_sta}.csv"
    max_sta_data.to_csv(file_name, index=False)
    return file_name

# 5. Find missing date ranges
def find_missing_dates(df, freq='D'):
    df_sorted = df.sort_values('Dt')
    full_range = pd.date_range(start=df_sorted['Dt'].min(), end=df_sorted['Dt'].max(), freq=freq)
    missing_dates = full_range.difference(df_sorted['Dt'])
    return missing_dates

# 6. Delete data before large missing date gap
def remove_data_before_large_gap(df, threshold=3):
    missing_dates = find_missing_dates(df)
    if len(missing_dates) > threshold:
        first_missing_date = missing_dates[0]
        df = df[df['Dt'] >= first_missing_date]
    return df

# 7. Handle small missing date gaps using seasonal decomposition
def fill_missing_using_seasonal_decomposition(df):
    df.set_index('Dt', inplace=True)
    result = seasonal_decompose(df['Sta'], model='additive', period=365)  # Assuming yearly seasonality
    df['Sta'] = df['Sta'].fillna(result.trend + result.seasonal)
    return df

# 8. Display first 10 entries of the saved CSV
def display_first_10_entries(file_name):
    df = pd.read_csv(file_name)
    return df.head(10)

# Main process to execute the steps
def process_ridership_data(file_path):
    # Load data in chunks
    chunks = load_data_in_chunks(file_path)

    # Combine all chunks into a single DataFrame
    df_combined = pd.concat(chunks, ignore_index=True)

    # Filter data to include only up to yesterday
    df_filtered = filter_data_up_to_yesterday(df_combined)

    # Get the station with the maximum entries
    max_sta, max_sta_count = get_max_sta(df_filtered)

    # Save CSV for the max sta
    file_name = save_csv_for_max_sta(df_filtered, max_sta)

    # Handle missing date ranges (Thresholds and seasonal decomposition)
    df_filtered = remove_data_before_large_gap(df_filtered, threshold=3)###################################################################increase threshold to 180
    df_filtered = fill_missing_using_seasonal_decomposition(df_filtered)

    # Display the first 10 entries of the saved CSV
    first_10_entries = display_first_10_entries(file_name)

    return first_10_entries

# Call the function with your file path
file_path = r'/content/drive/MyDrive/python_script/7_211_objEvent.csv'
first_10 = process_ridership_data(file_path)
print(first_10)


  chunks = pd.read_csv(file_path, chunksize=chunk_size, parse_dates=['Dt'], date_parser=lambda x: datetime.strptime(x, time_format))
