In [None]:
import io
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

### All files in a folder

In [None]:
# Directory holding the per-part input files to merge.
folder_path = '/home/jovyan/Data/2020OD/del_3h/IN'

# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted to make the row order of the concatenated frame reproducible.
# Sub-directories are skipped; every regular file is assumed to be a CSV.
file_list = sorted(
    f for f in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, f))
)

# Read each file into its own DataFrame.
df_list = [pd.read_csv(os.path.join(folder_path, file_name)) for file_name in file_list]

# Concatenate all DataFrames into a single DataFrame with a fresh 0..n-1 index.
combined_df = pd.concat(df_list, ignore_index=True)
combined_df

In [None]:
# Sum of the trip_count column over all rows currently in combined_df.
combined_df['trip_count'].sum()

In [None]:
# Normalise local_date to 'YYYYMMDD' strings.
# The values are cast to str before parsing because pd.to_datetime() treats
# bare integers as nanoseconds since the Unix epoch: an integer column holding
# 20190101 would otherwise silently become 1970-01-01. Both '20190101' and
# '2019-01-01' style strings parse to the intended calendar date.
combined_df['local_date'] = (
    pd.to_datetime(combined_df['local_date'].astype(str)).dt.strftime('%Y%m%d')
)
combined_df

### Select files

In [None]:
# First of the two hand-picked input tables that are merged below.
file1 = '/home/jovyan/Data/2020OD/del_3h/od_in_agg5_3h_1.csv'
df1 = pd.read_csv(file1)
# local_date keeps whatever dtype read_csv inferred; the explicit cast below is
# disabled here (the merged frame is cast to int in the "Check duplicates" step).
# df1['local_date'] = df1['local_date'].astype(int)
df1

In [None]:
# len(final_df.loc[final_df['local_date'] == 20190203])
# sub_set1 = df1.loc[(df1['local_date'] >= 20190601) & (df1['local_date'] <= 20190631)]
# sub_set1

In [None]:
# Second hand-picked input table.
file2 = '/home/jovyan/Data/TJ/3h/od_mx2_agg3_3h.csv'
df2 = pd.read_csv(file2)
# Force local_date to integers; raises if any value cannot be converted.
df2['local_date'] = df2['local_date'].astype(int)
df2

In [None]:
# Stack the two selected tables row-wise with a fresh 0..n-1 index.
# This rebinds combined_df, replacing the frame built in the
# "All file in a folder" section if that section was run.
combined_df = pd.concat([df1, df2], ignore_index=True)
combined_df

## Check duplicates

In [None]:
# Statistic columns that are stored as floats and rounded before comparison.
float_columns = ['m_duration_min', 'mdn_duration_min', 'sd_duration_min',
                 'm_length_m', 'mdn_length_m', 'sd_length_m',
                 'm_points_no', 'mdn_points_no', 'sd_points_no']

# One explicit dtype per column so that rows coming from different files
# compare equal when their values are equal.
# (For geohash3 data, use 'start_geohash3' / 'end_geohash3' as the key columns.)
column_dtypes = {
    'start_geohash5': str,
    'end_geohash5': str,
    'local_date': int,
    'trip_count': int,
    'local_time': str,
}
column_dtypes.update(dict.fromkeys(float_columns, float))

combined_df = (
    combined_df
    .astype(column_dtypes)
    # Round float columns to 6 decimal places
    .round(dict.fromkeys(float_columns, 6))
    # Drop rows that are identical in every column, keeping the first.
    .drop_duplicates()
    .sort_values('local_time')
    .reset_index(drop=True)
)
combined_df

## Check missing dates

In [None]:
# Distinct local_date values present in the data.
unique_values_count = combined_df["local_date"].unique()

# Every calendar day of 2019 encoded as an integer YYYYMMDD.
calendar_2019 = pd.date_range(start='2019-01-01', end='2019-12-31')
all_dates = np.array(calendar_2019.strftime('%Y%m%d').astype(int))

# Days of 2019 that do not appear in the data (sorted, unique).
missing_dates = np.setdiff1d(all_dates, unique_values_count)

missing_dates

## Check for hours per day (for 3h)

In [None]:
# Convert local_date to datetime objects, parsing the values as YYYYMMDD.
# From this point on the column holds datetimes rather than integers; the
# "Clean up and Export" section converts it back to int.
combined_df['local_date'] = pd.to_datetime(combined_df['local_date'], format='%Y%m%d')

def parse_start_hour(time_str):
    """Return the start hour (0-23) of a ``local_time`` window string.

    The value is expected to have the form ``'<start> - <end>'``, where
    ``<start>`` matches ``'%Y%m%d %H:%M:%S'``. Only the start part is parsed;
    the end part is ignored.

    Parameters
    ----------
    time_str : str
        Time-window string, e.g. ``'20190101 03:00:00 - 20190101 06:00:00'``.

    Returns
    -------
    int or None
        Hour of the start timestamp, or ``None`` when the value is not a
        string, does not consist of exactly two parts separated by ``' - '``,
        or its start part does not match the expected format.
    """
    # Missing values read from a CSV arrive as float NaN (or None); they have
    # no .split(), so treat them as unparseable instead of raising.
    if not isinstance(time_str, str):
        return None
    try:
        # Unpacking raises ValueError unless there are exactly two parts.
        start_time_str, _ = time_str.split(' - ')
        return datetime.strptime(start_time_str, '%Y%m%d %H:%M:%S').hour
    except ValueError:
        return None

# Start hour of every row's time window (missing where local_time cannot be parsed).
combined_df['start_hour'] = combined_df['local_time'].map(parse_start_hour)

# A complete day has one window starting every three hours: 0, 3, ..., 21.
required_hours = set(range(0, 24, 3))

# Map each date to the sorted list of required start hours it lacks;
# dates that have all of them are left out.
missing_hours = {}
for day, day_rows in combined_df.groupby('local_date'):
    absent_hours = required_hours.difference(day_rows['start_hour'])
    if absent_hours:
        missing_hours[day] = sorted(absent_hours)

# One row per incomplete date, for easier reading.
missing_hours_df = pd.DataFrame(list(missing_hours.items()), columns=['Date', 'Missing Hours'])

# Show the result.
print("Missing hours for each date:")
print(missing_hours_df)

## Check with figure

In [None]:
# Number of rows per calendar day, in chronological order.
daily_counts = combined_df['local_date'].value_counts().sort_index()

# One tick date every 7 days, counted from the first day present in the data.
start_date = daily_counts.index.min()
end_date = daily_counts.index.max()
custom_xticks = pd.date_range(start=start_date, end=end_date, freq='7D')

# Bar positions whose date coincides with one of the weekly tick dates.
weekly_positions = np.flatnonzero(daily_counts.index.isin(custom_xticks)).tolist()

# Draw the bar chart on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
daily_counts.plot(kind='bar', ax=ax)
ax.set_title('Number of Rows for Each Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Rows')
# Label every bar first, then thin the ticks down to the weekly positions.
ax.set_xticks(range(len(daily_counts)))
ax.set_xticklabels(daily_counts.index.strftime('%Y-%m-%d'), rotation=90)
ax.set_xticks(weekly_positions)
fig.tight_layout()
plt.show()

## Check Minimum and Maximum

In [None]:
# The 50 days with the highest row counts, largest first.
# NOTE(review): the variable name says 15 but nlargest(50) is requested —
# confirm which one is intended.
max_15_counts = daily_counts.nlargest(50)
max_15_counts

In [None]:
# The 15 days with the lowest row counts, smallest first.
# NOTE(review): the variable name says 5 but nsmallest(15) is requested —
# confirm which one is intended.
min_5_counts = daily_counts.nsmallest(15)
min_5_counts

## Check for a certain date

In [None]:
# Show all rows for one day (change the date as needed).
# local_date holds datetime values at this point in the notebook (it is
# converted in the "Check for hours per day" step), and comparing a datetime
# column with the integer 20190606 is False for every row. Compare against a
# Timestamp so the rows for 6 June 2019 are actually selected.
combined_df.loc[combined_df['local_date'] == pd.Timestamp('2019-06-06')]

## Clean up and Export

In [None]:
# Look at the table once more before the clean-up and export below.
combined_df

In [None]:
# Undo the temporary changes made for the checks above:
#   * remove the helper column start_hour,
#   * turn local_date back from datetime into an integer YYYYMMDD,
#   * order rows by their time-window string and renumber the index.
combined_df = (
    combined_df
    .drop(columns='start_hour')
    .assign(local_date=lambda frame: frame['local_date'].dt.strftime('%Y%m%d').astype(int))
    .sort_values('local_time')
    .reset_index(drop=True)
)
combined_df

In [None]:
# Save the combined DataFrame to a new CSV file (index column omitted).
# Note: this rebinds folder_path, which the "All file in a folder" section
# uses for its input directory.
folder_path = '/home/jovyan/Data/TJ/3h/od5_mx'
combined_df.to_csv(folder_path + '/od_mx_agg5_3h_final.csv', index=False)

## Get total number of trips

In [None]:
# Country code and year used to build the input file names in this section.
country_code = 'CO'
year = 2019 

# Combined weekly file with the `gh5` spatial tag; the country code is
# lower-cased to form the file name.
file_weekgh = f'/home/jovyan/Data/Week/combined/od_week_gh5_{country_code.lower()}_{year}_all.csv'
df_weekgh = pd.read_csv(file_weekgh)
# Total of the trip_count column in that file.
df_weekgh['trip_count'].sum()

In [None]:
# Combined weekly file with the `h37` spatial tag.
# Relies on country_code and year being set in the first cell of this section.
file_weekh3 = f'/home/jovyan/Data/Week/combined/od_week_h37_{country_code.lower()}_{year}_all.csv'
df_weekh3 = pd.read_csv(file_weekh3)
# Total of the trip_count column in that file.
df_weekh3['trip_count'].sum()

In [None]:
# Combined monthly file with the `h37` spatial tag.
# Relies on country_code and year being set in the first cell of this section.
file_monthh3 = f'/home/jovyan/Data/Month/combined/od_month_h37_{country_code.lower()}_{year}_all.csv'
df_monthh3 = pd.read_csv(file_monthh3)
# Total of the trip_count column in that file.
df_monthh3['trip_count'].sum()

In [None]:
# Combined monthly file with the `gh5` spatial tag.
# Relies on country_code and year being set in the first cell of this section.
file_monthgh5 = f'/home/jovyan/Data/Month/combined/od_month_gh5_{country_code.lower()}_{year}_all.csv'
df_monthgh5 = pd.read_csv(file_monthgh5)
# Total of the trip_count column in that file.
df_monthgh5['trip_count'].sum()

In [None]:
import os
import pandas as pd

In [None]:
# Root directory that is searched (recursively) for combined monthly CSV files.
base_path_month = '/home/jovyan/Data/Month/combined/'
# Accumulator for one summary record per file; filled by the loop in the next cell.
data_month = []

In [None]:
# Build one summary record per combined monthly CSV file.
# The list is reset here so that re-running this cell does not append a second
# copy of every record to the results of a previous run.
data_month = []

# Walk through the directory and only consider .csv files. Directory and file
# names are sorted because os.walk() yields them in arbitrary order; sorting
# `dirs` in place also fixes the order in which sub-directories are visited.
for root, dirs, files in os.walk(base_path_month):
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue

        # Construct the full file path
        file_path = os.path.join(root, file)

        # File names look like od_<temporal>_<spatial>_<country>_<year>_all.csv,
        # e.g. od_month_gh5_co_2019_all.csv. The parsed values get file_-prefixed
        # names so they do not overwrite the notebook-level `country_code` and
        # `year` that earlier cells use to build their own paths.
        name_parts = file.split('_')
        file_temporal = name_parts[1]  # e.g. month
        file_spatial = name_parts[2]   # e.g. h37 or gh5
        file_country = name_parts[3]   # e.g. co
        file_year = name_parts[4]      # kept as a string, e.g. '2019'

        # Only trip_count is needed, so the other columns are not loaded.
        df = pd.read_csv(file_path, usecols=['trip_count'])

        # Calculate the total trip count
        total_trip_count = df['trip_count'].sum()

        # Append the results to the list
        data_month.append({
            'file_name': file,
            'trip_count': total_trip_count,
            'file_path': file_path,
            'temporal_resolution': file_temporal,
            'spatial_resolution': file_spatial,
            'country_code': file_country,
            'year': file_year
        })

# Convert the list to a DataFrame
result_df_month = pd.DataFrame(data_month)
result_df_month

In [None]:
# Root directory that is searched (recursively) for combined weekly CSV files.
base_path_week = '/home/jovyan/Data/Week/combined/'
# Accumulator for one summary record per file; filled by the loop in the next cell.
data_week = []

In [None]:
# Build one summary record per combined weekly CSV file.
# The list is reset here so that re-running this cell does not append a second
# copy of every record to the results of a previous run.
data_week = []

# Walk through the directory and only consider .csv files. Directory and file
# names are sorted because os.walk() yields them in arbitrary order; sorting
# `dirs` in place also fixes the order in which sub-directories are visited.
for root, dirs, files in os.walk(base_path_week):
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.csv'):
            continue

        # Construct the full file path
        file_path = os.path.join(root, file)

        # File names look like od_<temporal>_<spatial>_<country>_<year>_all.csv,
        # e.g. od_week_gh5_co_2019_all.csv. The parsed values get file_-prefixed
        # names so they do not overwrite the notebook-level `country_code` and
        # `year` that earlier cells use to build their own paths.
        name_parts = file.split('_')
        file_temporal = name_parts[1]  # e.g. week
        file_spatial = name_parts[2]   # e.g. h37 or gh5
        file_country = name_parts[3]   # e.g. co
        file_year = name_parts[4]      # kept as a string, e.g. '2019'

        # Only trip_count is needed, so the other columns are not loaded.
        df = pd.read_csv(file_path, usecols=['trip_count'])

        # Calculate the total trip count
        total_trip_count = df['trip_count'].sum()

        # Append the results to the list
        data_week.append({
            'file_name': file,
            'trip_count': total_trip_count,
            'file_path': file_path,
            'temporal_resolution': file_temporal,
            'spatial_resolution': file_spatial,
            'country_code': file_country,
            'year': file_year
        })

# Convert the list to a DataFrame
result_df_week = pd.DataFrame(data_week)
result_df_week

In [None]:
# Stack the weekly and monthly per-file summaries into one table.
# NOTE(review): this rebinds combined_df, which the earlier sections use for
# the OD trip table; consider a distinct name for the summary.
combined_df = pd.concat([result_df_week, result_df_month], ignore_index=True)
combined_df

In [None]:
# Write the per-file trip totals to CSV without the index column.
combined_df.to_csv('/home/jovyan/Netmob24_datachallenge/Code/OD_location_wm.csv', index=False)