In [1]:
import io
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [5]:
# Define the path pattern to match all CO files in the directory
path_pattern = '/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_*.csv_kanonnimized.csv'
# CO, IN, ID

file_list = glob.glob(path_pattern)
file_list

['/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_nov.csv_kanonnimized.csv',
 '/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_dec.csv_kanonnimized.csv']

In [9]:
path_pattern = '/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_*.csv'
file_list = glob.glob(path_pattern)
file_sub_list = [path for path in file_list if not path.endswith('_kanonnimized.csv')]
file_sub_list

['/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_nov.csv',
 '/home/jovyan/Data/daily_pd5/CO_daily_agg5_hour_dec.csv']

In [11]:
# Initialize an empty list to hold DataFrames
dataframes = []

# Loop through the list of files and read each file into a DataFrame
for file in file_list:
    df = pd.read_csv(file)
    dataframes.append(df)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,grt,day,month
0,d3gpz,593,105,0,1,11
1,d2g4x,85,16,0,1,11
2,d35vr,306,34,0,1,11
3,d2g6b,161,37,0,1,11
4,d347c,44,14,0,1,11
...,...,...,...,...,...,...
1781550,d2fbh,49,14,7,31,12
1781551,d2fjg,93,12,7,31,12
1781552,d3469,49,10,7,31,12
1781553,d3d7u,187,15,7,31,12


In [12]:
combined_df['no_of_points'].min()

1

In [13]:
combined_df['local_date'] = pd.to_datetime(combined_df[['month', 'day']].assign(year=2019).astype(str).apply('-'.join, 1)).dt.strftime('%Y%m%d').astype(int)
combined_df = combined_df.sort_values(by='local_date')
filtered_df = combined_df[combined_df['local_date'] > 20191031] # Only for CO

combined_df

Unnamed: 0,geohash_5,no_of_points,no_of_unique_users,grt,day,month,local_date
0,d3gpz,593,105,0,1,11,20191101
497626,d340r,6,2,4,1,11,20191101
497625,d36jt,1,1,4,1,11,20191101
497624,d6hv8,7,3,4,1,11,20191101
497623,d20vh,3,2,4,1,11,20191101
...,...,...,...,...,...,...,...
1500358,d23yq,11,2,5,31,12,20191231
1500359,d2fh8,1,1,5,31,12,20191231
1500360,d29u1,7,3,5,31,12,20191231
1500362,d2fmp,1,1,5,31,12,20191231


# Agg by day

In [15]:
pd_agg_day = combined_df.groupby(['geohash_5', 'local_date']).agg({
    'no_of_unique_users': 'sum',
    'no_of_points': 'sum'
}).reset_index()
pd_agg_day = pd_agg_day[pd_agg_day['no_of_unique_users'] >= 10]
pd_agg_day

Unnamed: 0,geohash_5,local_date,no_of_unique_users,no_of_points
20,6rfyf,20191101,147,1635
21,6rfyf,20191102,149,1759
22,6rfyf,20191103,115,1243
23,6rfyf,20191104,131,1663
24,6rfyf,20191105,113,1263
...,...,...,...,...
354985,d9052,20191108,11,39
354991,d9052,20191114,10,40
355013,d9052,20191206,11,46
355015,d9052,20191208,11,41


In [18]:
pd_agg_day[pd_agg_day['geohash_5'] == '6rfyf']

Unnamed: 0,geohash_5,local_date,no_of_unique_users,no_of_points
20,6rfyf,20191101,147,1635
21,6rfyf,20191102,149,1759
22,6rfyf,20191103,115,1243
23,6rfyf,20191104,131,1663
24,6rfyf,20191105,113,1263
...,...,...,...,...
76,6rfyf,20191227,40,376
77,6rfyf,20191228,36,366
78,6rfyf,20191229,31,317
79,6rfyf,20191230,39,448


In [19]:
# Save the combined DataFrame to a new CSV file
folder_path = '/home/jovyan/Data/daily_pd5/Cleaned/'
pd_agg_day.to_csv(folder_path + 'pd_co_2019_agg5.csv', index=False)

In [6]:
filtered_df = filtered_df.drop(columns=['grt', 'day', 'month'])
filtered_df

Unnamed: 0,geohash5,no_of_points,no_of_unique_users,local_date
0,d3h0k,148,22,20191101
33200,d2f4e,89,11,20191101
33199,d2d2q,43,11,20191101
33198,d2gk4,68,21,20191101
33197,d34ks,33,11,20191101
...,...,...,...,...
160480,d2g4w,39,18,20191231
160481,d3f7d,46,12,20191231
160482,d2f5h,108,22,20191231
160484,d35hf,92,15,20191231


In [10]:
filtered_df[filtered_df['geohash5'] == 'd2g4w']

Unnamed: 0,geohash5,no_of_points,no_of_unique_users,local_date
50728,d2g4w,100,36,20191101
16115,d2g4w,109,56,20191101
7938,d2g4w,52,17,20191101
32796,d2g4w,105,35,20191101
90205,d2g4w,107,43,20191101
...,...,...,...,...
174925,d2g4w,52,25,20191231
191392,d2g4w,50,22,20191231
146109,d2g4w,38,22,20191231
206656,d2g4w,22,12,20191231


In [7]:
# Save the combined DataFrame to a new CSV file
folder_path = '/home/jovyan/Data/daily_pd5/Cleaned/'
filtered_df.to_csv(folder_path + 'pd_CO_2019_agg5_daily.csv', index=False)