In [1]:
import logging
import time
import uuid
from datetime import datetime
from glob import glob
from itertools import islice
from multiprocessing import Pool
from os import cpu_count, makedirs
from os.path import basename, splitext

import pandas as pd


# Lower the root logger's threshold to INFO so the logging.info() progress
# messages issued by create_df are not filtered out.
logging.getLogger().setLevel(logging.INFO)


def current_time(p: bool = True):
    """Return the current local time as a formatted string.

    Args:
        p: When true (the default) the result uses the readable layout
            ``YYYY-MM-DD HH:MM:SS``; when false it uses the compact,
            separator-free layout ``YYYYMMDDHHMMSS``.

    Returns:
        str: ``datetime.now()`` rendered with the selected layout.
    """
    layout = "%Y-%m-%d %H:%M:%S" if p else "%Y%m%d%H%M%S"
    return datetime.now().strftime(layout)


def splitDict(d, num_of_cpus):
    """Split a dict into ``num_of_cpus`` consecutive chunks of near-equal size.

    Every item of ``d`` ends up in exactly one chunk and insertion order is
    preserved across the chunks. When ``len(d)`` is not a multiple of
    ``num_of_cpus`` the leading chunks receive one extra item each, so chunk
    sizes differ by at most one. When there are fewer items than chunks the
    trailing chunks are empty dicts.

    Args:
        d: Mapping to split.
        num_of_cpus: Number of chunks to produce; must be at least 1.

    Returns:
        list[dict]: Exactly ``num_of_cpus`` dicts.

    Raises:
        ValueError: If ``num_of_cpus`` is smaller than 1.
    """
    if num_of_cpus < 1:
        raise ValueError(f"num_of_cpus must be >= 1, got {num_of_cpus}")

    base_size, leftover = divmod(len(d), num_of_cpus)
    remaining = iter(d.items())

    chunks = []
    for chunk_no in range(num_of_cpus):
        # Spread the remainder over the first `leftover` chunks instead of
        # dropping it, which plain floor division would do.
        size = base_size + 1 if chunk_no < leftover else base_size
        chunks.append(dict(islice(remaining, size)))

    return chunks


def proc_df(users: dict):
    """Flatten a nested sender/timestamp/recipient mapping into a DataFrame.

    ``users`` has the shape ``{sender: {timestamp: {recipient: location}}}``
    where ``timestamp`` is a string in ``%Y-%m-%d %H:%M:%S`` form. For every
    sender and timestamp, the recipients are grouped by location and one row
    is produced per distinct location.

    Args:
        users: Nested mapping as described above.

    Returns:
        pandas.DataFrame: Columns ``timestamp`` (the original string),
        ``unix-timestamp`` (integer epoch seconds), ``hour``, ``from``
        (the sender key), ``to`` (list of recipients), ``to_num`` (length of
        ``to``) and ``location``. An empty input yields an empty frame that
        still has these columns.

    Raises:
        ValueError: If a timestamp key does not match the expected format.
    """
    columns = ['timestamp', 'unix-timestamp', 'hour', 'from', 'to', 'to_num', 'location']

    # Collect plain rows and build the frame once at the end; appending with
    # `.loc[len(df)]` re-allocates the frame on every row.
    rows = []

    for uid, by_timestamp in users.items():
        for ts, recipients in by_timestamp.items():
            t = datetime.strptime(ts, "%Y-%m-%d %H:%M:%S")

            # NOTE: mktime interprets the naive datetime in the machine's
            # local timezone, so this value depends on where the code runs.
            unix_time = int(time.mktime(t.timetuple()))

            # Invert {recipient: location} into {location: [recipients]}.
            by_location = {}
            for recipient, loc in recipients.items():
                by_location.setdefault(loc, []).append(recipient)

            for loc, to in by_location.items():
                rows.append([ts, unix_time, t.hour, uid, to, len(to), loc])

    return pd.DataFrame(rows, columns=columns)


def create_df(file: str):
    """Build the per-message DataFrame for one CSV file and write it to disk.

    The CSV is read with its first column as the index (used as the message
    timestamp). The next three columns are taken by position and treated as
    sender, recipient and location. Rows are grouped into
    ``{sender: {timestamp: {recipient: location}}}``; a repeated
    (sender, timestamp, recipient) keeps the last location seen. The grouped
    data is split across worker processes, each running ``proc_df``, and the
    partial frames are combined and sorted by ``unix-timestamp``.

    Side effects: creates ``./outputs`` if needed and writes the result there
    as ``<input file name>`` (CSV) and ``<input stem>.json``. Start and finish
    times are logged at INFO level.

    Args:
        file: Path to the input CSV file.

    Returns:
        pandas.DataFrame: The combined, sorted frame with a fresh 0..n-1
        index. If a worker fails, its error is logged and its rows are
        missing from the result (best effort).
    """
    logging.info(f"[{file}] started at: {current_time()}")

    columns = ['timestamp', 'unix-timestamp', 'hour', 'from', 'to', 'to_num', 'location']
    raw = pd.read_csv(file, index_col=0)

    # Group as {sender: {timestamp: {recipient: location}}}. Columns are
    # addressed by position via .iloc, because integer keys on a
    # label-indexed row Series are deprecated in pandas.
    users = dict()
    senders, recipients, locations = (raw.iloc[:, pos] for pos in range(3))
    for ts, msg_from, msg_to, loc in zip(raw.index, senders, recipients, locations):
        users.setdefault(str(msg_from), {}).setdefault(ts, {})[msg_to] = loc

    # Leave one core free for the notebook itself, but never request fewer
    # than one worker (cpu_count() may be 1, or None when undeterminable).
    num_of_cpus = max(1, (cpu_count() or 1) - 1)
    users_list = splitDict(users, num_of_cpus)

    frames = []
    with Pool(processes=num_of_cpus) as pool:
        workers = [pool.apply_async(proc_df, (user_item, )) for user_item in users_list]

        for w in workers:
            try:
                frames.append(w.get())
            except Exception:
                # Best effort: keep the other chunks, but record the full
                # traceback so the missing rows can be explained.
                logging.exception(f"[{file}] worker failed; its rows are missing from the result")

        pool.close()
        pool.join()

    # Concatenate once instead of inside the loop, and skip empty chunks so
    # they do not influence the resulting dtypes.
    frames = [frame for frame in frames if not frame.empty]
    messages = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    messages = messages.sort_values(by=['unix-timestamp'])
    messages = messages.reset_index(drop=True)

    # Write the outputs. Only the extension is swapped for the JSON name, so
    # a "csv" elsewhere in the file name is left alone and a non-csv input
    # no longer makes the JSON file overwrite the CSV file.
    makedirs("./outputs", exist_ok=True)
    stem, _ = splitext(basename(file))
    messages.to_csv(f"./outputs/{basename(file)}")
    messages.to_json(f"./outputs/{stem}.json")

    logging.info(f"[{file}] completed at: {current_time()}")

    return messages

In [2]:
# Build one message frame per input file (Fri, Sat, Sun). Each create_df call
# also writes a CSV and a JSON copy of its result under ./outputs.
fri = create_df("./dataset/mc2_2015_data/comm-data-Fri.csv")
sat = create_df("./dataset/mc2_2015_data/comm-data-Sat.csv")
sun = create_df("./dataset/mc2_2015_data/comm-data-Sun.csv")

INFO:root:[./dataset/mc2_2015_data/comm-data-Fri.csv] started at: 2022-12-10 22:31:50
INFO:root:[./dataset/mc2_2015_data/comm-data-Fri.csv] completed at: 2022-12-10 22:32:47
INFO:root:[./dataset/mc2_2015_data/comm-data-Sat.csv] started at: 2022-12-10 22:32:47
INFO:root:[./dataset/mc2_2015_data/comm-data-Sat.csv] completed at: 2022-12-10 22:34:32
INFO:root:[./dataset/mc2_2015_data/comm-data-Sun.csv] started at: 2022-12-10 22:34:32
INFO:root:[./dataset/mc2_2015_data/comm-data-Sun.csv] completed at: 2022-12-10 22:36:35


In [3]:
# Preview the first 20 rows of the frame built from the Fri file.
fri.head(20)

Unnamed: 0,timestamp,unix-timestamp,hour,from,to,to_num,location
0,2014-6-06 08:03:19,1402056199,8,439105,"[1053224, 1696241, 580064, 1464748]",4,Kiddie Land
1,2014-6-06 08:03:47,1402056227,8,1836139,[1593258],1,Entry Corridor
2,2014-6-06 08:04:06,1402056246,8,1464748,"[439105, 1053224, 1696241, 580064]",4,Kiddie Land
3,2014-6-06 08:04:22,1402056262,8,580064,"[439105, 1053224, 1696241, 1464748]",4,Kiddie Land
4,2014-6-06 08:04:33,1402056273,8,1053224,"[439105, 1696241, 580064, 1464748]",4,Kiddie Land
5,2014-6-06 08:05:05,1402056305,8,1593258,[1836139],1,Entry Corridor
6,2014-6-06 08:05:11,1402056311,8,612957,[160360],1,Tundra Land
7,2014-6-06 08:05:18,1402056318,8,494296,[external],1,Kiddie Land
8,2014-6-06 08:05:27,1402056327,8,1053224,"[580064, 439105, 1696241, 1464748]",4,Kiddie Land
9,2014-6-06 08:05:46,1402056346,8,1413244,[839736],1,Tundra Land
