In [4]:
import json
import os
import time

import numpy as np
import pandas as pd

In [5]:
def from_jsonl_to_df(path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of being handed to the JSON decoder.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file, built by passing the list of
        decoded documents to the ``DataFrame`` constructor.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    t_start = time.perf_counter()
    # JSON is UTF-8 text; being explicit avoids depending on the platform's
    # default encoding when the data contains non-ASCII characters.
    with open(path, 'r', encoding='utf-8') as json_file:
        # Decode while iterating so the raw lines are never all held in
        # memory next to the parsed objects.
        records = [json.loads(line) for line in json_file if line.strip()]
    df = pd.DataFrame(records)

    print("DataFrame of {} rows loaded in {:.2f} sec".format(len(df), time.perf_counter() - t_start))
    return df

def from_csv_to_df(path):
    """Load a CSV file into a DataFrame.

    A file whose *name* contains ``"followers"`` is read as header-less and
    given the fixed column names ``follower``, ``person followed`` and
    ``timestamp``. Every other file is read with ``pd.read_csv`` defaults,
    i.e. its first row is used as the header.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.csv`` file.

    Returns
    -------
    pandas.DataFrame
        The parsed file content.
    """
    t_start = time.perf_counter()
    # Look at the file name only: testing the whole path would also match a
    # parent directory containing "followers" and silently turn the header
    # row of an unrelated CSV into data. os.fspath lets pathlib.Path
    # arguments through as well.
    file_name = os.path.basename(os.fspath(path))
    if "followers" in file_name:
        df = pd.read_csv(path, names=["follower", "person followed", "timestamp"], header=None)
    else:
        df = pd.read_csv(path)
    print("DataFrame of {} rows loaded in {:.2f} sec".format(len(df), time.perf_counter() - t_start))
    return df

In [6]:
# Load the keaks JSON Lines export into a DataFrame (prints row count and load time).
df_keaks = from_jsonl_to_df("data/full/keaks.jsonl")

DataFrame of 274472 rows loaded in 4.91 sec


In [7]:
# Load the beats JSON Lines export into a DataFrame (prints row count and load time).
df_beats = from_jsonl_to_df("data/full/beats.jsonl")

DataFrame of 59761 rows loaded in 1.31 sec


In [8]:
# Load the users JSON Lines export; in the recorded run this was the slowest
# of the JSONL loads (about 1.16M rows).
df_users = from_jsonl_to_df("data/full/users.jsonl")

DataFrame of 1162572 rows loaded in 34.25 sec


In [9]:
# "followers" in the path makes from_csv_to_df read this file as header-less
# and apply the column names follower / person followed / timestamp.
df_followers = from_csv_to_df("data/full/followers.csv")

DataFrame of 10506442 rows loaded in 13.36 sec


In [10]:
# Read through the default pd.read_csv branch of from_csv_to_df, so the
# file's first row supplies the column names.
df_audiences = from_csv_to_df("data/full/audiences.csv")

DataFrame of 28952275 rows loaded in 35.94 sec


In [11]:
# Preview the first two keaks rows; the column names appear in the table header.
df_keaks.head(2)

Unnamed: 0,keakId,createdAt,likeCount,commentCount,viewCount,averageViewProgress,duration,hashtags,contentType,hasSmallThumbnail,lien
0,17301813623064450175,2019-10-10T21:20:59.2970322Z,4,2,42,7.8,107.0,"[91, 1, Rap, Pen, freestyle2019, Trap2K19]",freestyle,True,https://www.keakr.com/fr/keak/mon-mec-rap-1
1,6202649352,2018-04-02T15:06:34.9851924Z,2,2,53,0.0,68.0,[],freestyle,True,https://www.keakr.com/fr/keak/petit-salaire


In [12]:
# Preview the first two rows of the audiences table.
df_audiences.head(2)

Unnamed: 0,userId,contentId,timestamp,progress,liked,commented,shared
0,users/6512051967,keaks/17301813623657913783,2020-01-01T00:00:00.4081666,0,True,True,True
1,users/17301813623701852659,keaks/17301813623464753700,2020-01-01T00:00:00.8245932,0,False,False,False


In [13]:
# Preview the first two rows of the followers table (column names were assigned at load time).
df_followers.head(2)

Unnamed: 0,follower,person followed,timestamp
0,users/17301813624860662494,users/17301813625195354624,2020-06-13T16:53:19.7077441Z
1,users/11928492392,users/12680019689,2019-04-11T22:23:51.9010202Z


In [14]:
# Preview the first two rows of the beats table.
df_beats.head(2)

Unnamed: 0,beatId,genres,moods,nbKeaks,nbLikes,beatmakerId,duration,bpm,createdAt,updatedAt,link,licenceType
0,17301813628927249101,"[{'id': '9920897543', 'name': 'Trap'}]","[{'id': '17301813622132424287', 'name': 'Dark'...",7,4,17301813625134492069,121.0,102.0,2021-09-20T18:16:47.1420645Z,2021-09-20T18:17:00.2684235Z,https://keakr.com/fr/beat/turquoiz,[free]
1,17301813627622625982,"[{'id': '9920897543', 'name': 'Trap'}]",[],1,5,17301813627622569830,163.0,125.0,2021-03-31T15:20:10.726762Z,2021-03-31T15:20:26.5760601Z,https://keakr.com/fr/beat/moula-i,[free]


In [15]:
# Preview the first two rows of the users table; the rendered output elides
# the middle columns with "...".
df_users.head(2)

Unnamed: 0,userId,createdAt,lastConnection,usedGenres,listenedGenres,battleCreatedCount,battleLostCount,battleRespondedCount,battleWonCount,friendCount,...,mutualFollowCount,overallBeatUsage,PlaylistCount,prizeMoneyParticipationCount,prizeMoneyWinner,sessionCount,shareCount,viewCount,isBeatmaker,isSinger
0,12354401148,2019-03-02T10:23:33.0903001Z,2019-03-02T10:23:44.3283191Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,2.0,,0,False,False
1,12354411487,2019-03-02T10:26:23.9016307Z,2020-09-24T06:16:46.8631984Z,[],[],0,0,0,0,0,...,,0.0,,0,False,8.0,,0,False,False


In [17]:
# head() truncates wide frames in its rendering, so list every column name explicitly.
list(df_users.columns)

['userId',
 'createdAt',
 'lastConnection',
 'usedGenres',
 'listenedGenres',
 'battleCreatedCount',
 'battleLostCount',
 'battleRespondedCount',
 'battleWonCount',
 'friendCount',
 'keakCount',
 'keakrCoinGiven',
 'keakCoinReceived',
 'likeCount',
 'likeGivenCount',
 'mutualFollowCount',
 'overallBeatUsage',
 'PlaylistCount',
 'prizeMoneyParticipationCount',
 'prizeMoneyWinner',
 'sessionCount',
 'shareCount',
 'viewCount',
 'isBeatmaker',
 'isSinger']