In [8]:
import json
import numpy as np
import pandas as pd
import time

In [9]:
def from_jsonl_to_df(path):
    t_start = time.time()
    with open(path, 'r') as json_file:
        json_list = list(json_file)
    json_list = [json.loads(json_str) for json_str in json_list]
    df = pd.DataFrame.from_dict(json_list, orient='columns')
    
    print("DataFrame of {}".format(len(df))+" rows loaded in {:.2f} sec".format(time.time()-t_start))
    return df

def from_csv_to_df(path):
    t_start=time.time()
    if "followers" in path:
        df=pd.read_csv(path, names=["follower", "person followed", "timestamp"], header=None)
    else:
        df=pd.read_csv(path)
    print("DataFrame of {}".format(len(df))+" rows loaded in {:.2f} sec".format(time.time()-t_start))
    return df

In [10]:
df_keaks = from_jsonl_to_df("data/full/keaks.jsonl")

DataFrame of 274472 rows loaded in 5.10 sec


In [11]:
df_beats =from_jsonl_to_df("data/full/beats.jsonl")

DataFrame of 59761 rows loaded in 1.79 sec


In [12]:
df_users =from_jsonl_to_df("data/full/users.jsonl")

DataFrame of 1162572 rows loaded in 36.61 sec


In [13]:
df_followers = from_csv_to_df("data/full/followers.csv")

DataFrame of 10506442 rows loaded in 13.61 sec


In [14]:
df_audiences = from_csv_to_df("data/full/audiences.csv")

DataFrame of 28952275 rows loaded in 38.06 sec


In [8]:
df_keaks.head(2)

Unnamed: 0,keakId,createdAt,likeCount,commentCount,viewCount,averageViewProgress,duration,hashtags,contentType,hasSmallThumbnail,lien
0,17301813623064450175,2019-10-10T21:20:59.2970322Z,4,2,42,7.8,107.0,"[91, 1, Rap, Pen, freestyle2019, Trap2K19]",freestyle,True,https://www.keakr.com/fr/keak/mon-mec-rap-1
1,6202649352,2018-04-02T15:06:34.9851924Z,2,2,53,0.0,68.0,[],freestyle,True,https://www.keakr.com/fr/keak/petit-salaire


In [9]:
df_audiences.head(2)

Unnamed: 0,userId,contentId,timestamp,progress,liked,commented,shared
0,users/6512051967,keaks/17301813623657913783,2020-01-01T00:00:00.4081666,0,True,True,True
1,users/17301813623701852659,keaks/17301813623464753700,2020-01-01T00:00:00.8245932,0,False,False,False


In [10]:
df_followers.head(2)

Unnamed: 0,follower,person followed,timestamp
0,users/17301813624860662494,users/17301813625195354624,2020-06-13T16:53:19.7077441Z
1,users/11928492392,users/12680019689,2019-04-11T22:23:51.9010202Z


In [11]:
df_beats.head(2)

Unnamed: 0,beatId,genres,moods,nbKeaks,nbLikes,beatmakerId,duration,bpm,createdAt,updatedAt,link,licenceType
0,17301813628927249101,"[{'id': '9920897543', 'name': 'Trap'}]","[{'id': '17301813622132424287', 'name': 'Dark'...",7,4,17301813625134492069,121.0,102.0,2021-09-20T18:16:47.1420645Z,2021-09-20T18:17:00.2684235Z,https://keakr.com/fr/beat/turquoiz,[free]
1,17301813627622625982,"[{'id': '9920897543', 'name': 'Trap'}]",[],1,5,17301813627622569830,163.0,125.0,2021-03-31T15:20:10.726762Z,2021-03-31T15:20:26.5760601Z,https://keakr.com/fr/beat/moula-i,[free]


In [12]:
df_users.head(10)

Unnamed: 0,userId,createdAt,lastConnection,usedGenres,listenedGenres,battleCreatedCount,battleLostCount,battleRespondedCount,battleWonCount,friendCount,...,mutualFollowCount,overallBeatUsage,PlaylistCount,prizeMoneyParticipationCount,prizeMoneyWinner,sessionCount,shareCount,viewCount,isBeatmaker,isSinger
0,12354401148,2019-03-02T10:23:33.0903001Z,2019-03-02T10:23:44.3283191Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,2.0,,0,False,False
1,12354411487,2019-03-02T10:26:23.9016307Z,2020-09-24T06:16:46.8631984Z,[],[],0,0,0,0,0,...,,0.0,,0,False,8.0,,0,False,False
2,12354483185,2019-03-02T10:28:49.5056384Z,2019-03-04T17:39:14.6525393Z,[],[],0,0,0,0,0,...,,0.0,,0,False,3.0,,0,False,False
3,12354483249,2019-03-02T10:28:51.4548726Z,2019-03-03T12:52:52.7709452Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,4.0,,0,False,False
4,12354656710,2019-03-02T10:35:40.0733875Z,2020-05-05T09:33:54.9408989Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,20.0,,0,False,False
5,12354719690,2019-03-02T10:38:49.0256284Z,2019-03-15T17:02:08.5038717Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,3.0,,0,False,False
6,12354776855,2019-03-02T10:41:49.8917913Z,2019-03-02T10:41:49.8917913Z,[],[],0,0,0,0,0,...,0.0,0.0,,0,False,7.0,,0,False,False
7,12354798692,2019-03-02T10:44:13.0838926Z,2019-03-02T10:44:33.1834778Z,[],[],0,0,0,0,0,...,,0.0,,0,False,2.0,,0,False,False
8,12354818744,2019-03-02T10:44:46.6556384Z,2019-03-02T10:45:07.4571885Z,[],[],0,0,0,0,0,...,,0.0,,0,False,2.0,,0,False,False
9,12354842348,2019-03-02T10:47:49.2815579Z,2019-05-20T13:32:52.3574973Z,[],[],0,0,0,0,0,...,,0.0,,0,False,16.0,,0,False,False


In [13]:
list(df_users.columns)

['userId',
 'createdAt',
 'lastConnection',
 'usedGenres',
 'listenedGenres',
 'battleCreatedCount',
 'battleLostCount',
 'battleRespondedCount',
 'battleWonCount',
 'friendCount',
 'keakCount',
 'keakrCoinGiven',
 'keakCoinReceived',
 'likeCount',
 'likeGivenCount',
 'mutualFollowCount',
 'overallBeatUsage',
 'PlaylistCount',
 'prizeMoneyParticipationCount',
 'prizeMoneyWinner',
 'sessionCount',
 'shareCount',
 'viewCount',
 'isBeatmaker',
 'isSinger']

In [14]:
pip install ipynb

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/lab/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
from ipynb.fs.full.ArangoDB_connection import *

2
[{'id': '17301813627868798088', 'createdAt': '2021-04-27T14:29:55.7851529Z', 'lastConnectionDate': '2022-02-09T09:34:27.1083572Z', 'usedGenres': [{'genreId': '12468689066', 'count': 2}, {'genreId': '17301813621692872888', 'count': 31}, {'genreId': '17301813621692872889', 'count': 3}, {'genreId': '17301813621692872891', 'count': 3}, {'genreId': '17301813621692872893', 'count': 1}, {'genreId': '17301813621692872894', 'count': 9}, {'genreId': '17301813621692872896', 'count': 2}, {'genreId': '17301813623043473093', 'count': 60}, {'genreId': '9920897514', 'count': 7}, {'genreId': '9920897516', 'count': 3}, {'genreId': '9920897517', 'count': 16}, {'genreId': '9920897519', 'count': 17}, {'genreId': '9920897520', 'count': 2}, {'genreId': '9920897521', 'count': 2}, {'genreId': '9920897522', 'count': 5}, {'genreId': '9920897523', 'count': 4}, {'genreId': '9920897524', 'count': 2}, {'genreId': '9920897525', 'count': 8}, {'genreId': '9920897526', 'count': 2}, {'genreId': '9920897527', 'count': 2

In [None]:
df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users['lastConnection'][c]=df_users["lastConnection"][c][0:10]


In [None]:
df_users.columns

In [None]:
import datetime

In [None]:
essais_date=df_users['lastConnection'][0]

In [None]:

date=datetime.date(essais_date)

In [None]:
essais_date

In [None]:
essais_date[0:10]

In [None]:
df_users['lastConnection'][1]

In [24]:
yo=pd.DataFrame([str(i) for i in range(1,1000)] )

In [26]:
yo.columns=['yoyo']

In [33]:
yo['yoyo'].apply(lambda s :s+'a')

0        1a
1        2a
2        3a
3        4a
4        5a
       ... 
994    995a
995    996a
996    997a
997    998a
998    999a
Name: yoyo, Length: 999, dtype: object

In [34]:
yo['yoyo'].apply(lambda s : s[0])

0      1
1      2
2      3
3      4
4      5
      ..
994    9
995    9
996    9
997    9
998    9
Name: yoyo, Length: 999, dtype: object

In [49]:
def embed_keakr(keak):

#     embedding = []

#     embedding.append(keak["likeCount"])

#     embedding.append(keak["commentCount"])

#     embedding.append(keak["viewCount"])

#     embedding.append(keak["averageViewProgress"])

#     embedding.append(keak["duration"])

#     embedding.append(len(keak["hashtags"]))

#     thumbnail = 1 if keak["hasSmallThumbnail"] is True else 0

#     embedding.append(thumbnail)
    embedding=
    
    

   

    return embedding

In [None]:
le = preprocessing.LabelEncoder()
le.fit(data["result"])
le.transform(data["result"])

In [75]:
df_keaks.tail()

Unnamed: 0,keakId,createdAt,likeCount,commentCount,viewCount,averageViewProgress,duration,hashtags,contentType,hasSmallThumbnail,lien,Scale
274467,17301813629989572608,2022-02-09T09:17:51.7692448Z,0,0,0,,,[],live,,https://www.keakr.com/fr/keak/table-de-mixage,2
274468,17301813627349920654,2021-02-27T19:44:20.6015776Z,0,0,1,,,[],live,,https://www.keakr.com/fr/keak/vous-me-donnez--k,2
274469,17301813628032088576,2021-05-20T15:57:05.1879805Z,0,0,0,,,"[FREESTYLE, MW12]",live,,https://www.keakr.com/fr/keak/-freestyle-mw12-x,2
274470,17301813627629506623,2021-04-01T10:25:47.1498387Z,0,0,6,0.0,,[],live,,https://www.keakr.com/fr/keak/mes-menena,2
274471,17301813628020986588,2021-05-18T22:15:42.9203742Z,1,2,8,0.0,,[],live,,https://www.keakr.com/fr/keak/blacc-rain-bar,2


In [78]:
df_keaks = df_keaks.set_index('keakId') 

In [139]:

le = preprocessing.LabelEncoder()
def embed_keaks(keak_id):
    le = preprocessing.LabelEncoder()
    df=df_keaks.copy()
    df = df.set_index('keakId') 
    le.fit(df["contentType"])
    df=pd.get_dummies(df,columns=['hasSmallThumbnail'])
    df["contentTypeEnc"]=le.transform(df["contentType"])

    keak=df.loc[keak_id]
    return list(keak[['likeCount',"commentCount","viewCount","averageViewProgress","duration","contentTypeEnc","hasSmallThumbnail_True"]])
    

In [140]:
embed_keaks('11779960378')

[1, 1, 55, 0, 6.0, 2, 1]

In [91]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df=df_keaks.copy()
df = df.set_index('keakId') 
le.fit(df["contentType"])
le.transform(df["contentType"])

array([2, 2, 4, ..., 4, 4, 4])

In [90]:
df_keaks.columns

Index(['keakId', 'createdAt', 'likeCount', 'commentCount', 'viewCount',
       'averageViewProgress', 'duration', 'hashtags', 'contentType',
       'hasSmallThumbnail', 'link', 'beatId', 'beatGenres'],
      dtype='object')

In [96]:
df.loc['11779960378']

createdAt                     2019-01-18T21:36:38.7535079Z
likeCount                                                1
commentCount                                             1
viewCount                                               55
averageViewProgress                                      0
duration                                               6.0
hashtags                                                []
contentType                                      freestyle
hasSmallThumbnail                                     True
link                   https://www.keakr.com/fr/keak/m-s-s
beatId                                                None
beatGenres                                            None
Name: 11779960378, dtype: object

In [95]:
df

Unnamed: 0_level_0,createdAt,likeCount,commentCount,viewCount,averageViewProgress,duration,hashtags,contentType,hasSmallThumbnail,link,beatId,beatGenres
keakId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
17301813623064450175,2019-10-10T21:20:59.2970322Z,4,2,42,7.80,107.0,"[91, 1, Rap, Pen, freestyle2019, Trap2K19]",freestyle,True,https://www.keakr.com/fr/keak/mon-mec-rap-1,17301813622843862243,"[{'id': '9920897543', 'name': 'Trap'}]"
6202649352,2018-04-02T15:06:34.9851924Z,2,2,53,0.00,68.0,[],freestyle,True,https://www.keakr.com/fr/keak/petit-salaire,,
17301813626763053961,2020-11-25T03:48:11.7751856Z,0,1,0,,,[],live,,https://www.keakr.com/fr/keak/live-1po,,
11779960378,2019-01-18T21:36:38.7535079Z,1,1,55,0,6.0,[],freestyle,True,https://www.keakr.com/fr/keak/m-s-s,,
17301813625762381409,2020-07-30T23:37:29.8512735Z,16,2,100,1.04,185.0,[],freestyle,True,https://www.keakr.com/fr/keak/mes-inspiration,17301813625761964386,"[{'id': '9920897519', 'name': 'Boom bap'}, {'i..."
...,...,...,...,...,...,...,...,...,...,...,...,...
17301813629989572608,2022-02-09T09:17:51.7692448Z,0,0,0,,,[],live,,https://www.keakr.com/fr/keak/table-de-mixage,,
17301813627349920654,2021-02-27T19:44:20.6015776Z,0,0,1,,,[],live,,https://www.keakr.com/fr/keak/vous-me-donnez--k,,
17301813628032088576,2021-05-20T15:57:05.1879805Z,0,0,0,,,"[FREESTYLE, MW12]",live,,https://www.keakr.com/fr/keak/-freestyle-mw12-x,,
17301813627629506623,2021-04-01T10:25:47.1498387Z,0,0,6,0.00,,[],live,,https://www.keakr.com/fr/keak/mes-menena,,


In [129]:
lb = preprocessing.LabelBinarizer()
lb.fit([True,False])

LabelBinarizer()

In [130]:
lb.transform(True)

ValueError: Expected array-like (array or non-string sequence), got True

In [1]:
def embed_keaks(keak_id):
    le = preprocessing.LabelEncoder()
    le = preprocessing.LabelEncoder()
    df=df_keaks.copy()
    df = df.set_index('keakId') 
    le.fit(df["contentType"])
    df=pd.get_dummies(df,columns=['hasSmallThumbnail'])
    df["contentTypeEnc"]=le.transform(df["contentType"])

    keak=df.loc[keak_id]
    return list(keak[['likeCount',"commentCount","viewCount","averageViewProgress","duration","contentTypeEnc","hasSmallThumbnail_True"]])
    

In [151]:
df_users.columns

Index(['userId', 'createdAt', 'lastConnection', 'usedGenres', 'listenedGenres',
       'battleCreatedCount', 'battleLostCount', 'battleRespondedCount',
       'battleWonCount', 'friendCount', 'keakCount', 'keakrCoinGiven',
       'keakCoinReceived', 'likeCount', 'likeGivenCount', 'mutualFollowCount',
       'overallBeatUsage', 'PlaylistCount', 'prizeMoneyParticipationCount',
       'prizeMoneyWinner', 'sessionCount', 'shareCount', 'viewCount',
       'isBeatmaker', 'isSinger'],
      dtype='object')

In [2]:
df_users = df_users.set_index('userId')

def embed_user(user_row):
    userId = user_row["userId"]
#     df=df_users.copy()
#     df = df.set_index('userId')

    user=df_users.loc[df_users["userId"]]
    return list(user["battleCreatedCount","battleLostCount","battleRespondedCount","battleWonCount","keakCoinReceived","likeCount",'battleWonCount', 'friendCount', 'keakCount', 'keakrCoinGiven',
       'keakCoinReceived', 'likeCount', 'likeGivenCount', 'mutualFollowCount',
       'overallBeatUsage', 'PlaylistCount', 'prizeMoneyParticipationCount',
       'prizeMoneyWinner', 'sessionCount', 'shareCount', 'viewCount'])
    

NameError: name 'df_users' is not defined

In [153]:
df_audiences

Unnamed: 0,userId,contentId,timestamp,progress,liked,commented,shared
0,users/6512051967,keaks/17301813623657913783,2020-01-01T00:00:00.4081666,0,True,True,True
1,users/17301813623701852659,keaks/17301813623464753700,2020-01-01T00:00:00.8245932,0,False,False,False
2,users/6512051967,keaks/17301813623650830605,2020-01-01T00:00:05.8421207,0,True,True,True
3,users/17301813623705778269,keaks/6102616636,2020-01-01T00:00:06.9566707,0,False,False,False
4,users/17301813623705771853,keaks/17301813622897398300,2020-01-01T00:00:08.3291103,0,False,False,False
...,...,...,...,...,...,...,...
28952270,users/17301813629487711641,keaks/17301813629989180861,2022-02-09T09:21:46.2134495,0,False,True,True
28952271,users/17301813622441019504,keaks/11770037098,2022-02-09T09:21:52.7335623,0,False,False,False
28952272,users/17301813621633239466,keaks/17301813621722253695,2022-02-09T09:22:19.5764394,0,False,True,True
28952273,users/17301813622441019504,keaks/17301813629879180579,2022-02-09T09:22:23.6742138,0,False,False,False


In [157]:
def convert_df_to_np(df_audiences, df_users, df_keaks):
    user_embed = df_users.apply(embed_user, axis=1)
    keaks_embed = df_keaks.apply(embed_keaks, axis=1)
    
    

In [None]:
user_embed=df_users.apply(embed_user, axis=1)

In [3]:
arg_list = ['userId',
 'battleCreatedCount',
 'battleLostCount',
 'battleRespondedCount',
 'battleWonCount',
 'friendCount',
 'keakCount',
 'keakrCoinGiven',
 'keakCoinReceived',
 'likeCount',
 'likeGivenCount',
 'mutualFollowCount',
 'overallBeatUsage',
 'PlaylistCount',
 'prizeMoneyParticipationCount',
 'prizeMoneyWinner',
 'sessionCount',
 'shareCount',
 'viewCount',
 'isBeatmaker',
 'isSinger']
def df_user_embbed(df_user, arg_list):
     return (df_user[arg_list].set_index('userId')
                        .fillna(0)
                        .replace({False: 0, True: 1})
                        .apply(lambda x: x/x.max(), axis=0)
                        .fillna(0)
            )

In [15]:
df_user_embbed(df_users, arg_list)

Unnamed: 0_level_0,battleCreatedCount,battleLostCount,battleRespondedCount,battleWonCount,friendCount,keakCount,keakrCoinGiven,keakCoinReceived,likeCount,likeGivenCount,mutualFollowCount,overallBeatUsage,PlaylistCount,prizeMoneyParticipationCount,prizeMoneyWinner,sessionCount,shareCount,viewCount,isBeatmaker,isSinger
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
12354401148,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000111,0.000000,0.0,0.000000,0.0,0.0,0.000035,0.0,0.000000,0.0,0.0
12354411487,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000139,0.0,0.000000,0.0,0.0
12354483185,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000052,0.0,0.000000,0.0,0.0
12354483249,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000069,0.0,0.000000,0.0,0.0
12354656710,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000134,0.000000,0.0,0.000000,0.0,0.0,0.000347,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12354325231,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000069,0.0,0.000000,0.0,0.0
12354349111,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000022,0.000000,0.0,0.001597,0.0,0.0,0.000069,0.0,0.000000,0.0,0.0
12354353801,0.0,0.008,0.020833,0.0,0.000058,0.0,0.0,0.0,0.0,0.000000,0.000026,0.0,0.000000,0.0,0.0,0.000329,0.0,0.000171,0.0,0.0
12354355337,0.0,0.000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000045,0.000000,0.0,0.000000,0.0,0.0,0.000191,0.0,0.000000,0.0,0.0
