In [4]:
#!pip install pyarrow
#!pip install pyspark
#!pip install pandas

In [5]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
#os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'
from pyspark import SparkContext, SparkConf
from pyspark.pandas import read_csv
import pyspark.pandas as ps
import pandas as pd
# means is for items, df is for collab

In [None]:
sc = SparkContext.getOrCreate()
sc.setCheckpointDir('/alscp')
sc.getConf().getAll()

In [6]:
ps.set_option('compute.default_index_type', 'distributed')
big_data = False
path ='data/100k_a.csv' if  not big_data else 'data/full_a.csv'
df = read_csv(path, names=['userId','streamId','streamerName','timeStart','timeStop'])

In [7]:
df.head()

Unnamed: 0,userId,streamId,streamerName,timeStart,timeStop
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [8]:
df['interactionTime']=(df['timeStop'] - df['timeStart'])*10 # units are 10 min

In [9]:
df.head()

Unnamed: 0,userId,streamId,streamerName,timeStart,timeStop,interactionTime
0,1,33842865744,mithrain,154,156,20
1,1,33846768288,alptv,166,169,30
2,1,33886469056,mithrain,587,588,10
3,1,33887624992,wtcn,589,591,20
4,1,33890145056,jrokezftw,591,594,30


In [10]:
df= df.drop(columns=['timeStart','timeStop'])

In [11]:
tmax=df['interactionTime'].max()

In [12]:
tmin=df['interactionTime'].min()

In [13]:
df['interactionTime']= (df['interactionTime']- tmin)/(tmax-tmin)

In [14]:
df.head()

Unnamed: 0,userId,streamId,streamerName,interactionTime
0,1,33842865744,mithrain,0.010417
1,1,33846768288,alptv,0.020833
2,1,33886469056,mithrain,0.0
3,1,33887624992,wtcn,0.010417
4,1,33890145056,jrokezftw,0.020833


In [15]:
# goal is to scale to 1-100 for implict rating/confidence
# MAYBE INVESTIGATE DIFFERENT SCALING IN FUTURE
df['interactionTime'] = (df['interactionTime'] * 99) +1

In [16]:
df.head()

Unnamed: 0,userId,streamId,streamerName,interactionTime
0,1,33842865744,mithrain,2.03125
1,1,33846768288,alptv,3.0625
2,1,33886469056,mithrain,1.0
3,1,33887624992,wtcn,2.03125
4,1,33890145056,jrokezftw,3.0625


In [17]:
# Create dictionary for streamer names
streamer_dict = {k: v for v, k in enumerate(df['streamerName'].to_numpy())}
streamer_dict

{'mithrain': 3047137,
 'alptv': 3038334,
 'wtcn': 3046060,
 'jrokezftw': 3050226,
 'berkriptepe': 3038141,
 'kendinemuzisyen': 3050209,
 'unlostv': 3038188,
 'zeon': 3038868,
 'elraenn': 3050215,
 'jahrein': 3050220,
 'raufbaba25': 3038268,
 'ogrencievi': 3050201,
 'eraymaskulen': 2975844,
 'zeusidiouss': 3050222,
 'h3x_tv': 2915320,
 'towshun': 2995593,
 'esl_csgo': 3051605,
 'grimnax': 3050213,
 'jtgtv': 3047130,
 'bumblebee_kr': 3049965,
 'hanryang1125': 3047670,
 'wan6491': 3008380,
 'chfhdtpgus1': 3049933,
 'sal_gu': 3049950,
 'lol_ambition': 3051404,
 'dmdtkadl69': 3050011,
 'rechotz': 2626472,
 'lol_madlife': 3049999,
 'nanajam777': 3048830,
 'tankergm': 2496184,
 'rngudwnswkd': 3027613,
 'kimhj1478': 2672689,
 'goemdawon': 2722917,
 'leehunnyeo': 3050614,
 'grma1717': 2861349,
 'megthomatho': 2986508,
 'klugee': 87,
 'mrscheff': 2695851,
 'quickybaby': 3038821,
 'tankelit': 2568133,
 'skill4ltu': 3038823,
 'kajzoo': 1815230,
 'wearethevr': 3050546,
 'dakillzor': 3017071,
 'mrsa

In [18]:
# Mapping function for streamer name to their respective ID
def tuple_to_value(x):
    return streamer_dict.get(x)

In [19]:
# Create streamerID column based on dictionary mapping
df['streamerId']=df['streamerName'].apply(lambda x: tuple_to_value(x)) # map streamerId

In [20]:
df.head()

Unnamed: 0,userId,streamId,streamerName,interactionTime,streamerId
0,1,33842865744,mithrain,2.03125,3047137
1,1,33846768288,alptv,3.0625,3038334
2,1,33886469056,mithrain,1.0,3047137
3,1,33887624992,wtcn,2.03125,3046060
4,1,33890145056,jrokezftw,3.0625,3050226


In [21]:
# Group by streamer ID and user ID and get the average interaction time for that pair
df_group = df.groupby(['streamerId', 'userId'])["interactionTime"].mean()

In [22]:
# Reset index so that can save to csv properly
df_reset = df_group.reset_index()

In [23]:
df_reset.head()

Unnamed: 0,streamerId,userId,interactionTime
0,641416,86,1.0
1,3050449,94,14.921875
2,2469,98,1.0
3,3045251,99,5.125
4,3045744,110,1.0


In [24]:
len(df['streamerId'].unique()) == len(df['streamerName'].unique())

True

In [25]:
# Save the new averaged data
df_reset.to_spark().coalesce(1).write.format('csv').mode("overwrite").option('header', 'true').save('data/collab')

In [26]:
df = df.drop(columns=['streamerName'])

In [27]:
means=df.groupby(['streamerId'],as_index=False)['interactionTime'].mean()
means=means.rename(columns={'streamerId':'streamerId','interactionTime':'avgInteractionTime'})

In [28]:
means.head()

Unnamed: 0,streamerId,avgInteractionTime
0,2987163,3.338468
1,2751748,2.243566
2,3048233,2.03125
3,3051309,1.709515
4,1950,1.0


In [29]:
numStreams = df.groupby(['streamerId'],as_index=False).size()
numStreams.max()

45144

In [30]:
means=means.join(numStreams,on='streamerId')

In [31]:
means= means.rename(columns={'streamerId':'streamerId','avgInteractionTime':'avgInteractionTime', 0:'interactionCounts'})
means.head()

Unnamed: 0,streamerId,avgInteractionTime,interactionCounts
0,2987163,3.338468,71
1,2751748,2.243566,34
2,3048233,2.03125,21
3,3051309,1.709515,1093
4,1950,1.0,1


In [32]:
# used to ensure no oom error with pandas
#df.to_spark().coalesce(1).write.format('csv').option('header', 'true').save('data/collab')

In [33]:
means.to_spark().coalesce(1).write.format('csv').mode("overwrite").option('header', 'true').save('data/item')