In [1]:
!pip install pyarrow
!pip install pyspark
!pip install pandas

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 25 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 45.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=bab9778fece046fd20d7a604c48b00bf49c03b32af87604c00383cc70c43e841
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


In [2]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
#os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'

from pyspark.pandas import read_csv
import pyspark.pandas as ps
import pandas as pd
# means is for items, df is for collab

In [3]:
ps.set_option('compute.default_index_type', 'distributed')
big_data = False
path ='100k_a.csv' if  not big_data else 'full_a.csv'
df = read_csv(path, names=['userId','streamId','streamerName','timeStart','timeStop'])

In [None]:
df.head()

In [4]:
df['interactionTime']=(df['timeStop'] - df['timeStart'])*10 # units are 10 min

In [None]:
df.head()

In [5]:
df= df.drop(columns=['timeStart','timeStop'])

In [6]:
tmax=df['interactionTime'].max()

In [7]:
tmin=df['interactionTime'].min()

In [8]:
df['interactionTime']= (df['interactionTime']- tmin)/(tmax-tmin)

In [9]:
df.head()

Unnamed: 0,userId,streamId,streamerName,interactionTime
0,1,33842865744,mithrain,0.012195
1,1,33846768288,alptv,0.02439
2,1,33886469056,mithrain,0.0
3,1,33887624992,wtcn,0.012195
4,1,33890145056,jrokezftw,0.02439


In [10]:
# goal is to scale to 1-100 for implict rating/confidence
# MAYBE INVESTIGATE DIFFERENT SCALING IN FUTURE
df['interactionTime'] = (df['interactionTime'] * 99) +1

In [None]:
df.head()

In [None]:
# Create dictionary for streamer names
streamer_dict = {k: v for v, k in enumerate(df['streamerName'].to_numpy())}
streamer_dict

In [19]:
# Mapping function for streamer name to their respective ID
def tuple_to_value(x):
  return streamer_dict.get(x)

In [20]:
# Create streamerID column based on dictionary mapping
df['streamerId']=df['streamerName'].apply(lambda x: tuple_to_value(x)) # map streamerId

In [21]:
df.head()

Unnamed: 0,userId,streamId,streamerName,interactionTime,streamerId
0,1,33842865744,mithrain,2.207317,862716
1,1,33846768288,alptv,3.414634,854262
2,1,33886469056,mithrain,1.0,862716
3,1,33887624992,wtcn,2.207317,862770
4,1,33890145056,jrokezftw,3.414634,862769


In [22]:
len(df['streamerId'].unique()) == len(df['streamerName'].unique())

True

In [23]:
df = df.drop(columns=['streamerName'])

In [25]:
df_streamer = df.drop(columns=['streamId', 'streamerName'])

In [26]:
df_streamer.to_spark().coalesce(1).write.format('csv').option('header', 'true').save('data/collab')

In [None]:
df.head()

In [None]:
means=df.groupby(['streamerId'],as_index=False)['interactionTime'].mean()
means=means.rename(columns={'streamerId':'streamerId','interactionTime':'avgInteractionTime'})

In [None]:
means.head()

Unnamed: 0,streamerId,avgInteractionTime
0,5652140647009628783,2.902439
1,392638289472592052,1.54878
2,6900454007790282002,3.278598
3,2406168726911974914,8.780488
4,2271347389753034915,4.085366


In [None]:
numStreams = df.groupby(['streamerId'],as_index=False).size()
numStreams.max()

8760

In [None]:
means=means.join(numStreams,on='streamerId')

In [None]:
means= means.rename(columns={'streamerId':'streamerId','avgInteractionTime':'avgInteractionTime', 0:'interactionCounts'})
means.head()

Unnamed: 0,streamerId,avgInteractionTime,interactionCounts
0,5652140647009628783,2.902439,33
1,392638289472592052,1.54878,11
2,6900454007790282002,3.278598,284
3,2406168726911974914,8.780488,9
4,2271347389753034915,4.085366,27


### better for smaller in memory data
### df.to_pandas().to_csv('data/test.csv')

In [None]:
# used to ensure no oom error with pandas
df.to_spark().coalesce(1).write.format('csv').option('header', 'true').save('data/collab')

In [None]:
means.to_spark().coalesce(1).write.format('csv').option('header', 'true').save('data/item')

## Attempt Collaborative Filtering

In [None]:
# Rename columns
df = pd.read_csv('collab_filter_100k.csv')
collab = df.drop(columns=['streamId'])
collab = collab.rename(columns={'streamerId':'item_id','interactionTime':'rating', 'userId': 'user_id'})

In [None]:
collab.head()

Unnamed: 0,user_id,rating,item_id
0,1,2.207317,6237787267506800044
1,1,3.414634,8455602530264088426
2,1,1.0,6237787267506800044
3,1,2.207317,5324476659562100413
4,1,3.414634,6731343583494858722


In [None]:
len(collab['item_id'].unique())

NameError: ignored

In [None]:
import pyspark as spark
spark.conf.set("spark.sql.pivotMaxValues", 20000)

AttributeError: ignored

In [None]:
# Make the matrix
rating_matrix = collab.pivot_table(index=['user_id'], columns='item_id', values='rating')
#temp_collab = collab.to_pandas()
#temp_collab.pivot_table(index=['user_id'], columns='item_id', values='rating')
#rating_matrix = collab.groupby("user_id").pivot('item_id').sum('rating')
#rating_matrix = temp_collab.copy()

# FOR NOW: replace NaN values with 0 - go back later to address
rating_matrix = rating_matrix.fillna(0)

rating_matrix.head()