In [1]:
import pyspark

from pyspark import SparkContext, SQLContext
# Reuse an already-running SparkContext when there is one: constructing a second
# context in the same kernel raises an error, which made this cell fail on re-run.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point used by ActionsData to read the JSON inputs.
sqlc = SQLContext(sc)

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt


In [3]:
import os
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import *
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np


In [4]:
class ActionsData:
    """Load and clean one kind of user/repository interaction (e.g. forks or stars).

    The constructor reads two JSON sources through the module-level ``sqlc``
    SQLContext:

    * the interactions file, reduced to the columns ``user_id``,
      ``created_at`` and ``repo_id``;
    * the repositories file, reduced to distinct ``repo_id``, ``name``,
      ``language`` rows.

    ``transform`` runs the whole cleaning pipeline; the result is left in
    ``self._data``.
    """

    def __init__(self,folder,file,file_repos,user,date,item):
        """Read the interactions and the repositories right away.

        folder     -- directory containing both JSON files
        file       -- interactions file name, relative to ``folder``
        file_repos -- repositories file name, relative to ``folder``
        user, date, item -- names of the source columns holding the user id,
                            the interaction timestamp and the repository id
        """
        self._data=None
        self._data_items=None
        self.load_data(folder,file,user,date,item)
        self.load_items(folder,file_repos)

    def load_data(self,foldername,filename,user,date,item):
        """Load the interactions of users with repositories into ``self._data``.

        The three source columns are renamed to ``user_id``, ``created_at``
        and ``repo_id``; every other column is dropped.
        """
        # Join the two path components properly so that ``foldername`` works
        # with or without a trailing separator (plain concatenation only
        # worked when the folder ended with '/').
        path=os.path.join(foldername,filename)
        raw=sqlc.read.json(path)
        self._data=raw.select(F.col(user).alias('user_id'),
                              F.col(date).alias('created_at'),
                              F.col(item).alias('repo_id'))

    def load_items(self,foldername,filename):
        """Load repository information into ``self._data_items``.

        Keeps the distinct (``repo_id``, ``name``, ``language``) rows, where
        ``repo_id`` is the source column ``id``.
        """
        path=os.path.join(foldername,filename)
        repos=sqlc.read.json(path)
        self._data_items=repos.select(F.col('id').alias('repo_id'),'name','language').distinct()

    def join_w_repos(self):
        """Keep only interactions whose repository appears in ``self._data_items``."""
        self._data=self._data.join(self._data_items,'repo_id','inner')

    def remove_duplicates(self):
        """Keep one interaction per (user, repository): the earliest by ``created_at``.

        A window with ``row_number`` is used instead of
        ``sort(...).dropDuplicates(...)`` because dropDuplicates gives no
        guarantee about which of the duplicated rows survives, so the
        retained timestamp was not reliably the first one.
        """
        first_seen=Window.partitionBy('user_id','repo_id').orderBy('created_at')
        self._data=(self._data
                    .withColumn('_dedup_rank',F.row_number().over(first_seen))
                    .filter(F.col('_dedup_rank')==1)
                    .drop('_dedup_rank'))

    def filter_actions(self,min_actions,max_actions):
        """Keep users whose number of interactions is strictly between the bounds.

        A user is kept when ``min_actions < total_actions < max_actions``,
        where ``total_actions`` counts the user's rows with a non-null
        ``repo_id``. Users at or below ``min_actions`` (inactive) and at or
        above ``max_actions`` (outliers) are removed.
        """
        # NOTE(review): both bounds are exclusive; the previous docstring read
        # as if they were inclusive -- confirm which one is intended.
        per_user=self._data.groupby('user_id').agg(F.count('repo_id').alias('total_actions'))
        kept_users=per_user.filter((F.col('total_actions')>min_actions)
                                   & (F.col('total_actions')<max_actions))

        # Inner join on the surviving user ids acts as the row filter.
        self._data=self._data.join(kept_users.select('user_id'),'user_id','inner')

    def add_rating(self,rating):
        """Add a ``rating`` column; every interaction of an instance gets the same value.

        Rows are grouped by (``user_id``, ``created_at``, ``repo_id``) and the
        rating is ``count * rating``, so it equals ``rating`` whenever each
        group holds a single row. Only the three grouping columns and
        ``rating`` remain afterwards.
        """
        self._data=self._data.groupby('user_id','created_at','repo_id')\
                            .agg((F.count('*')*rating).alias('rating'))

    def transform(self,min_actions,max_actions,rating):
        """Apply all data transformations in order: join, de-duplicate, filter, rate."""
        self.join_w_repos()
        self.remove_duplicates()
        self.filter_actions(min_actions,max_actions)
        self.add_rating(rating)

In [5]:
class SimpleRecommender:
    """ALS recommender trained on all but each user's last action.

    ``data`` must expose the columns read below: ``number_of_actions`` and
    ``total_actions`` (used for the split) and ``user_idn``, ``repo_idn``,
    ``rating`` (used by ALS).
    """

    def __init__(self,data):
        """Store the input frame; the remaining attributes are filled in by fit()."""
        self._data=data
        self._train=self._test=None
        self._model=None
        self._predictions_train=self._predictions_test=None

    def message(self,x):
        """Report ``x`` to the user (printed to standard output)."""
        print(x)

    def split_train_test(self):
        """Put rows with number_of_actions below total_actions in train, the rest (equal) in test."""
        source=self._data
        self._train=source.filter('number_of_actions<total_actions')
        self._test=source.filter('number_of_actions=total_actions')

    def fit(self,param):
        """Split the data, train ALS and report train/test RMSE.

        ``param`` is a mapping with the keys ``'iter'``, ``'rank'`` and
        ``'reg'``. The fitted model and both prediction frames are kept on
        the instance.
        """
        self.split_train_test()

        estimator=ALS(
            maxIter=param['iter'],
            rank=param['rank'],
            regParam=param['reg'],
            userCol="user_idn",
            itemCol="repo_idn",
            ratingCol="rating",
            seed=1,
            # Rows ALS cannot score are dropped from the predictions.
            coldStartStrategy='drop',
        )
        rmse=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")

        self._model=estimator.fit(self._train)

        self._predictions_train=self._model.transform(self._train)
        self.message('Train RMSE=' + str(rmse.evaluate(self._predictions_train)))

        self._predictions_test=self._model.transform(self._test)
        self.message('Test RMSE=' + str(rmse.evaluate(self._predictions_test)))
        

## Sample forks data

### Load forks data: keep only users who forked more than 5 and fewer than 2500 repositories. Each fork has rating value=1

In [6]:
forks=ActionsData(folder='./data/',file='projects_forked_2017.json',\
                  file_repos='projects_not_forked_2017.json',\
                  user='owner_id',date='created_at',item='forked_from')

In [7]:
forks.transform(min_actions=5, max_actions=2500,rating=1)

In [8]:
forks_data=forks._data


In [11]:
forks_data.select('user_id').distinct().count()

28280

### Take sample of items

In [9]:
# Count forks per repository and keep the 1000 repositories with the most forks.
forks_by_item=(
    forks_data
    .groupby('repo_id')
    .agg(F.count('*').alias('number_of_forks'))
    .sort('number_of_forks',ascending=False)
    .limit(1000)
)

In [10]:
forks_sample=forks_data.join(forks_by_item,'repo_id','inner')

In [21]:
forks_sample

DataFrame[repo_id: string, user_id: string, created_at: string, rating: bigint, number_of_forks: bigint]

In [11]:
forks_sample=forks_sample.select('user_id','repo_id','created_at','rating').withColumn('event',lit('fork'))

In [12]:
forks_sample.cache()

DataFrame[user_id: string, repo_id: string, created_at: string, rating: bigint, event: string]

In [15]:
forks_sample.select('user_id').distinct().count()

16437

### Create a new column (total_forks) as the total number of forks by user. This will be used to split the data into train and test

In [13]:
# Per-user window over forks in created_at order, from the first row up to the current one.
w_forks=(
    Window
    .partitionBy('user_id')
    .orderBy('created_at')
    .rowsBetween(Window.unboundedPreceding,Window.currentRow)
)
# number_of_actions = how many forks the user has made up to and including this row.
forks_only_sample=forks_sample.withColumn(
    'number_of_actions',
    F.count('user_id').over(w_forks),
)

In [14]:
# The largest running count of a user is that user's total number of forks.
total_forks=(
    forks_only_sample
    .groupby('user_id')
    .agg(F.max('number_of_actions').alias('total_forks'))
)
# Attach the per-user total to every fork row.
forks_only_sample=forks_only_sample.join(total_forks,on='user_id',how='inner')

### Take users with at least 10 forks and save train and test data for evaluation with UR

In [15]:
forks_only_sample=forks_only_sample.filter('total_forks>=10')

In [16]:
# Leave-last-out split: number_of_actions is the running fork count per user in
# created_at order and total_forks is its maximum, so the row where the two are
# equal is the user's last fork (test); every earlier fork goes to train.
train_forks=forks_only_sample.filter('number_of_actions<total_forks')
test_forks=forks_only_sample.filter('number_of_actions=total_forks')

In [31]:
test_forks.count()

2185

In [34]:
train_forks

DataFrame[user_id: string, repo_id: string, created_at: string, rating: bigint, event: string, number_of_actions: bigint, total_forks: bigint]

In [37]:
# Write each split as CSV through a single partition, replacing any previous export.
train_forks.coalesce(1).write.save('/data/forks_sample_train.csv',format='csv',mode='overwrite')
test_forks.coalesce(1).write.save('/data/forks_sample_test.csv',format='csv',mode='overwrite')


### Load stars data: keep only users who starred more than 20 and fewer than 7000 repositories. Each star has rating value=1

In [20]:
stars=ActionsData(folder='./data/',file='watchers_2017.json',\
                  file_repos='projects_not_forked_2017.json',\
                  user='w_user_id',date='w_created_at',item='w_repo_id')

In [21]:
stars.transform(min_actions=20, max_actions=7000,rating=1)

In [22]:
stars_data=stars._data

### Take sample of items

In [23]:
# Count stars per repository and keep the 700 repositories with the most stars.
stars_by_item=(
    stars_data
    .groupby('repo_id')
    .agg(F.count('*').alias('number_of_stars'))
    .sort('number_of_stars',ascending=False)
    .limit(700)
)

In [24]:
stars_sample=stars_data.join(stars_by_item,'repo_id','inner')

In [25]:
stars_sample=stars_sample.select('user_id','repo_id','created_at','rating').withColumn('event',lit('star'))

In [26]:
stars_sample.cache()

DataFrame[user_id: string, repo_id: string, created_at: string, rating: bigint, event: string]

### Create the union data of forks and stars and drop duplicates (users that forked and starred the same repository)

In [27]:
union=forks_sample.union(stars_sample)
# Keep exactly one row per (user, repository): the earliest interaction.
# sort(...).dropDuplicates(...) gives no guarantee about which duplicate survives,
# so the first row is picked explicitly with row_number; 'event' breaks ties
# between a fork and a star that carry the same timestamp.
union_dd=(
    union
    .withColumn('_dedup_rank',
                F.row_number().over(Window.partitionBy('user_id','repo_id')
                                          .orderBy('created_at','event')))
    .filter(F.col('_dedup_rank')==1)
    .drop('_dedup_rank')
)
union_dd.cache()

DataFrame[user_id: string, repo_id: string, created_at: string, rating: bigint, event: string]

### Create a new column (total_actions) as the total number of interactions (forks or stars) by user. This will be used to split the data into train and test set

In [28]:
# Per-user window over all actions in created_at order, from the first row up to the current one.
w=(
    Window
    .partitionBy('user_id')
    .orderBy('created_at')
    .rowsBetween(Window.unboundedPreceding,Window.currentRow)
)
# number_of_actions = how many actions the user has made up to and including this row.
union_dd=union_dd.withColumn(
    'number_of_actions',
    F.count('user_id').over(w),
)

In [29]:
# The largest running count of a user is that user's total number of actions.
total_actions=(
    union_dd
    .groupby('user_id')
    .agg(F.max('number_of_actions').alias('total_actions'))
)
# Attach the per-user total to every action row.
union_dd=union_dd.join(total_actions,on='user_id',how='inner')

### Create numeric IDs for users and repos

In [30]:
# Map the string ids to numeric index columns; user_idn and repo_idn are the
# columns SimpleRecommender.fit passes to ALS as userCol / itemCol.
indexer_user=StringIndexer(inputCol="user_id",outputCol="user_idn")
indexer_repo=StringIndexer(inputCol='repo_id',outputCol='repo_idn')
# Each indexer is fitted on, and then applied to, the same frame.
union_dd=indexer_user.fit(union_dd).transform(union_dd)
union_dd=indexer_repo.fit(union_dd).transform(union_dd)

In [26]:
union_dd.filter('user_idn=15972').show()

+-------+--------+--------------------+------+-----+-----------------+-------------+--------+--------+
|user_id| repo_id|          created_at|rating|event|number_of_actions|total_actions|user_idn|repo_idn|
+-------+--------+--------------------+------+-----+-----------------+-------------+--------+--------+
|2041471|57211856|2017-02-15 21:46:...|     1| star|                1|           15| 15972.0|   156.0|
|2041471|57724745|2017-02-21 04:40:...|     1| star|                2|           15| 15972.0|   528.0|
|2041471|57090153|2017-03-02 02:34:...|     1| star|                3|           15| 15972.0|    78.0|
|2041471|59138123|2017-03-09 21:33:...|     1| star|                4|           15| 15972.0|     2.0|
|2041471|60377505|2017-03-28 05:26:...|     1| star|                5|           15| 15972.0|    16.0|
|2041471|61091932|2017-04-01 00:21:...|     1| star|                6|           15| 15972.0|    54.0|
|2041471|59514446|2017-04-15 04:36:...|     1| star|                7|   

In [24]:
union_dd.count()

748606

### Take only users with at least 10 actions

In [31]:
union_dd=union_dd.filter('total_actions>=10')

In [26]:
union_dd.count()

652596

In [28]:
union_dd.select('user_id').distinct().count()

24225

In [36]:
union_dd.write.format('json').mode('overwrite').save('/data/forks_stars_sample.json')

### Create a model with SimpleRecommender

In [32]:
import warnings
warnings.filterwarnings("ignore")

In [33]:
rec=SimpleRecommender(union_dd)

In [30]:
# ALS hyper-parameters, under the keys that SimpleRecommender.fit reads.
parameters=dict(rank=10,iter=10,reg=0.1)
rec.fit(param=parameters)

# Keep a handle on the fitted model so that it can be saved below.
model_final=rec._model


Train RMSE=0.0985648382645678
Test RMSE=0.10027968290201808


## Save model and train and test set

In [31]:
model_final.save('/data/als_sample_r10_i10_reg01_fs.parquet')

In [34]:
train=rec._train
test=rec._test

In [35]:
train

In [45]:
# Write each split as CSV through a single partition, replacing any previous export.
train.coalesce(1).write.save('/data/forks_stars_sample_train.csv',format='csv',mode='overwrite')
test.coalesce(1).write.save('/data/forks_stars_sample_test.csv',format='csv',mode='overwrite')


In [38]:
sc.stop()