In [1520]:
import findspark
findspark.init('/home/ek/spark-2.4.4-bin-hadoop2.7')
import pyspark
import os
java8_location= '/usr/lib/jvm/java-8-openjdk-amd64' # Set your own
os.environ['JAVA_HOME'] = java8_location
from pyspark.sql.functions import udf
from IPython.display import Image

In [1589]:
from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RegressionMetrics
import math

In [1522]:
spark = SparkSession.builder.appName('HW').getOrCreate()

### II Linear Regression

In [1449]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [1466]:
N_EXAMPLES = 1000
DIM = 15

In [1467]:
def generate_data(scaled):
    real_weights = np.random.randn(DIM)
    X = np.random.randn(N_EXAMPLES)
    X = np.tile(X,DIM).reshape(DIM,N_EXAMPLES)
    for i in range(DIM):
        X[i]=X[i]**i
    X = X.T
    if scaled:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    y = X@real_weights
    X_tr, X_te, y_tr, y_te = train_test_split(X,y,test_size = 0.2)
    return X_tr, X_te, y_tr, y_te

In [1468]:
X_tr, X_te, y_tr, y_te = generate_data(True)

In [1474]:
X_tr_US, X_te_US, y_tr_US, y_te_US = generate_data(False)

In [1495]:
class LinearRegressionMMD():
    def __init__(self, dimensions = DIM):
        self.weights = np.random.rand(dimensions)
        print(self.weights.shape)
    def predict(self, inputs):
        return inputs@self.weights        
    
    def score(self, y_true, predictions):
        errors = y_true-predictions
        return errors , np.mean(errors**2)
    
    def compute_gradient(self, errors, inputs, scale_gradient, loss_function):
        if loss_function == 'MSE':
            gradient = -np.mean(errors[:,None]*inputs,0)
        else:
            gradient = -np.sum(errors[:,None]*inputs,0)
        if scale_gradient:
            return gradient/np.linalg.norm(gradient) 
        else:
            return gradient
        
    def update_weights(self, gradient,lr):
        self.weights-=lr*gradient
    
    def fit(self,inputs, y_true, epochs, scale_gradient, batch_size,lr, loss_function,verbose):
        for epoch in range(epochs):
            random_idx = np.random.permutation(range(0,N_EXAMPLES))
            epoch_loss = 0
            for i in range(epochs//batch_size):
                predictions = self.predict(inputs[i*batch_size:i*batch_size+batch_size])
                errors,loss = self.score(y_true[i*batch_size:i*batch_size+batch_size],
                                         predictions)
                gradient = self.compute_gradient(errors,
                                           inputs[i*batch_size:i*batch_size+batch_size],
                                           scale_gradient,
                                           loss_function)
                self.update_weights(gradient,lr)
                epoch_loss += loss
            if verbose:
                print('traning loss for {} epoch: {}'.format(epoch,epoch_loss/(epochs//batch_size)))

In [1511]:
model = LinearRegressionMMD()
model.fit(X_tr, y_tr,700, True, 32, lr = 0.001,loss_function='MSE',verbose=False)
_, loss = model.score(model.predict(X_te),y_te)
print('test loss: ', loss)

(15,)
test loss:  0.00847582649645226


In [1512]:
model = LinearRegressionMMD()
model.fit(X_tr, y_tr,700, True, 32, lr = 0.001,loss_function='SSE',verbose=False)
_, loss = model.score(model.predict(X_te),y_te)
print('test loss: ', loss)

(15,)
test loss:  0.015951388503102057


In [1513]:
model = LinearRegressionMMD()
# without gradient scaling
model.fit(X_tr, y_tr,700, False, 32, lr = 0.001,loss_function='MSE',verbose=False)
_, loss = model.score(model.predict(X_te),y_te)
print('test loss: ', loss)

(15,)
test loss:  0.16913282257406767


In [1514]:
model = LinearRegressionMMD()
# without gradient scaling
model.fit(X_tr, y_tr,400, False, 32, lr = 0.001,loss_function='SSE',verbose=False)
_, loss = model.score(model.predict(X_te),y_te)
print('test loss: ', loss)

(15,)
test loss:  0.013134794693781746


In [1517]:
# try without scaling of the data
model = LinearRegressionMMD()
model.fit(X_tr_US, y_tr_US,700, True, 32, lr = 0.001,loss_function='SSE',verbose=False)
_, loss = model.score(model.predict(X_te_US),y_te_US)
print('test loss: ', loss)

(15,)
test loss:  62394869501.81195


In [1516]:
# try without scaling of the data
model = LinearRegressionMMD()
# without scaling of the gradients
model.fit(X_tr_US, y_tr_US,700, False, 32, lr = 0.001,loss_function='SSE',verbose=False)
_, loss = model.score(model.predict(X_te_US),y_te_US)
print('test loss: ', loss)

(15,)
test loss:  nan


  # Remove the CWD from sys.path while we load stuff.
  
  app.launch_new_instance()
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [1572]:
from pyspark.mllib.recommendation import Rating, ALS, MatrixFactorizationModel

In [1562]:
dataset = spark.sparkContext.textFile('movielens.txt')

In [1563]:
dataset = dataset.map(lambda x: x.split('\t')[:-1]).map(lambda x: Rating(
                                                        int(x[0]),
                                                        int(x[1]),
                                                        float(x[2])))
dataset.take(4)

[Rating(user=196, product=242, rating=3.0),
 Rating(user=186, product=302, rating=3.0),
 Rating(user=22, product=377, rating=1.0),
 Rating(user=244, product=51, rating=2.0)]

### train test split

In [1568]:
train, test = dataset.randomSplit((0.5,0.5))

### train model

In [1570]:
model = ALS.train(train,rank = 10, iterations = 5)

In [1571]:
model.save(spark.sparkContext,'recomm')

In [1573]:
model = MatrixFactorizationModel.load(spark.sparkContext,'recomm')

In [1575]:
test_data =test.map(lambda x: (x.user, x.product))

In [1576]:
test_data.take(3)

[(196, 242), (186, 302), (22, 377)]

In [1577]:
predictions = model.predictAll(test_data).map(lambda r: ((r.user, r.product), r.rating))

In [1578]:
predictions.take(5)

[((280, 384), 3.390801714156854),
 ((497, 384), 2.360212608578044),
 ((545, 384), 3.298583416232403),
 ((393, 384), 3.170307394341294),
 ((682, 384), 2.3789953109099895)]

In [1580]:
ratings_real= test.map(lambda r: ((r.user, r.product), r.rating))

In [1581]:
ratings_real.take(5)

[((196, 242), 3.0),
 ((186, 302), 3.0),
 ((22, 377), 1.0),
 ((166, 346), 1.0),
 ((115, 265), 2.0)]

In [1586]:
scoreAndLabels = predictions.join(ratings_real).map(lambda x: x[1]) # get only the rating (without the user-user pairs)

In [1587]:
scoreAndLabels.take(5)

[(3.4902253287579152, 5.0),
 (2.7327856832552953, 3.0),
 (2.939121335387544, 4.0),
 (6.093582030546673, 5.0),
 (3.1502591140591343, 4.0)]

In [1590]:
metrics = RegressionMetrics(scoreAndLabels)

In [1591]:
metrics.meanSquaredError

1.4369439311564367