In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
!tar -xvf spark-3.0.1-bin-hadoop3.2.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop3.2"
import findspark
findspark.init()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark import SparkConf
from pyspark.context import SparkContext
sc = SparkContext.getOrCreate(SparkConf())
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pyspark.sql.functions as f
import pyspark.sql.types as types
from pyspark import sql
from pyspark.sql.functions import unix_timestamp
from time import time

In [None]:
# Get (or create) the SparkSession used by every DataFrame read below.
# Parenthesised chaining replaces the backslash continuations.
spark = (
    sql.SparkSession.builder
    .master("local")
    .appName("ML Youtube")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [None]:
# Download the six per-country CSV files (CA, IN, GB, US, DE, FR) into the
# working directory so Spark can read them from local disk.

# Import the submodule itself: `from urllib.request import urllib` only works
# if urllib.request happens to expose an attribute named `urllib`.
import urllib.request

General_url = "https://billypoon.blob.core.windows.net/project/data/processed"

# Local file names; the remote objects use the same names under General_url.
CA_file = "CA.csv"
IN_file = "IN.csv"
GB_file = "GB.csv"
US_file = "US.csv"
DE_file = "DE.csv"
FR_file = "FR.csv"

CA_url = General_url + "/" + CA_file
IN_url = General_url + "/" + IN_file
GB_url = General_url + "/" + GB_file
US_url = General_url + "/" + US_file
DE_url = General_url + "/" + DE_file
FR_url = General_url + "/" + FR_file

# One loop instead of six copy-pasted calls; the download order is unchanged.
for source_url, local_path in [
    (CA_url, CA_file),
    (IN_url, IN_file),
    (GB_url, GB_file),
    (US_url, US_file),
    (DE_url, DE_file),
    (FR_url, FR_file),
]:
    urllib.request.urlretrieve(source_url, local_path)

('FR.csv', <http.client.HTTPMessage at 0x7f030dfce208>)

In [None]:
# Column layout shared by every regional CSV, as (name, Spark type) pairs in
# file order. Every field is declared with nullable=False.
schema_columns = [
    ('video_id', StringType()),
    ('title', StringType()),
    ('category_id', IntegerType()),
    ('tags', StringType()),
    ('views', IntegerType()),
    ('likes', IntegerType()),
    ('dislikes', IntegerType()),
    ('comment_count', IntegerType()),
    ('description', StringType()),
    ('category_title', StringType()),
    ('region', StringType()),
    ('lang', StringType()),
]
custom_schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in schema_columns
])

# Reader options common to all six files: skip the header row, apply the
# explicit schema, and allow quoted fields to span several lines.
csv_options = {'header': True, 'schema': custom_schema, 'multiLine': True}

df_IN = spark.read.csv(IN_file, **csv_options)
df_GB = spark.read.csv(GB_file, **csv_options)
df_US = spark.read.csv(US_file, **csv_options)
df_CA = spark.read.csv(CA_file, **csv_options)
df_DE = spark.read.csv(DE_file, **csv_options)
df_FR = spark.read.csv(FR_file, **csv_options)

# Stack the regional frames (positional union) in the order IN, GB, US, CA, DE, FR.
df_ALL = df_IN
for regional_frame in (df_GB, df_US, df_CA, df_DE, df_FR):
    df_ALL = df_ALL.union(regional_frame)

# Data Preprocessing

In [None]:
# Inclusive upper bounds (in views) of buckets 0..8; any count above the last
# bound falls into bucket 9.
_VIEW_BUCKET_UPPER_BOUNDS = (
    10000, 50000, 100000, 250000, 500000,
    750000, 1000000, 1500000, 2500000,
)


def viewsRange(x, bounds=_VIEW_BUCKET_UPPER_BOUNDS):
    """Map a raw view count to its popularity bucket.

    Args:
        x: View count, or None for a missing value.
        bounds: Ascending inclusive upper bounds of the buckets. Defaults to
            the nine thresholds above, which yield labels 0.0 through 9.0.

    Returns:
        The index of the first bound that is >= x, as a float;
        float(len(bounds)) when x exceeds every bound; None when x is None.
    """
    # A null cell would otherwise raise TypeError on the comparison and abort
    # the whole Spark job; propagate the null instead.
    if x is None:
        return None
    for bucket, upper in enumerate(bounds):
        if x <= upper:
            return float(bucket)
    return float(len(bounds))

# Spark UDF wrapper around viewsRange; the result column is typed as FloatType.
viewsRangeUDF = udf(f=lambda views: viewsRange(views), returnType=FloatType())

In [None]:
# Bucketed view count produced by the Python UDF.
df_ALL = df_ALL.withColumn('viewlabel', viewsRangeUDF(f.col('views')))

# log10(count + 1) version of each raw counter column.
for log_column, raw_column in (
    ('log_views', 'views'),
    ('log_likes', 'likes'),
    ('log_dislikes', 'dislikes'),
    ('log_comment_count', 'comment_count'),
):
    df_ALL = df_ALL.withColumn(log_column, f.log10(f.col(raw_column) + 1))

# Views Classification

### Vectorize Features

In [None]:
# Build the classification dataset: the four predictor columns are packed into
# one 'features' vector and the bucketed view count becomes 'label'.
required_features = ['category_id', 'log_likes', 'log_dislikes', 'log_comment_count']
viewData = df_ALL
assembler = VectorAssembler(outputCol='features', inputCols=required_features)

start_time = time()
assembled = assembler.transform(viewData)
# Keep only the two columns the estimators read.
data_feature = assembled.select(col('viewlabel').alias('label'), assembled['features'])
print(f"Time elapsed for featurizing data: {time()-start_time:.2f}s")
del start_time

Time elapsed for featurizing data: 0.33s


In [None]:
## Data_split
# 80/20 train/test split. The fixed seed makes the split, and every score
# computed from it, repeatable across runs over the same input.
data_train, data_test = data_feature.randomSplit([0.8, 0.2], seed=42)
data_train.cache()
data_test.cache()
start_time = time()
# cache() is lazy: an action is needed to populate it. count() scans every
# partition without shipping rows to the driver, which collect() did only to
# discard the result.
data_train.count()
data_test.count()
print(f"Time elapsed for collecting data: {time()-start_time:.2f}s")
del start_time

Time elapsed for collecting data: 22.57s


### Models

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC, OneVsRest, NaiveBayes, FMClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import DecisionTreeRegressor, GBTRegressor

In [None]:
# Candidate classifiers for the bucketed view label. Binary-oriented learners
# are wrapped in OneVsRest; the others are used directly.
lr1 = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True, labelCol='label', featuresCol='features')
lr2 = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True, labelCol='label', featuresCol='features')
# OneVsRest needs a classifier as its base learner. This used to be a
# DecisionTreeRegressor, a regression estimator that is out of place in a
# pipeline scored with MulticlassClassificationEvaluator.
dt3 = DecisionTreeClassifier(labelCol='label', featuresCol='features')
rf4 = RandomForestClassifier(labelCol='label', featuresCol='features',maxDepth=5)
gbt5 = GBTClassifier(labelCol='label', featuresCol='features',maxIter=10)
# NOTE(review): `layers` lists sizes from the input layer to the output layer,
# so [11] and [16, 11] do not appear to match the 4 assembled features and the
# label buckets. Confirm the intended topology before adding cls6/cls7 to `cls`.
mlp6 = MultilayerPerceptronClassifier(maxIter=50, layers=[11], blockSize=32)
mlp7 = MultilayerPerceptronClassifier(maxIter=100, layers=[16,11], blockSize=32)
lsvc8 = LinearSVC(maxIter=10, regParam=0.1)
fmc9 = FMClassifier(labelCol='label', featuresCol='features', stepSize=0.001)
cls1 = OneVsRest(classifier=lr1)
cls2 = lr2
cls3 = OneVsRest(classifier=dt3)
cls4 = OneVsRest(classifier=rf4)
cls5 = OneVsRest(classifier=gbt5)
cls6 = mlp6
cls7 = mlp7
cls8 = OneVsRest(classifier=lsvc8)
cls9 = OneVsRest(classifier=fmc9)
# Only the first five candidates are trained; cls6..cls9 are defined but not run.
cls = [cls1, cls2, cls3, cls4, cls5]

In [None]:
# Fit every candidate in `cls` on the training split, report its test error
# (1 - accuracy) and wall-clock timings, and keep the fitted models.
models = []
# The evaluator does not depend on the loop variable, so build it once.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
for i, classifier in enumerate(cls):
    # Training phase.
    start_time = time()
    model = classifier.fit(data_train)
    print(f"Time elapsed for training data {i}: {time()-start_time:.2f}s")

    # Evaluation phase; the timer is restarted.
    start_time = time()
    predictions = model.transform(data_test)
    accuracy = evaluator.evaluate(predictions)
    print(f"Test Error for data {i} = %g" % (1.0 - accuracy))
    print(f"Time elapsed for evaluating data {i}: {time()-start_time:.2f}s")

    models.append(model)
    del start_time

# Views Regression

### Vectorization

In [None]:
# Build the view-count regression dataset: the four predictor columns are
# packed into one 'features' vector and log_views becomes 'label'.
required_features = ['category_id', 'log_likes', 'log_dislikes', 'log_comment_count']
viewData = df_ALL
assembler = VectorAssembler(outputCol='features', inputCols=required_features)

start_time = time()
assembled = assembler.transform(viewData)
# Keep only the two columns the estimators read.
data_feature = assembled.select(col('log_views').alias('label'), assembled['features'])
print(f"Time elapsed for featurizing data: {time()-start_time:.2f}s")
del start_time

In [None]:
## Data_split
# 80/20 train/test split. The fixed seed makes the split, and every score
# computed from it, repeatable across runs over the same input.
data_train, data_test = data_feature.randomSplit([0.8, 0.2], seed=42)
data_train.cache()
data_test.cache()
start_time = time()
# cache() is lazy: an action is needed to populate it. count() scans every
# partition without shipping rows to the driver, which collect() did only to
# discard the result.
data_train.count()
data_test.count()
print(f"Time elapsed for collecting data: {time()-start_time:.2f}s")
del start_time

Time elapsed for collecting data: 25.66s


### Models

In [None]:
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml import Pipeline

In [None]:
# Shared pre-processing stage for the tree-based pipelines, fitted once on the
# assembled data with maxCategories=40.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=40,
).fit(data_feature)

# Linear models read the raw 'features' column directly.
reg1 = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
reg2 = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=100, regParam=0.3)

# Tree models read the indexed vector, so each is chained after the indexer.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
rf = RandomForestRegressor(featuresCol="indexedFeatures")
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=50)
reg3 = Pipeline(stages=[featureIndexer, dt])
reg4 = Pipeline(stages=[featureIndexer, rf])
reg5 = Pipeline(stages=[featureIndexer, gbt])

# Display name paired with its estimator; `names` and `regs` stay index-aligned.
named_regressors = [
    ('Linear Regression', reg1),
    ('Generalized Linear Regression', reg2),
    ('Decision Tree Regressor', reg3),
    ('Random Forest Regressor', reg4),
    ('GBT Regressor', reg5),
]
regs = [estimator for _, estimator in named_regressors]
names = [label for label, _ in named_regressors]

In [None]:
# Fit every regressor in `regs` on the training split, score it with R2 on the
# test split, and collect the fitted models and scores in parallel lists.
models = []
acc_list = []
# The evaluator does not depend on the loop variable, so build it once.
evaluator = RegressionEvaluator(metricName='r2')
for i, regressor in enumerate(regs):
    # Training phase.
    start_time = time()
    model = regressor.fit(data_train)
    print(f"Time elapsed for training {names[i]}: {time()-start_time:.2f}s")

    # Evaluation phase; the timer is restarted.
    start_time = time()
    predictions = model.transform(data_test)
    accuracy = evaluator.evaluate(predictions)
    print(f"R2 score for {names[i]} = %g" % (accuracy))
    print(f"Time elapsed for evaluating {names[i]}: {time()-start_time:.2f}s")
    print("==================================================================")

    acc_list.append(accuracy)
    models.append(model)
    del start_time

Time elapsed for training Linear Regression: 1.15s
R2 score for Linear Regression = 0.685498
Time elapsed for evaluating Linear Regression: 0.23s
Time elapsed for training Generalized Linear Regression: 0.46s
R2 score for Generalized Linear Regression = 0.765101
Time elapsed for evaluating Generalized Linear Regression: 0.30s
Time elapsed for training Decision Tree Regressor: 2.14s
R2 score for Decision Tree Regressor = 0.828102
Time elapsed for evaluating Decision Tree Regressor: 0.23s
Time elapsed for training Random Forest Regressor: 5.45s
R2 score for Random Forest Regressor = 0.846993
Time elapsed for evaluating Random Forest Regressor: 0.41s
Time elapsed for training GBT Regressor: 130.65s
R2 score for GBT Regressor = 0.867724
Time elapsed for evaluating GBT Regressor: 0.93s


In [None]:
# Test-split R2 of each view-count regressor: one marker per model, joined by
# a line. Title and axis labels let the figure stand on its own.
plt.scatter(names,acc_list)
plt.plot(names, acc_list)
plt.xticks(rotation=45)
plt.xlabel('Model')
plt.ylabel('R2 on test split')
plt.title('Predicting log10(views + 1): R2 by model')
plt.show()

# Like Ratio Prediction

In [None]:
# log10 of (likes + 1) / (views + 1); the +1 on each count keeps both terms
# positive when a count is 0.
likes_per_view = (f.col('likes') + 1) / (f.col('views') + 1)
df_ALL = df_ALL.withColumn('log_likes_ratio', f.log10(likes_per_view))

In [None]:
# Build the like-ratio regression dataset: the four predictor columns are
# packed into one 'features' vector and log_likes_ratio becomes 'label'.
required_features = ['category_id', 'log_views', 'log_dislikes', 'log_comment_count']
viewData = df_ALL
assembler = VectorAssembler(outputCol='features', inputCols=required_features)

start_time = time()
assembled = assembler.transform(viewData)
# Keep only the two columns the estimators read.
data_feature = assembled.select(col('log_likes_ratio').alias('label'), assembled['features'])
print(f"Time elapsed for featurizing data: {time()-start_time:.2f}s")
del start_time

Time elapsed for featurizing data: 0.03s


In [None]:
## Data_split
# 80/20 train/test split. The fixed seed makes the split, and every score
# computed from it, repeatable across runs over the same input.
data_train, data_test = data_feature.randomSplit([0.8, 0.2], seed=42)
data_train.cache()
data_test.cache()
start_time = time()
# cache() is lazy: an action is needed to populate it. count() scans every
# partition without shipping rows to the driver, which collect() did only to
# discard the result.
data_train.count()
data_test.count()
print(f"Time elapsed for collecting data: {time()-start_time:.2f}s")
del start_time

Time elapsed for collecting data: 12.70s


## Models

In [None]:
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml import Pipeline

In [None]:
# Shared pre-processing stage for the tree-based pipelines, fitted once on the
# assembled data with maxCategories=40.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=40,
).fit(data_feature)

# Linear models read the raw 'features' column directly.
reg1 = LinearRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
reg2 = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=100, regParam=0.3)

# Tree models read the indexed vector, so each is chained after the indexer.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
rf = RandomForestRegressor(featuresCol="indexedFeatures")
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=50)
reg3 = Pipeline(stages=[featureIndexer, dt])
reg4 = Pipeline(stages=[featureIndexer, rf])
reg5 = Pipeline(stages=[featureIndexer, gbt])

# Display name paired with its estimator; `names` and `regs` stay index-aligned.
named_regressors = [
    ('Linear Regression', reg1),
    ('Generalized Linear Regression', reg2),
    ('Decision Tree Regressor', reg3),
    ('Random Forest Regressor', reg4),
    ('GBT Regressor', reg5),
]
regs = [estimator for _, estimator in named_regressors]
names = [label for label, _ in named_regressors]

In [None]:
# Fit every regressor in `regs` on the training split, score it with R2 on the
# test split, and collect the fitted models and scores in parallel lists.
models = []
acc_likelist = []
# The evaluator does not depend on the loop variable, so build it once.
evaluator = RegressionEvaluator(metricName='r2')
for i, regressor in enumerate(regs):
    # Training phase.
    start_time = time()
    model = regressor.fit(data_train)
    print(f"Time elapsed for training {names[i]}: {time()-start_time:.2f}s")

    # Evaluation phase; the timer is restarted.
    start_time = time()
    predictions = model.transform(data_test)
    accuracy = evaluator.evaluate(predictions)
    print(f"R2 score for {names[i]} = %g" % (accuracy))
    print(f"Time elapsed for evaluating {names[i]}: {time()-start_time:.2f}s")
    print("==================================================================")

    acc_likelist.append(accuracy)
    models.append(model)
    del start_time

Time elapsed for training Linear Regression: 0.95s
R2 score for Linear Regression = 0.0167303
Time elapsed for evaluating Linear Regression: 0.22s
Time elapsed for training Generalized Linear Regression: 0.40s
R2 score for Generalized Linear Regression = 0.25456
Time elapsed for evaluating Generalized Linear Regression: 0.25s
Time elapsed for training Decision Tree Regressor: 2.08s
R2 score for Decision Tree Regressor = 0.56605
Time elapsed for evaluating Decision Tree Regressor: 0.31s
Time elapsed for training Random Forest Regressor: 5.75s
R2 score for Random Forest Regressor = 0.592959
Time elapsed for evaluating Random Forest Regressor: 0.36s
Time elapsed for training GBT Regressor: 55.89s
R2 score for GBT Regressor = 0.741326
Time elapsed for evaluating GBT Regressor: 0.78s


In [None]:
# Test-split R2 of each like-ratio regressor: one marker per model, joined by
# a line. Title and axis labels let the figure stand on its own.
plt.scatter(names,acc_likelist)
plt.plot(names, acc_likelist)
plt.xticks(rotation=45)
plt.xlabel('Model')
plt.ylabel('R2 on test split')
plt.title('Predicting log10 like ratio: R2 by model')
plt.show()