In [8]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import HiveContext
from pyspark.sql import functions as F
from pyspark.sql.types import *

from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import Word2Vec

import pandas as pd

import re

import random

import socket

import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
# Resolve this machine's own IP address in two steps: look up the local
# hostname, then resolve that name to an IPv4 address string.
local_hostname = socket.gethostname()
driver_ip = socket.gethostbyname(local_hostname)

In [10]:
# Spark configuration for running against Kubernetes. Each entry is a
# (key, value) pair; every value is passed as a string.
conf = pyspark.SparkConf().setAll([
    # Kubernetes API authentication: CA certificate file, OAuth token file,
    # and the service account name used for the driver.
    ('spark.kubernetes.authenticate.caCertFile', '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'),
    ('spark.kubernetes.authenticate.oauthTokenFile', '/var/run/secrets/kubernetes.io/serviceaccount/token'),
    ('spark.kubernetes.authenticate.driver.serviceAccountName', 'spark-driver-sa'),

    # Namespace, driver pod name, number of executors and container image.
    ('spark.kubernetes.namespace', 'spark'),
    ('spark.driver.pod.name', 'spark-driver'),
    ('spark.executor.instances', '8'),
    ('spark.kubernetes.container.image', 'gcr.io/sarcasm-3wx3ce6drvftuy/spark-v2.4.4-worker:latest'),

    # Driver networking: the advertised host name and port, with the bind
    # address set to the IP resolved into `driver_ip`.
    ('spark.driver.host', 'spark-driver.spark.svc.cluster.local'),
    ('spark.driver.port', '29413'),
    ('spark.driver.bindAddress', driver_ip),

    # Per-executor memory and cores.
    ('spark.executor.memory', '5000m'),
    ('spark.executor.cores', '1'),

    # Google Cloud project id and credentials file path exported to the
    # driver environment.
    ('spark.kubernetes.driverEnv.GCS_PROJECT_ID', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.kubernetes.driverEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),

    # The `sarc-bucket-sa` secret is mounted at /mnt/secrets for both the
    # driver and the executors.
    ('spark.kubernetes.driver.secrets.sarc-bucket-sa', '/mnt/secrets'),
    ('spark.kubernetes.executor.secrets.sarc-bucket-sa', '/mnt/secrets'),

    # The same project id and credentials path for the executor environment.
    ('spark.executorEnv.GCS_PROJECT_ID', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.executorEnv.GOOGLE_APPLICATION_CREDENTIALS', '/mnt/secrets/sarc-bucket-sa.json'),

    # Hadoop-side Google Cloud Storage settings: service-account auth,
    # key file, project id and system bucket.
    ('spark.hadoop.google.cloud.auth.service.account.enable', 'true'),
    ('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/mnt/secrets/sarc-bucket-sa.json'),
    ('spark.hadoop.fs.gs.project.id', 'sarcasm-3wx3ce6drvftuy'),
    ('spark.hadoop.fs.gs.system.bucket', 'sarc-bucket-3wx3ce6drvftuy'),
])

In [11]:
# Create (or reuse) the SparkSession for the "sarc" application, pointing the
# master at the in-cluster Kubernetes API and applying the settings in `conf`.
spark = (
    SparkSession.builder
    .master("k8s://https://kubernetes.default.svc.cluster.local:443")
    .appName("sarc")
    .config(conf=conf)
    .getOrCreate()
)

In [12]:
# Keep a handle on the session's underlying SparkContext; it is used later to
# stop the context via sc.stop().
sc = spark.sparkContext

In [13]:
# Read in truncated sarc table:

%time tsarc = spark.read.csv("gs://sarc-bucket-3wx3ce6drvftuy/science.csv", inferSchema=True, header=False, sep = ',')



CPU times: user 3 ms, sys: 2.39 ms, total: 5.38 ms
Wall time: 17.6 s


In [22]:
# Replace Spark's auto-generated column names (_c0, _c1, _c2) with
# descriptive ones.
tsarc = (
    tsarc
    .withColumnRenamed('_c0', 'label')
    .withColumnRenamed('_c1', 'subreddit')
    .withColumnRenamed('_c2', 'context')
)


In [14]:
# Get number of samples (rows) in the full table; %time reports how long the count takes:

%time tsarc.count()

CPU times: user 1.72 ms, sys: 247 µs, total: 1.97 ms
Wall time: 4.39 s


242514

In [24]:
# Draw a 10% random sample without replacement; the fixed seed keeps the
# sample specification the same from run to run.
samp = tsarc.sample(withReplacement=False, fraction=0.1, seed=1)

In [25]:
# Report how many rows the sample contains, then preview it: the first 20
# rows with long strings truncated (spelled out here; these are show()'s
# defaults).
print(samp.count())
samp.show(n=20, truncate=True)

24256
+-----+---------+--------------------+
|label|subreddit|             context|
+-----+---------+--------------------+
|    0|  science|A second age of A...|
|    0|  science|Why the hell woul...|
|    0|  science|"""The verdict is...|
|    0|  science|Electric Bugs: Ne...|
|    0|  science|NASA's Guide to v...|
|    0|  science|Astronomers disco...|
|    0|  science|What a worthless ...|
|    0|  science|even if there wer...|
|    0|  science|I need a device l...|
|    0|  science|This is how feel,...|
|    0|  science|Sigh. Don't drive...|
|    0|  science|Decoding brainwav...|
|    0|  science|TIL there is an e...|
|    0|  science|Wat. If you force...|
|    0|  science|"...because diabe...|
|    0|  science|One in 10 kids fo...|
|    0|  science|Drinking in the s...|
|    0|  science|Do animals taste ...|
|    0|  science|"My to do list fo...|
|    0|  science|It's cute that yo...|
+-----+---------+--------------------+
only showing top 20 rows



In [6]:
# Shut down the SparkContext once the work in this notebook is finished.
# NOTE(review): presumably this also releases the executors requested in the
# Spark configuration — confirm on the cluster.
sc.stop()