### Data Mining Assignment #3

#### Group 27: Max Beinhauer, Davis Siemens
#### Dataset: https://snap.stanford.edu/data/ego-Twitter.html

In [11]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from pyspark.sql import Row


In [12]:
# Uncomment and run the line below to stop the Spark session if an error occurs during execution
# spark.stop()

### Sanity checks on graph

In [None]:
# Create (or reuse) the Spark session used by every cell below.
# The GraphFrames package is pulled in through spark.jars.packages.
# NOTE: getOrCreate() returns an already-running session if one exists, in
# which case the package setting may not be applied -- call spark.stop()
# first (or restart the kernel) when changing this configuration.
spark = (
    SparkSession.builder
    # App name shown in the Spark UI / logs; matches this notebook (Assignment #3).
    .appName("Assignment3")
    .config(
        "spark.jars.packages",
        "io.graphframes:graphframes-spark4_2.13:0.10.0"
    )
    .getOrCreate()
)


# Read the graph file: each line holds one edge as two whitespace-separated integer ids
file_path = "data/twitter_combined.txt"
graph_rdd = spark.sparkContext.textFile(file_path)

# Show first few lines
print("Sample lines from the graph file:")
print(graph_rdd.take(5))

# Create edges RDD (u, v).
# Blank lines are skipped so the endpoint indexing below cannot hit an empty tuple.
edges = (
    graph_rdd
    .filter(lambda line: line.strip())
    .map(lambda line: tuple(map(int, line.split())))
)

# Make edges undirected by sorting endpoints, e.g. (3, 10) and (10, 3) -> (3, 10)
undirected_edges = edges.map(lambda e: (min(e[0], e[1]), max(e[0], e[1])))

# Remove duplicate undirected edges.
# NOTE: despite the "_df" suffix this is an RDD of Row(src, dst), not a DataFrame;
# the name is kept unchanged so that any later cell referring to it keeps working.
# It is cached because three separate jobs below consume it (the count, the
# vertex extraction and the edge DataFrame); without caching each of them would
# re-read the file and repeat the distinct() shuffle.
undirected_edges_df = (
    undirected_edges
    .distinct()
    .map(lambda e: Row(src=e[0], dst=e[1]))
    .cache()
)

print("Distinct undirected edges:", undirected_edges_df.count())

# Create vertices DataFrame: a Row is a tuple, so flattening each edge yields
# its two endpoint ids, which are then de-duplicated into a single "id" column
vertices = (
    undirected_edges_df
    .flatMap(lambda edge: edge)
    .distinct()
    .map(lambda vid: Row(id=vid))
    .toDF()
)

# Create edges DataFrame (the elements are already Row(src, dst), so no re-mapping is needed)
edges_df = undirected_edges_df.toDF()

# Build GraphFrame
graph = GraphFrame(vertices, edges_df)

# Show stats (sanity check against the vertex count published for the dataset)
print("Expected Vertices: 81306 - Actual Vertices: ", graph.vertices.count())
print("Edges:", graph.edges.count())

Sample lines from the graph file:
['214328887 34428380', '17116707 28465635', '380580781 18996905', '221036078 153460275', '107830991 17868918']


                                                                                

Distinct undirected edges: 1342310


                                                                                

Expected Vertices: 81306 - Actual Vertices:  81306


[Stage 98:>                                                         (0 + 2) / 2]

Edges: 1342310


                                                                                

25/11/19 12:51:23 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 913853 ms exceeds timeout 120000 ms
25/11/19 12:51:24 WARN SparkContext: Killing executors is not supported by current scheduler.
25/11/19 12:51:24 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:81)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:669)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1296)
	at o

### Part 1: Reservoir Sampling 