## Import Spark

In [2]:
import findspark
findspark.init()

## Exercise 2:
### - Given a dataset of 10,000,000 arbitrary numbers in a text file
### - Find all prime numbers in the given dataset
### - Save result under a new text file


**Generate a dataset of 10M arbitrary numbers and save to a text file**

In [13]:
import random

# Generate 1000 random numbers
random_numbers = [random.randint(1, 100000000) for _ in range(10000000)]

# Save the numbers to a text file
with open("numbers.txt", "w") as file:
    for number in random_numbers:
        file.write(f"{number}\n")

print("Dataset of 10,000,000 arbitrary numbers has been generated and saved to numbers.txt")

Dataset of 10,000,000 arbitrary numbers has been generated and saved to numbers.txt


**Check for numbers in the file**

In [14]:
with open("numbers.txt") as file:
    line_count = sum(1 for line in file)

print(f"Number of lines in the file: {line_count}")

Number of lines in the file: 10000000


**Create a Spark Session**

In [15]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PrimeNumberFinder") \
    .getOrCreate()


**Load the dataset**

In [16]:
numbers_rdd = spark.sparkContext.textFile("numbers.txt")

In [17]:
# Convert the numbers in RDD to integers
numbers_rdd = numbers_rdd.map(lambda x: int(x))

**Define a function to check if a number is prime**

In [18]:
def is_prime(n):
    if n <= 1:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    max_divisor = int(n**0.5) + 1
    for d in range(3, max_divisor, 2):
        if n % d == 0:
            return False
    return True


In [19]:
# Filter the prime numbers
prime_numbers_rdd = numbers_rdd.filter(is_prime)

In [20]:
prime_numbers_rdd.collect()

[99878633,
 14868577,
 89336851,
 2103671,
 56559907,
 60772223,
 64829929,
 95869789,
 69817933,
 211543,
 3387817,
 88836073,
 87962429,
 77973949,
 74994317,
 98952983,
 9759539,
 60404671,
 77258759,
 50690791,
 20807947,
 14607389,
 32450227,
 55848797,
 69959321,
 49064077,
 13900501,
 74813113,
 22177657,
 80269627,
 74532907,
 7774601,
 36234503,
 29675969,
 13907603,
 42355919,
 24064267,
 46238371,
 62638439,
 58541849,
 16974499,
 14810347,
 47175617,
 28506749,
 12733751,
 7584977,
 12758719,
 5576839,
 83365187,
 74721161,
 225373,
 71363471,
 53070301,
 89935843,
 11292769,
 70746127,
 26899793,
 23066717,
 84657967,
 79233503,
 43570883,
 93084727,
 84084331,
 1573771,
 63026539,
 22905979,
 73527263,
 2405369,
 93024653,
 15206161,
 27383381,
 99384661,
 49092149,
 51660559,
 36714277,
 84429823,
 93589417,
 94664639,
 34993747,
 33129871,
 97266557,
 20771633,
 22937207,
 62609293,
 82827967,
 61620071,
 3983131,
 71005829,
 59656591,
 63999931,
 24523013,
 28962431,
 

In [21]:
prime_numbers_rdd.count()

576254

**Save result under a new text file**


In [22]:
prime_numbers_rdd.saveAsTextFile("prime_numbers.txt")

In [10]:
spark.stop()