## Import Spark

In [1]:
import findspark
findspark.init()

## Exercise 2:
### - Given a dataset of 10,000,000 arbitrary numbers in a text file
### - Find all prime numbers in the given dataset
### - Save result under a new text file


**Generate a dataset of 10M arbitrary numbers and save to a text file**

In [1]:
import random

# Generate 1000 random numbers
random_numbers = [random.randint(1, 100000000) for _ in range(100000000)]

# Save the numbers to a text file
with open("F:/FPT/DE_assignments/080824/SparkScripts/input/numbers_large.txt", "w") as file:
    for number in random_numbers:
        file.write(f"{number}\n")

print("Dataset of 100,000,000 arbitrary numbers has been generated and saved to numbers.txt")

Dataset of 100,000,000 arbitrary numbers has been generated and saved to numbers.txt


**Check for numbers in the file**

In [2]:
with open("data/numbers.txt") as file:
    line_count = sum(1 for line in file)

print(f"Number of lines in the file: {line_count}")

Number of lines in the file: 10000000


**Create a Spark Session**

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PrimeNumberFinder") \
    .getOrCreate()


**Load the dataset**

In [4]:
numbers_rdd = spark.sparkContext.textFile("data/numbers.txt")

In [5]:
numbers_rdd.take(5)

['12457723', '6315970', '32660038', '50349225', '3905249']

In [6]:
# Convert the numbers in RDD to integers
numbers_rdd = numbers_rdd.map(lambda x: int(x))

**Define a function to check if a number is prime**

In [7]:
def is_prime(n):
    if n <= 1:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    max_divisor = int(n**0.5) + 1
    for d in range(3, max_divisor, 2):
        if n % d == 0:
            return False
    return True


In [8]:
# Filter the prime numbers
prime_numbers_rdd = numbers_rdd.filter(is_prime)

In [9]:
prime_numbers_rdd.collect()

[55468241,
 46762817,
 25528619,
 21885559,
 95064113,
 51431329,
 44494279,
 3294817,
 95501983,
 47266157,
 16387433,
 86297383,
 82550899,
 57621973,
 52531849,
 26532787,
 528709,
 31753417,
 92809751,
 36375809,
 36060911,
 34658389,
 94492819,
 20044873,
 30985343,
 52351097,
 65245937,
 105019,
 88043843,
 75442781,
 77102897,
 36182719,
 48367633,
 67617229,
 42196757,
 49249367,
 43958029,
 548771,
 46681913,
 36825167,
 313543,
 45076517,
 76672279,
 87297871,
 48709963,
 42700291,
 64624349,
 73362827,
 28425389,
 76438541,
 92240341,
 24484987,
 46730351,
 21578731,
 4189183,
 13975981,
 93875861,
 50913869,
 99816581,
 42253727,
 34095169,
 4807081,
 73390349,
 32317147,
 44065517,
 89537933,
 25534351,
 9657707,
 66803053,
 69865811,
 36104129,
 94090453,
 1986689,
 9859601,
 49297973,
 687413,
 70086959,
 79658767,
 84456971,
 27699299,
 24922441,
 55169929,
 38465183,
 86107949,
 72143287,
 27647591,
 15075497,
 43101451,
 18233023,
 35843893,
 34144661,
 5899807,
 4867

In [11]:
prime_numbers_rdd.count()

576800

**Save result under a new text file**


In [12]:
prime_numbers_rdd.saveAsTextFile("data/prime_numbers.txt")

In [13]:
spark.stop()