In [6]:
import pyspark
from pyspark.sql import *
from pyspark import SparkContext, SparkConf

from pyspark.sql import *
from pyspark.sql.functions import *

import pandas as pd
import sqlite3

In [37]:
# Spark configuration: application name "appName", running against the
# "local" master.
conf = SparkConf().setAppName("appName").setMaster("local")

# getOrCreate() returns the already-running SparkContext when there is one,
# so re-executing this cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# The session builder attaches to the active SparkContext created above.
spark = SparkSession.builder.getOrCreate()


In [88]:

import os.path
from os import listdir
from os.path import isfile, join

import subprocess

# SQL scripts (relative to ../sql/) that are concatenated, in this order,
# into a single script used to build the SQLite database.
files = [
    "people.sql",
    "cylons.sql",
    "colonies.sql",
    "person_kills.sql",
]

# Read every script that exists; a missing file is reported and skipped.
sql_parts = []
for filename in files:
    filename = "../sql/" + filename
    if not os.path.isfile(filename):
        print('File does not exist: ' + filename)
        continue
    with open(filename) as f:
        sql_parts.append(f.read())

# Join with a newline so that a script lacking a trailing newline cannot run
# into the first statement of the next script.
content = "\n".join(sql_parts)

# Write the combined script, creating the output directory on first use.
generated_sql_file = "../sql/generated/simple.sql"
os.makedirs(os.path.dirname(generated_sql_file), exist_ok=True)
with open(generated_sql_file, "w") as f:
    f.write(content)

# Persist the shell command as a script so it can also be run outside the
# notebook.
db_command_file = "../generate_db.sh"
db_command = "sqlite3 ../simple.db < ../sql/generated/simple.sql"

with open(db_command_file, "w") as file:
    file.write(db_command)

# Run the script. A failure is reported rather than raised, and the captured
# stderr is printed so the actual sqlite3/bash error message is visible.
try:
    result = subprocess.run(["bash", db_command_file], check=True, text=True, capture_output=True)
    print("Shell script output:", result.stdout)
except subprocess.CalledProcessError as e:
    print("Error occurred while running the script:", e)
    print("Script stderr:", e.stderr)


Error occurred while running the script: Command '['bash', '../generate_db.sh']' returned non-zero exit status 1.


In [80]:

# Open a connection to the SQLite database file used by the queries below.
db_path = "../simple.db"
conn = sqlite3.connect(db_path)

In [82]:

# Load each table into its own DataFrame. home_colony_id is read as the
# nullable "Int64" dtype.
df_people = pd.read_sql_query(
    "SELECT * FROM people",
    conn,
    dtype={"home_colony_id": "Int64"},
)
df_colonies, df_cylons, df_person_kills = (
    pd.read_sql_query(query, conn)
    for query in (
        "SELECT * FROM colonies",
        "SELECT * FROM cylons",
        "SELECT * FROM person_kills",
    )
)

# Show the dtype that home_colony_id was loaded with.
print("type: ")
print(df_people["home_colony_id"].dtype)

# Preview the first rows of every frame, separated by a blank line.
previews = [
    ("df_people", df_people),
    ("df_colonies", df_colonies),
    ("df_cylons", df_cylons),
    ("df_person_kills", df_person_kills),
]
for position, (label, frame) in enumerate(previews):
    if position > 0:
        print()
    print(label)
    print(frame.head())

# Export the two frames that are later read back through Spark.
for frame, csv_name in ((df_people, "people.csv"), (df_colonies, "colonies.csv")):
    frame.to_csv(csv_name, index=False)


type: 
Int64
df_people
   id first_name last_name salutation call_sign  home_colony_id
0   1        Lee     Adama        Sir    Apollo               4
1   2    William     Adama        Sir    Husker               4
2   3       Kara    Thrace        Sir  Starbuck               4
3   4      Gaius    Baltar        Dr.      None               1
4   5      Laura    Roslin        Ms.      None               4

df_colonies
   id      name
0   1   Aerilon
1   2   Aquaria
2   3  Canceron
3   4   Caprica
4   5   Gemenon

df_cylons
   id  model_number       type gender
0   1           NaN  IL-series   None
1   2           NaN  centurion   None
2   3           NaN     raider   None
3   4           1.0   humanoid    man
4   5           2.0   humanoid    man

df_person_kills
Empty DataFrame
Columns: [person_killer_id, person_victim_id, cylon_victim_id]
Index: []


In [None]:

# Convert the people DataFrame into a list of row lists and distribute it as
# an RDD.
people_rows = df_people.values.tolist()
rdd_people = sc.parallelize(people_rows)

# Bring every row back to the driver; as the last expression it is displayed.
rdd_people.collect()

In [51]:
# Read the exported CSV files back as Spark DataFrames, using the first line
# of each file as the column names.
bsg_people = spark.read.csv("people.csv", header=True)
bsg_colonies = spark.read.csv("colonies.csv", header=True)

# Match each person to the colony whose id equals their home_colony_id.
colony_match = bsg_people["home_colony_id"] == bsg_colonies["id"]
bsg_data = bsg_people.join(bsg_colonies, on=colony_match)

bsg_data.show()

+---+----------+---------+----------+---------+--------------+---+----------+
| id|first_name|last_name|salutation|call_sign|home_colony_id| id|      name|
+---+----------+---------+----------+---------+--------------+---+----------+
|  1|       Lee|    Adama|       Sir|   Apollo|             4|  4|   Caprica|
|  2|   William|    Adama|       Sir|   Husker|             4|  4|   Caprica|
|  3|      Kara|   Thrace|       Sir| Starbuck|             4|  4|   Caprica|
|  4|     Gaius|   Baltar|       Dr.|     NULL|             1|  1|   Aerilon|
|  5|     Laura|   Roslin|       Ms.|     NULL|             4|  4|   Caprica|
|  9|      Saul|     Tigh|       Sir|     NULL|            13| 13|     Earth|
| 11|     Billy|  Keikeya|       Mr.|     NULL|             8|  8|     Picon|
| 12|     Galen|    Tyrol|       Sir|     NULL|            13| 13|     Earth|
| 13| Anastasia|   Dualla|       Sir|     NULL|             9|  9|Sagittaron|
| 16|      Karl|  Agathon|       Sir|     Helo|             4|  

In [52]:
# Count the joined rows per home colony, largest group first.
rows_per_colony = bsg_data.groupBy("home_colony_id").count()
colony_counts = rows_per_colony.orderBy(desc("count"))

colony_counts.show()

+--------------+-----+
|home_colony_id|count|
+--------------+-----+
|             4|    5|
|            13|    3|
|             8|    1|
|             9|    1|
|             1|    1|
+--------------+-----+



In [36]:
# Teardown: close the SQLite connection, then stop the Spark context.
conn.close()
sc.stop()