In [1]:
import os
import findspark

In [2]:
# Initialise findspark before the pyspark imports in the next cell
# (presumably this makes the local Spark installation importable — TODO confirm).
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import udf, format_number

In [4]:
# Obtain the Spark session used by every later cell; the application is
# labelled "UsStock". getOrCreate() hands back a session, creating one if needed.
spark = (
    SparkSession.builder
    .appName("UsStock")
    .getOrCreate()
)

In [5]:
import os

# Locate the US stock overview snapshot relative to the current working
# directory. The relative path is assembled from its components with
# os.path.join rather than a hard-coded '\\' separator, so the notebook
# resolves the same file on Windows, Linux and macOS.
root_path = os.getcwd()
us_stock_overview_csv = os.path.join('history', '20250525154117.csv')
us_stock_overview_csv_path = os.path.join(root_path, us_stock_overview_csv)

In [6]:
# Load the Symbol -> Chinese industry name mapping and expose it to Spark SQL
# as CHINESE_STOCK_DATA so it can be joined with the English overview data.
# The relative path is built with os.path.join (not a hard-coded '\\') so the
# file is found regardless of the operating system's path separator.
chi_stock_overview_csv = os.path.join('history', '20250525163420.csv')
chi_stock_overview_csv_path = os.path.join(root_path, chi_stock_overview_csv)

# Explicit schema: the CSV header row is skipped (header=True) and the three
# columns are read with these names and types instead of being inferred.
schema = StructType([
    StructField("No", IntegerType(), True),
    StructField("Symbol", StringType(), True),
    StructField("Chi_Industry", StringType(), True)
])
chinese_df = spark.read.csv(chi_stock_overview_csv_path, header=True, schema=schema)
chinese_df.printSchema()
chinese_df.createOrReplaceTempView("CHINESE_STOCK_DATA")

root
 |-- No: integer (nullable = true)
 |-- Symbol: string (nullable = true)
 |-- Chi_Industry: string (nullable = true)



In [7]:
def percentage_to_float(percentage):
    """Convert a percentage string such as ``"43.34%"`` into a fraction (``0.4334``).

    Args:
        percentage: Text holding a number with an optional ``%`` sign, or
            ``None`` (which is what a Spark UDF receives for a NULL cell).

    Returns:
        The numeric value divided by 100 as a ``float``, or ``None`` when the
        input is ``None``, blank, or not parseable as a number (for example a
        ``"-"`` placeholder). Returning ``None`` lets Spark store NULL for
        that row instead of failing the whole job.
    """
    if percentage is None:
        return None

    # Tolerate whitespace around the value and around the percent sign.
    text = percentage.strip().strip('%').strip()
    if not text:
        return None

    try:
        return float(text) / 100
    except ValueError:
        # Placeholder / malformed cell: report "no value" rather than crash.
        return None

# Wrap the Python converter as a Spark UDF whose result column is FloatType.
percentage_to_float_udf = udf(f=percentage_to_float, returnType=FloatType())

# Make the UDF callable from Spark SQL text under the name
# "percentage_to_float_udf". Kept as the cell's final expression so the
# registered function is still echoed as the cell output.
spark.udf.register(name="percentage_to_float_udf", f=percentage_to_float_udf)

<pyspark.sql.udf.UserDefinedFunction at 0x23630084e50>

In [8]:
# Read the US overview CSV (first row is the header, column types inferred)
# and publish it to Spark SQL as ORIGINAL_STOCK_DATA.
original_df = spark.read.csv(
    us_stock_overview_csv_path,
    header=True,
    inferSchema=True,
)
original_df.createOrReplaceTempView("ORIGINAL_STOCK_DATA")

# Join each US stock with its Chinese industry name by Symbol, convert the
# "% Chg" text to a fraction via the registered UDF, and keep only rows with
# Market Cap above 100,000,000 whose Industry is not the '-' placeholder.
# The adjacent literals concatenate to one single-line SQL statement.
filtered_df_transformed = spark.sql(
    "SELECT T1.Symbol, percentage_to_float_udf(T1.`% Chg`) AS Chg, "
    "T1.`Market Cap`, T1.Industry, T2.Chi_Industry "
    "FROM ORIGINAL_STOCK_DATA T1 "
    "INNER JOIN CHINESE_STOCK_DATA T2 ON T1.Symbol = T2.Symbol "
    "WHERE T1.`Market Cap` > 100000000 and T1.Industry != '-'"
)
filtered_df_transformed.show()

+------+------+-----------------+--------------------+--------------------+
|Symbol|   Chg|       Market Cap|            Industry|        Chi_Industry|
+------+------+-----------------+--------------------+--------------------+
|  RAPT|0.4334|   1.6271508378E8|       Biotechnology|            生物技術|
|  LTBR|0.4259|     3.42580032E8|Electrical Equipm...|      電氣設備及零件|
|  DOUG|0.3551|    2.573397302E8|Real Estate Services|          房地產服務|
|  MRUS|0.3255|  3.81643658064E9|       Biotechnology|            生物技術|
|   NNE|0.3007|  1.34479564978E9|Specialty Industr...|        專用工業機械|
|   RGC|0.2684|     7.28720496E9|Drug Manufacturer...|專業與通用藥品製造商|
|   UEC|  0.25|   2.7653188974E9|             Uranium|                  鈾|
|  BSGM|0.2424|   1.4411550912E8|     Medical Devices|            醫療設備|
|  ALTS|0.2409|    1.555210206E8|Software - Applic...|            應用軟件|
|    EU|0.2313|   3.6693472357E8|             Uranium|                  鈾|
|  OKLO|0.2304|  6.80295807513E9|Utilities - Regul...|    

In [9]:
# Print the column names and types of the joined/filtered frame, e.g. to check
# the type of the UDF-produced Chg column.
filtered_df_transformed.printSchema()

root
 |-- Symbol: string (nullable = true)
 |-- Chg: float (nullable = true)
 |-- Market Cap: double (nullable = true)
 |-- Industry: string (nullable = true)
 |-- Chi_Industry: string (nullable = true)



In [10]:
# Publish the filtered frame so it can be aggregated and self-joined in SQL.
filtered_df_transformed.createOrReplaceTempView("TRANSFORMED_STOCK_DATA")

# Per (Industry, Chi_Industry) group: rounded average change, number of
# stocks, and the highest single-stock change; then join back to the stock
# rows to name the stock that achieved that highest change.
#
# The join matches on Chi_Industry as well as Industry because the subquery
# groups by both columns; matching on Industry alone could attribute a group's
# best change to a stock from a different Chi_Industry group with an equal Chg.
# `<=>` is null-safe equality, so groups whose Chi_Industry is NULL still match.
#
# Note: stocks tied for the highest change in a group each produce a row.
avg_industry_with_best_contribution_df = spark.sql(
    "SELECT T1.Industry, T1.Chi_Industry, T1.`Total Industry Change Rate`, "
    "T1.`Total Stock`, T1.`Best Contribution`, T2.Symbol AS `Best Stock` "
    "FROM (SELECT Industry, Chi_Industry, "
    "ROUND(AVG(Chg),4) AS `Total Industry Change Rate`, "
    "COUNT(Industry) AS `Total Stock`, "
    "MAX(Chg) AS `Best Contribution` "
    "FROM TRANSFORMED_STOCK_DATA GROUP BY Industry, Chi_Industry) T1 "
    "INNER JOIN TRANSFORMED_STOCK_DATA T2 "
    "ON T1.Industry = T2.Industry "
    "AND T1.Chi_Industry <=> T2.Chi_Industry "
    "AND T1.`Best Contribution` = T2.Chg "
    "ORDER BY T1.`Total Industry Change Rate` DESC"
)
avg_industry_with_best_contribution_df.show()

+--------------------+------------------+--------------------------+-----------+-----------------+----------+
|            Industry|      Chi_Industry|Total Industry Change Rate|Total Stock|Best Contribution|Best Stock|
+--------------------+------------------+--------------------------+-----------+-----------------+----------+
|             Uranium|                鈾|                    0.1789|         10|             0.25|       UEC|
|Electrical Equipm...|    電氣設備及零件|                    0.0441|         29|           0.4259|      LTBR|
|Other Precious Me...|其他工業金屬與採礦|                    0.0405|         13|           0.1053|      ITRG|
|              Copper|                銅|                    0.0319|          7|           0.0389|       HBM|
|        Broadcasting|              廣播|                     0.027|         10|            0.214|     GTN.A|
|                Gold|              黃金|                    0.0256|         42|           0.0861|       IDR|
|Real Estate - Dev...|        房地