In [35]:
import random
from pyspark.sql import SparkSession
from pyspark.sql.functions import skewness, count, spark_partition_id, col
from pyspark.sql.types import StructType, StructField, IntegerType

# Create a SparkSession (getOrCreate reuses one that is already running)
spark = SparkSession.builder \
    .appName("Example") \
    .getOrCreate()

# Generate 11000 random (key, value) rows: key in 0..9999, value in 1..2
data = [(random.randint(0, 9999), random.randint(1, 2)) for _ in range(11000)]

# Define schema: two nullable integer columns
schema = StructType([
    StructField("key", IntegerType(), True),
    StructField("value", IntegerType(), True)
])

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Repartition DataFrame by the "key" column
df = df.repartition("key")

# Show DataFrame
df.show()

# Skewness of the values in each column (this describes the value
# distribution, not how rows are spread over partitions).
# The loop variable is deliberately not called `col`, to keep it distinct
# from the imported pyspark `col` function used further down.
skew_result = df.select([skewness(column_name).alias(column_name) for column_name in df.columns])
skew_result.show()

# Add partitionId column
df = df.withColumn("partitionId", spark_partition_id())

# Count keys per partition, largest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(count("key").alias("count"))
    .orderBy(col("count").desc())
)

# Display the per-partition row counts
skew_df.show()

+----+-----+
| key|value|
+----+-----+
|2142|    1|
|2122|    1|
|9427|    1|
|9465|    1|
|1591|    1|
|1342|    2|
|2366|    1|
|2866|    1|
|1342|    2|
|5803|    2|
| 833|    1|
|4935|    1|
|9427|    2|
|1645|    1|
| 148|    2|
| 148|    2|
|2142|    2|
|1580|    2|
| 496|    1|
|7880|    1|
+----+-----+
only showing top 20 rows



In [37]:
from pyspark.sql.functions import skewness

# One skewness aggregate per column of df, each aliased back to the
# name of the column it was computed from.
skew_result = df.select(
    [skewness(column_name).alias(column_name) for column_name in df.columns]
)
skew_result.show()


+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|3.536978925942362E-4|-7.27272775356920...|
+--------------------+--------------------+



In [38]:
# Import everything this cell uses so it does not depend on names left
# behind in the kernel by cells that are no longer in the notebook.
from pyspark.sql.functions import col, count, spark_partition_id

# Add partitionId column
df = df.withColumn("partitionId", spark_partition_id())

# Count keys per partition, largest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(count("key").alias("count"))
    .orderBy(col("count").desc())
)

# Display the per-partition row counts
skew_df.show()



+-----------+-----+
|partitionId|count|
+-----------+-----+
|         97|   92|
|         21|   90|
|        117|   76|
|         77|   75|
|        162|   75|
|        106|   75|
|         35|   74|
|        114|   74|
|         69|   73|
|        112|   72|
|        103|   71|
|        147|   71|
|        122|   70|
|        175|   69|
|         25|   69|
|        115|   69|
|         18|   69|
|          4|   69|
|        150|   68|
|        198|   68|
+-----------+-----+
only showing top 20 rows



                                                                                

In [39]:
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import skewness, count, spark_partition_id, col

# Obtain the SparkSession for this example
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# 11000 random rows: a key in 0..9999 paired with a value of 1 or 2
data = [(random.randint(0, 9999), random.randint(1, 2)) for _ in range(11000)]

# Both columns are nullable integers
schema = StructType([
    StructField(field_name, IntegerType(), True) for field_name in ("key", "value")
])

# Build the DataFrame and repartition it by "key"
df = spark.createDataFrame(data, schema).repartition("key")

# Preview the rows
df.show()

# Skewness of the values in each column, aliased to the column's own name
skew_result = df.select(
    [skewness(column_name).alias(column_name) for column_name in df.columns]
)
skew_result.show()

# Tag every row with the id of the partition that holds it
df = df.withColumn("partitionId", spark_partition_id())

# Keys counted per partition, biggest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(count("key").alias("count"))
    .orderBy(col("count").desc())
)

# Show the per-partition counts
skew_df.show()


+----+-----+
| key|value|
+----+-----+
|1088|    2|
|7982|    2|
|3997|    2|
|8389|    2|
|6357|    1|
|5518|    2|
|9376|    2|
|7833|    2|
|7253|    1|
|8638|    1|
|4101|    2|
|9852|    2|
|8086|    1|
|6336|    2|
|2659|    2|
|4818|    2|
| 463|    1|
|2122|    2|
|5803|    1|
|5803|    2|
+----+-----+
only showing top 20 rows

+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|-0.01051545026188...|-3.63636369646860...|
+--------------------+--------------------+





+-----------+-----+
|partitionId|count|
+-----------+-----+
|         97|   88|
|        113|   86|
|         21|   86|
|        150|   84|
|         13|   80|
|        176|   77|
|        117|   77|
|        162|   76|
|        100|   76|
|        143|   74|
|         86|   74|
|        102|   74|
|        190|   73|
|        163|   72|
|         23|   72|
|        110|   72|
|         51|   71|
|         83|   71|
|        121|   70|
|        171|   70|
+-----------+-----+
only showing top 20 rows



                                                                                

In [40]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Create a SparkSession (getOrCreate reuses one that is already running)
spark = SparkSession.builder \
    .appName("Skewness Example") \
    .getOrCreate()

# Seed rows for a single string column: three "A", two "B", one "C"
data = [("A",), ("A",), ("A",), ("B",), ("B",), ("C",)]
columns = ["col"]
df = spark.createDataFrame(data, columns)

# 999 back-to-back copies of the seed rows (5994 rows) form the large side
skewed_data = data * 999
skewed_df = spark.createDataFrame(skewed_data, columns)

# Union the small original DataFrame with the large replicated one
df = df.union(skewed_df)

# Add a column to the DataFrame with the partition ID
df = df.withColumn("partitionId", F.spark_partition_id())

# Compute the number of records in each partition, largest first
skew_df = df.groupBy("partitionId").count().orderBy("count", ascending=False)

# Display the results
skew_df.show()

# The session is deliberately NOT stopped here: a later cell still
# evaluates `df`, which fails once the session has been shut down.
# Call spark.stop() in the final cell of the notebook instead.


+-----------+-----+
|partitionId|count|
+-----------+-----+
|         15|  751|
|         10|  749|
|         13|  749|
|          9|  749|
|          8|  749|
|         11|  749|
|         12|  749|
|         14|  749|
|          3|    1|
|          5|    1|
|          1|    1|
|          6|    1|
|          7|    1|
|          2|    1|
+-----------+-----+



+---+
|col|
+---+
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
+---+



+---+
|col|
+---+
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
+---+
only showing top 20 rows



+---+-----------+
|col|partitionId|
+---+-----------+
|  A|          0|
|  A|          0|
|  A|          0|
|  B|          0|
|  B|          0|
|  C|          0|
|  A|          1|
|  A|          1|
|  A|          1|
|  B|          1|
|  B|          1|
|  C|          1|
|  A|          1|
|  A|          1|
|  A|          1|
|  B|          1|
|  B|          1|
|  C|          1|
|  A|          1|
|  A|          1|
+---+-----------+
only showing top 20 rows



In [84]:
# Tally the rows held by each partition, highest partition id first.
# Note: this rebinds `df` to the aggregated (partitionId, count) result.
df = (
    df.groupBy("partitionId")
    .count()
    .orderBy("partitionId", ascending=False)
)
df.show()



+-----------+-----+
|partitionId|count|
+-----------+-----+
|          1| 5994|
|          0|    6|
+-----------+-----+



                                                                                

In [85]:
# Spread df1 (defined outside this cell) over 10 partitions, tag each row
# with its partition id as "part", then count rows per partition in
# ascending partition order.
re_df = (
    df1.repartition(10)
    .withColumn("part", spark_partition_id())
    .groupBy("part")
    .count()
    .orderBy("part", ascending=True)
)
re_df.show()

# Last expression of the cell: partition count of the aggregated result
re_df.rdd.getNumPartitions()

                                                                                

+----+-----+
|part|count|
+----+-----+
|   0|  599|
|   1|  600|
|   2|  600|
|   3|  600|
|   4|  600|
|   5|  600|
|   6|  601|
|   7|  600|
|   8|  600|
|   9|  600|
+----+-----+



                                                                                

10

In [1]:
def max_profit(prices):
    """Return the largest gain from buying once and selling at or after that point.

    Args:
        prices: Sequence of prices in chronological order.

    Returns:
        The maximum of ``prices[j] - prices[i]`` over all ``i <= j``;
        0 when ``prices`` is empty or no later price exceeds an earlier one.
    """
    if not prices:
        return 0

    lowest_seen = prices[0]
    best_gain = 0

    for current in prices:
        # Track the cheapest price seen so far (ties keep the earlier value).
        if current < lowest_seen:
            lowest_seen = current
        # Gain from selling now after buying at that cheapest price.
        gain = current - lowest_seen
        if gain > best_gain:
            best_gain = gain

    return best_gain

# Example usage: the best trade buys at 1 and sells at 6, so this prints 5.
prices = [7, 1, 5, 3, 6, 4]
print(max_profit(prices))

5


                                                                                

In [44]:

# df4.show()

                                                                                

200


                                                                                

200


                                                                                

200

                                                                                

200




200


                                                                                

                                                                                

                                                                                

+-----------+-----+
|partitionId|count|
+-----------+-----+
|          1| 5994|
|          0|    6|
+-----------+-----+

