In [35]:
import random

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType

# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("Example") \
    .getOrCreate()

# Generate 11,000 random (key, value) rows: key in [0, 9999], value in {1, 2}
data = [(random.randint(0, 9999), random.randint(1, 2)) for _ in range(11000)]

# Define schema: two nullable integer columns
schema = StructType([
    StructField("key", IntegerType(), True),
    StructField("value", IntegerType(), True)
])

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Repartition the DataFrame by the "key" column
df = df.repartition("key")

# Show DataFrame
df.show()

# Skewness of the values in every column.
# All Spark SQL functions are reached through the `F` namespace so this cell
# does not depend on names imported by some other (possibly deleted) cell.
skew_result = df.select(
    [F.skewness(column_name).alias(column_name) for column_name in df.columns]
)
skew_result.show()

# Add partitionId column
df = df.withColumn("partitionId", F.spark_partition_id())

# Count keys per partition, largest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(F.count("key").alias("count"))
    .orderBy(F.col("count").desc())
)

# Display the per-partition counts
skew_df.show()

+----+-----+
| key|value|
+----+-----+
|2142|    1|
|2122|    1|
|9427|    1|
|9465|    1|
|1591|    1|
|1342|    2|
|2366|    1|
|2866|    1|
|1342|    2|
|5803|    2|
| 833|    1|
|4935|    1|
|9427|    2|
|1645|    1|
| 148|    2|
| 148|    2|
|2142|    2|
|1580|    2|
| 496|    1|
|7880|    1|
+----+-----+
only showing top 20 rows



In [37]:
from pyspark.sql.functions import skewness

# One skewness aggregate per column, each aliased back to its own column name
skew_exprs = [skewness(name).alias(name) for name in df.columns]
skew_result = df.select(skew_exprs)
skew_result.show()


+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|3.536978925942362E-4|-7.27272775356920...|
+--------------------+--------------------+



In [38]:
# Imported here so the cell runs on a fresh kernel: none of the cells above
# bring `spark_partition_id`, `count` or `F` into scope.
from pyspark.sql import functions as F

# Add partitionId column
df = df.withColumn("partitionId", F.spark_partition_id())

# Count keys per partition, largest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(F.count("key").alias("count"))
    .orderBy(F.col("count").desc())
)

# Display the per-partition counts
skew_df.show()



+-----------+-----+
|partitionId|count|
+-----------+-----+
|         97|   92|
|         21|   90|
|        117|   76|
|         77|   75|
|        162|   75|
|        106|   75|
|         35|   74|
|        114|   74|
|         69|   73|
|        112|   72|
|        103|   71|
|        147|   71|
|        122|   70|
|        175|   69|
|         25|   69|
|        115|   69|
|         18|   69|
|          4|   69|
|        150|   68|
|        198|   68|
+-----------+-----+
only showing top 20 rows



                                                                                

In [39]:
import random
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import skewness, count, spark_partition_id, col

# Build (or fetch) the SparkSession used by this example
spark = (
    SparkSession.builder
    .appName("Example")
    .getOrCreate()
)

# Random (key, value) rows: key in [0, 9999], value in {1, 2}
row_count = 11000
data = [(random.randint(0, 9999), random.randint(1, 2)) for _ in range(row_count)]

# Two nullable integer columns
schema = StructType(
    [
        StructField("key", IntegerType(), True),
        StructField("value", IntegerType(), True),
    ]
)

# Create the DataFrame and repartition it by "key"
df = spark.createDataFrame(data, schema).repartition("key")

# Show DataFrame
df.show()

# Skewness of the values in each column, aliased to the column's own name
skew_exprs = [skewness(name).alias(name) for name in df.columns]
skew_result = df.select(skew_exprs)
skew_result.show()

# Tag every row with the id of the partition it sits in
df = df.withColumn("partitionId", spark_partition_id())

# Number of keys per partition, largest partition first
skew_df = (
    df.select("partitionId", "key")
    .groupBy("partitionId")
    .agg(count("key").alias("count"))
    .orderBy(col("count").desc())
)

# Display the per-partition counts
skew_df.show()


+----+-----+
| key|value|
+----+-----+
|1088|    2|
|7982|    2|
|3997|    2|
|8389|    2|
|6357|    1|
|5518|    2|
|9376|    2|
|7833|    2|
|7253|    1|
|8638|    1|
|4101|    2|
|9852|    2|
|8086|    1|
|6336|    2|
|2659|    2|
|4818|    2|
| 463|    1|
|2122|    2|
|5803|    1|
|5803|    2|
+----+-----+
only showing top 20 rows

+--------------------+--------------------+
|                 key|               value|
+--------------------+--------------------+
|-0.01051545026188...|-3.63636369646860...|
+--------------------+--------------------+





+-----------+-----+
|partitionId|count|
+-----------+-----+
|         97|   88|
|        113|   86|
|         21|   86|
|        150|   84|
|         13|   80|
|        176|   77|
|        117|   77|
|        162|   76|
|        100|   76|
|        143|   74|
|         86|   74|
|        102|   74|
|        190|   73|
|        163|   72|
|         23|   72|
|        110|   72|
|         51|   71|
|         83|   71|
|        121|   70|
|        171|   70|
+-----------+-----+
only showing top 20 rows



                                                                                

In [40]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# Create a SparkSession
spark = SparkSession.builder \
    .appName("Skewness Example") \
    .getOrCreate()

# Small base DataFrame: six single-column rows (3 x "A", 2 x "B", 1 x "C")
data = [("A",), ("A",), ("A",), ("B",), ("B",), ("C",)]
columns = ["col"]
df = spark.createDataFrame(data, columns)

# 999 back-to-back copies of the base rows (5,994 rows) in a second DataFrame
skewed_data = data * 999
skewed_df = spark.createDataFrame(skewed_data, columns)

# Union the small DataFrame with the large one
df = df.union(skewed_df)

# Add a column to the DataFrame with the partition ID
df = df.withColumn("partitionId", F.spark_partition_id())

# Compute the number of records in each partition, largest first
skew_df = df.groupBy("partitionId").count().orderBy("count", ascending=False)

# Display the results
skew_df.show()

# The session is deliberately left running: the following cell still
# evaluates `df`, which would fail once the session has been stopped.


+-----------+-----+
|partitionId|count|
+-----------+-----+
|         15|  751|
|         10|  749|
|         13|  749|
|          9|  749|
|          8|  749|
|         11|  749|
|         12|  749|
|         14|  749|
|          3|    1|
|          5|    1|
|          1|    1|
|          6|    1|
|          7|    1|
|          2|    1|
+-----------+-----+



+---+
|col|
+---+
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
+---+



+---+
|col|
+---+
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
|  A|
|  B|
|  B|
|  C|
|  A|
|  A|
+---+
only showing top 20 rows



+---+-----------+
|col|partitionId|
+---+-----------+
|  A|          0|
|  A|          0|
|  A|          0|
|  B|          0|
|  B|          0|
|  C|          0|
|  A|          1|
|  A|          1|
|  A|          1|
|  B|          1|
|  B|          1|
|  C|          1|
|  A|          1|
|  A|          1|
|  A|          1|
|  B|          1|
|  B|          1|
|  C|          1|
|  A|          1|
|  A|          1|
+---+-----------+
only showing top 20 rows



In [84]:
# Compute the number of records in each partition, ordered by partition id
# (descending). The result gets its own name instead of overwriting `df`,
# so re-running this cell keeps aggregating the original rows rather than
# the previous run's aggregate.
partition_counts = df.groupBy("partitionId").count().orderBy("partitionId", ascending=False)
partition_counts.show()



+-----------+-----+
|partitionId|count|
+-----------+-----+
|          1| 5994|
|          0|    6|
+-----------+-----+



                                                                                

In [85]:
# NOTE(review): `df1` is not defined by any cell above this one in the
# visible notebook — confirm which DataFrame it is meant to be.
# Repartition into 10 partitions, tag rows with their partition id, then
# count rows per partition in ascending partition order.
re_df = (
    df1.repartition(10)
    .withColumn("part", spark_partition_id())
    .groupBy("part")
    .count()
    .orderBy("part", ascending=True)
)
re_df.show()
re_df.rdd.getNumPartitions()

                                                                                

+----+-----+
|part|count|
+----+-----+
|   0|  599|
|   1|  600|
|   2|  600|
|   3|  600|
|   4|  600|
|   5|  600|
|   6|  601|
|   7|  600|
|   8|  600|
|   9|  600|
+----+-----+



                                                                                

10

In [1]:
def max_profit(prices):
    """Return the best profit from one buy followed by one later sell.

    Walks the prices once, tracking the cheapest price seen so far and the
    largest difference between the current price and that cheapest price.
    Returns 0 when `prices` is empty or no profitable trade exists.
    """
    best_gain = 0
    if prices:
        cheapest = prices[0]
        for today in prices:
            # Lower the running minimum first, so selling is only ever
            # compared against a price at or before the current position.
            if today < cheapest:
                cheapest = today
            gain = today - cheapest
            if gain > best_gain:
                best_gain = gain
    return best_gain

# Example usage:
prices = [7, 1, 5, 3, 6, 4]
print(max_profit(prices))  # Output: 5 (Buy at price 1 and sell at price 6 for a profit of 6 - 1 = 5)

5


AnalysisException: Path does not exist: file:/home/chaitanya/GIT/P-SPARK/notebook/11-sc2/data.csv

In [44]:

# df4.show()

                                                                                

200


                                                                                

200


                                                                                

200

                                                                                

200




200


                                                                                

                                                                                

In [4]:
x = 10  # Module-level (global) binding


def my_function():
    """Assign to `x` without a `global` declaration and print it.

    The assignment binds a name local to this call, so the module-level
    `x` keeps its value.
    """
    x = 20  # Local 'x'; shadows the global only inside this function
    print("Inside function:", x)


my_function()
print("Outside function:", x)  # Output: Outside function: 10


Inside function: 20
Outside function: 10


In [5]:
x = 10  # Module-level (global) binding


def my_function():
    """Rebind the module-level `x` to 20 and print it.

    The `global` declaration makes the assignment target the module-level
    name instead of creating a local one.
    """
    global x  # Assignments to 'x' below affect the module-level name
    x = 20
    print("Inside function:", x)


my_function()
print("Outside function:", x)  # Output: Outside function: 20


Inside function: 20
Outside function: 20


In [3]:

  
  from pyspark.sql import SparkSession
  # Spark SQL functions are used through the `F` namespace: a star import of
  # pyspark.sql.functions replaces the builtins round/min/max/sum for every
  # cell that runs afterwards.
  from pyspark.sql import functions as F
  # NOTE(review): star import kept because other cells may rely on these
  # type names; narrow it if nothing downstream needs it.
  from pyspark.sql.types import *
  from pyspark.sql.window import Window

  spark = SparkSession.builder\
                      .master("local") \
                      .appName("lead_lag") \
                      .getOrCreate()

  # Rows are (id, name, date, sell); six dated rows for each of three products
  product_data = [
  (1,"iphone","01-01-2023",1500000),
  (2,"samsung","01-01-2023",1100000),
  (3,"oneplus","01-01-2023",1100000),
  (1,"iphone","01-02-2023",1300000),
  (2,"samsung","01-02-2023",1120000),
  (3,"oneplus","01-02-2023",1120000),
  (1,"iphone","01-03-2023",1600000),
  (2,"samsung","01-03-2023",1080000),
  (3,"oneplus","01-03-2023",1160000),
  (1,"iphone","01-04-2023",1700000),
  (2,"samsung","01-04-2023",1800000),
  (3,"oneplus","01-04-2023",1170000),
  (1,"iphone","01-05-2023",1200000),
  (2,"samsung","01-05-2023",980000),
  (3,"oneplus","01-05-2023",1175000),
  (1,"iphone","01-06-2023",1100000),
  (2,"samsung","01-06-2023",1100000),
  (3,"oneplus","01-06-2023",1200000)
  ]

  product_schema=['id','name','date','sell']

  p_df = spark.createDataFrame(data=product_data,schema=product_schema)
  p_df.show()

  # One window per product, ordered by the date column.
  # NOTE(review): `date` is a plain string here, so the ordering is
  # lexicographic; it matches chronological order for these sample rows, but
  # confirm the date format (or parse it) before using this on wider data.
  window = Window.partitionBy("name").orderBy("date")

  # previous_ms = the `sell` value of the preceding row within the same product
  # (null on each product's first row)
  df = p_df.withColumn("previous_ms", F.lag(p_df["sell"],1).over(window))
  df.show()

  # Change versus the previous row, as a percentage.
  # NOTE(review): the denominator is the current `sell`, not `previous_ms` —
  # confirm which base the percentage is meant to use.
  loss_or_gain_pct = (F.col("sell") - F.col("previous_ms"))*100/df["sell"]

  df.withColumn("loss_or_gain", loss_or_gain_pct).show()

  df.withColumn("loss_or_gain", F.round(loss_or_gain_pct, 2)).show() # showing round data

+---+-------+----------+-------+
| id|   name|      date|   sell|
+---+-------+----------+-------+
|  1| iphone|01-01-2023|1500000|
|  2|samsung|01-01-2023|1100000|
|  3|oneplus|01-01-2023|1100000|
|  1| iphone|01-02-2023|1300000|
|  2|samsung|01-02-2023|1120000|
|  3|oneplus|01-02-2023|1120000|
|  1| iphone|01-03-2023|1600000|
|  2|samsung|01-03-2023|1080000|
|  3|oneplus|01-03-2023|1160000|
|  1| iphone|01-04-2023|1700000|
|  2|samsung|01-04-2023|1800000|
|  3|oneplus|01-04-2023|1170000|
|  1| iphone|01-05-2023|1200000|
|  2|samsung|01-05-2023| 980000|
|  3|oneplus|01-05-2023|1175000|
|  1| iphone|01-06-2023|1100000|
|  2|samsung|01-06-2023|1100000|
|  3|oneplus|01-06-2023|1200000|
+---+-------+----------+-------+



                                                                                

+---+-------+----------+-------+-----------+
| id|   name|      date|   sell|previous_ms|
+---+-------+----------+-------+-----------+
|  1| iphone|01-01-2023|1500000|       null|
|  1| iphone|01-02-2023|1300000|    1500000|
|  1| iphone|01-03-2023|1600000|    1300000|
|  1| iphone|01-04-2023|1700000|    1600000|
|  1| iphone|01-05-2023|1200000|    1700000|
|  1| iphone|01-06-2023|1100000|    1200000|
|  2|samsung|01-01-2023|1100000|       null|
|  2|samsung|01-02-2023|1120000|    1100000|
|  2|samsung|01-03-2023|1080000|    1120000|
|  2|samsung|01-04-2023|1800000|    1080000|
|  2|samsung|01-05-2023| 980000|    1800000|
|  2|samsung|01-06-2023|1100000|     980000|
|  3|oneplus|01-01-2023|1100000|       null|
|  3|oneplus|01-02-2023|1120000|    1100000|
|  3|oneplus|01-03-2023|1160000|    1120000|
|  3|oneplus|01-04-2023|1170000|    1160000|
|  3|oneplus|01-05-2023|1175000|    1170000|
|  3|oneplus|01-06-2023|1200000|    1175000|
+---+-------+----------+-------+-----------+



                                                                                

+---+-------+----------+-------+-----------+-------------------+
| id|   name|      date|   sell|previous_ms|       loss_or_gain|
+---+-------+----------+-------+-----------+-------------------+
|  1| iphone|01-01-2023|1500000|       null|               null|
|  1| iphone|01-02-2023|1300000|    1500000|-15.384615384615385|
|  1| iphone|01-03-2023|1600000|    1300000|              18.75|
|  1| iphone|01-04-2023|1700000|    1600000|  5.882352941176471|
|  1| iphone|01-05-2023|1200000|    1700000|-41.666666666666664|
|  1| iphone|01-06-2023|1100000|    1200000| -9.090909090909092|
|  2|samsung|01-01-2023|1100000|       null|               null|
|  2|samsung|01-02-2023|1120000|    1100000| 1.7857142857142858|
|  2|samsung|01-03-2023|1080000|    1120000|-3.7037037037037037|
|  2|samsung|01-04-2023|1800000|    1080000|               40.0|
|  2|samsung|01-05-2023| 980000|    1800000|  -83.6734693877551|
|  2|samsung|01-06-2023|1100000|     980000| 10.909090909090908|
|  3|oneplus|01-01-2023|1

                                                                                

+---+-------+----------+-------+-----------+------------+
| id|   name|      date|   sell|previous_ms|loss_or_gain|
+---+-------+----------+-------+-----------+------------+
|  1| iphone|01-01-2023|1500000|       null|        null|
|  1| iphone|01-02-2023|1300000|    1500000|      -15.38|
|  1| iphone|01-03-2023|1600000|    1300000|       18.75|
|  1| iphone|01-04-2023|1700000|    1600000|        5.88|
|  1| iphone|01-05-2023|1200000|    1700000|      -41.67|
|  1| iphone|01-06-2023|1100000|    1200000|       -9.09|
|  2|samsung|01-01-2023|1100000|       null|        null|
|  2|samsung|01-02-2023|1120000|    1100000|        1.79|
|  2|samsung|01-03-2023|1080000|    1120000|        -3.7|
|  2|samsung|01-04-2023|1800000|    1080000|        40.0|
|  2|samsung|01-05-2023| 980000|    1800000|      -83.67|
|  2|samsung|01-06-2023|1100000|     980000|       10.91|
|  3|oneplus|01-01-2023|1100000|       null|        null|
|  3|oneplus|01-02-2023|1120000|    1100000|        1.79|
|  3|oneplus|0

                                                                                

In [2]:
l = ["ab", "cd", "ef", "gh", "ij", "kl"]

# Number of list items concatenated into each printed group
m = 2

# Step through the list m items at a time; the upper bound is taken from the
# list itself so the loop keeps working if items are added or removed.
for i in range(0, len(l), m):
    print("".join(l[i:i+m]), end=" ")

abcd efgh ijkl 

In [4]:
class One:
    """Base class whose constructor calls `add(5)` and prints the result."""

    def __init__(self):
        # `add` is looked up on the instance, so a subclass override is the
        # one that runs here.
        self.add(5)
        print(self.i)

    def add(self, i):
        """Store 4 + i in `self.i`."""
        self.i = 4 + i


class Two(One):
    """Subclass that overrides `add` and otherwise reuses One's constructor."""

    def __init__(self):
        super().__init__()

    def add(self, i):
        """Store 2 + i in `self.i`."""
        self.i = 2 + i


work = Two()

7


In [5]:
def fun():
    """Return a closure that adds float(b) to int(a) and stores it back in `a`.

    `a` starts as the string '5' and `b` as the string '2.5'; each call of
    the returned function rebinds `a` to the new sum and returns it.
    """
    a = '5'
    b = '2.5'

    def inc():
        # Only `a` is rebound; `b` is just read from the enclosing scope.
        nonlocal a
        a = int(a) + float(b)
        return a

    return inc


c = fun()

print(c())

7.5


In [39]:
from pyspark.sql import SparkSession
# Explicit names instead of a star import: `from pyspark.sql.functions import *`
# replaces builtins such as min/max/sum/round for the rest of the kernel.
from pyspark.sql.functions import col, spark_partition_id
from pyspark.sql.window  import Window
from pyspark.sql.types import *

# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine that has this file at this location.
path = "/home/chaitanya/Downloads/cord19_medline_assocs_v2.3.tsv"

spark = SparkSession.builder\
                    .appName("tsv")\
                    .master("local")\
                    .getOrCreate()

# Read the tab-separated file with a header row, in permissive mode
df = spark.read.format("csv")\
                .option("header",True)\
                .option("mode","permissive")\
                .option("delimiter", "\t")\
                .load(path)

# Rows where any of these columns is null
null_check_columns = ["src_ent", "src_type", "target_ent", "score", "debug"]
null_condition = col(null_check_columns[0]).isNull()
for column_name in null_check_columns[1:]:
    null_condition = null_condition | col(column_name).isNull()
null_df = df.filter(null_condition)

# Number of partitions the file was read into
s = df.rdd.getNumPartitions()
print(s, type(s))

# Row count per partition
skew_df = df.withColumn("part", spark_partition_id())
skew_calculate = skew_df.groupBy("part").count()
skew_calculate.show()



5 <class 'int'>


                                                                                

+----+-------+
|part|  count|
+----+-------+
|   1|1793529|
|   3|1780898|
|   4| 513060|
|   2|1842208|
|   0|1884789|
+----+-------+



In [40]:
# Coalesce to 3 partitions, then count how many rows land in each one
df1 = df.coalesce(3)
skew_df = df1.withColumn("part", spark_partition_id())
skew_calculate = (
    skew_df
    .groupBy("part")
    .count()
)
skew_calculate.show()

# Release the Spark session
spark.stop()

                                                                                

+----+-------+
|part|  count|
+----+-------+
|   1|3635737|
|   2|2293958|
|   0|1884789|
+----+-------+

