In [1]:
import pandas as pd
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

In [2]:
# Build (or reuse) a local Spark session through the builder API.
# Constructing SparkContext('local') directly raises a ValueError when this
# cell is re-run in a live kernel, because only one SparkContext may be active
# at a time; getOrCreate() hands back the existing session instead.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` bound to the underlying SparkContext for the cells that use it.
sc = spark.sparkContext

In [3]:
# Bare expression: lets the notebook display the active SparkContext.
sc

In [17]:
# Toy input: one [name, age] pair per row. 'tom' and 'nick' appear twice.
data = [
    ['tom', 10],
    ['nick', 15],
    ['juil', 14],
    ['nick', 20],
    ['david', 15],
    ['tom', 40],
]
# Plain pandas frame with labelled columns; converted to Spark further down.
df = pd.DataFrame(data=data, columns=['Name', 'Age'])

In [18]:
# Convert the pandas frame into a Spark DataFrame via the active session.
# No schema is passed, so Spark infers the column types from the pandas data.
pyspark_df = spark.createDataFrame(df)

In [19]:
# The source object is still a pandas DataFrame; createDataFrame did not alter it.
type(df)

pandas.core.frame.DataFrame

In [20]:
# The converted object is a Spark SQL DataFrame, a different class from pandas'.
type(pyspark_df)

pyspark.sql.dataframe.DataFrame

In [21]:
# Print the Spark DataFrame's rows as a text table.
pyspark_df.show()

+-----+---+
| Name|Age|
+-----+---+
|  tom| 10|
| nick| 15|
| juil| 14|
| nick| 20|
|david| 15|
|  tom| 40|
+-----+---+



In [22]:
# Number of rows in the Spark DataFrame.
pyspark_df.count()

6

In [23]:
# (column name, Spark type) pairs; the integer Age column arrives as bigint.
pyspark_df.dtypes

[('Name', 'string'), ('Age', 'bigint')]

In [24]:
from pyspark.sql.functions import lit

# Append a constant column: lit(1) wraps the Python value 1 as a Spark column
# expression, so every row receives the same literal.
pyspark_df = pyspark_df.withColumn(colName='new_col_1', col=lit(1))

In [25]:
# Show the frame again to check the newly added constant column.
pyspark_df.show()

+-----+---+---------+
| Name|Age|new_col_1|
+-----+---+---------+
|  tom| 10|        1|
| nick| 15|        1|
| juil| 14|        1|
| nick| 20|        1|
|david| 15|        1|
|  tom| 40|        1|
+-----+---+---------+



In [26]:
# Derived column: each row's Age plus one, with the source column looked up
# by key rather than by attribute access.
pyspark_df = pyspark_df.withColumn('new_col_2', pyspark_df['Age'] + 1)

In [27]:
# Show the frame again to check the derived Age + 1 column.
pyspark_df.show()

+-----+---+---------+---------+
| Name|Age|new_col_1|new_col_2|
+-----+---+---------+---------+
|  tom| 10|        1|       11|
| nick| 15|        1|       16|
| juil| 14|        1|       15|
| nick| 20|        1|       21|
|david| 15|        1|       16|
|  tom| 40|        1|       41|
+-----+---+---------+---------+



In [28]:
# Keep only the rows whose Name equals 'nick' and print them.
# where() is an alias of filter(); the result is not assigned, so pyspark_df
# itself is unchanged.
pyspark_df.where(pyspark_df['Name'] == 'nick').show()

+----+---+---------+---------+
|Name|Age|new_col_1|new_col_2|
+----+---+---------+---------+
|nick| 15|        1|       16|
|nick| 20|        1|       21|
+----+---+---------+---------+



In [29]:
# Age of the first row whose Name is 'nick'.
# The column is selected by name instead of by position (index 1 would silently
# return a different field if the columns were reordered), and first() brings a
# single row to the driver instead of collecting every match.
# NOTE(review): no ordering is specified, so "first" follows Spark's row order,
# exactly as the collect()[0] form did. Raises if no row matches.
pyspark_df.filter(pyspark_df.Name == 'nick').select('Age').first()['Age']

15

In [30]:
# Re-display the full frame: the filter calls above were not assigned back,
# so pyspark_df still holds all rows.
pyspark_df.show()

+-----+---+---------+---------+
| Name|Age|new_col_1|new_col_2|
+-----+---+---------+---------+
|  tom| 10|        1|       11|
| nick| 15|        1|       16|
| juil| 14|        1|       15|
| nick| 20|        1|       21|
|david| 15|        1|       16|
|  tom| 40|        1|       41|
+-----+---+---------+---------+



In [31]:
# Project down to the Name column, drop repeated values, and print what is left.
# The printed order need not match the input order.
(
    pyspark_df
    .select('Name')
    .distinct()
    .show()
)

+-----+
| Name|
+-----+
|david|
| nick|
|  tom|
| juil|
+-----+



In [33]:
# Import only the aggregate that is used. A wildcard import from
# pyspark.sql.functions rebinds Python built-ins such as sum, min, max, abs and
# round to Spark column functions for the rest of the notebook.
from pyspark.sql.functions import mean

# Average Age per Name; Spark auto-names the result column avg(Age).
pyspark_df.groupby('Name').agg(mean('Age')).show()

+-----+--------+
| Name|avg(Age)|
+-----+--------+
|david|    15.0|
| nick|    17.5|
|  tom|    25.0|
| juil|    14.0|
+-----+--------+



In [35]:
# Score lookup table: one [name, score] pair per distinct name.
# Note that `data` is rebound here; it no longer refers to the name/age rows.
data = [
    ['nick', 80],
    ['juil', 70],
    ['david', 90],
    ['tom', 60],
]
score = pd.DataFrame(data=data, columns=['Name', 'Score'])

In [41]:
# `score` is already a pandas DataFrame (built from the name/score rows above),
# so passing it through pd.DataFrame() again is a redundant re-wrap.
# Display it directly.
score

Unnamed: 0,Name,Score
0,nick,80
1,juil,70
2,david,90
3,tom,60


In [42]:
# Score lookup table: one [name, score] pair per distinct name.
data = [
    ['nick', 80],
    ['juil', 70],
    ['david', 90],
    ['tom', 60],
]
score = pd.DataFrame(data=data, columns=['Name', 'Score'])
# Mirror the pandas frame into Spark so it can be joined with pyspark_df.
pyspark_score = spark.createDataFrame(data=score)

In [44]:
# Print the Spark copy of the score table.
pyspark_score.show()

+-----+-----+
| Name|Score|
+-----+-----+
| nick|   80|
| juil|   70|
|david|   90|
|  tom|   60|
+-----+-----+



In [45]:
# Left join on Name: every row of pyspark_df is kept and gains the matching
# Score. Joining on the column name yields a single Name column in the result.
pyspark_df.join(
    other=pyspark_score,
    on='Name',
    how='left',
).show()

+-----+---+---------+---------+-----+
| Name|Age|new_col_1|new_col_2|Score|
+-----+---+---------+---------+-----+
|david| 15|        1|       16|   90|
| nick| 15|        1|       16|   80|
| nick| 20|        1|       21|   80|
|  tom| 10|        1|       11|   60|
|  tom| 40|        1|       41|   60|
| juil| 14|        1|       15|   70|
+-----+---+---------+---------+-----+

