In [4]:
import os

# Spark/PySpark-related environment variables, set in one step before
# pyspark.pandas is imported further down in this cell.
# NOTE(review): SPARK_HOME is a machine-specific absolute path — confirm it
# matches the local Spark installation before running this notebook elsewhere.
os.environ.update({
    'SPARK_HOME': "/home/user/.application-data/spark",
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'notebook',
    'PYSPARK_PYTHON': 'python',
})

import numpy as np
import pandas as pd
import pyspark.pandas as ps

### Migration from **Pandas** to **Pandas API on Spark**

In [5]:
# Reference pandas Series: five numbers and one missing value.
pd_ser = pd.Series([1, 3, 5, np.nan, 6, 8])
display(pd_ser)

# The same literal values handed straight to the pandas-on-Spark constructor.
display(ps.Series([1, 3, 5, np.nan, 6, 8]))

# A pandas-on-Spark Series can also be built from an existing pandas Series ...
display(ps.Series(pd_ser))

# ... or converted explicitly with from_pandas; this result is kept as `ps_ser`.
ps_ser = ps.from_pandas(pd_ser)
display(ps_ser)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# pandas-style methods are available on the pandas-on-Spark Series as well
ps_ser.sort_index(ascending=True)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
length = 10
# Seeded generator so the random demo data is identical on every
# Restart & Run All; the unseeded global RNG produced new values on each run.
rng = np.random.default_rng(42)

# creates a dataframe in pandas
pd_df = pd.DataFrame({'A': rng.random(length), 'B': rng.random(length)}, index=np.arange(length))
display(pd_df)

# creates a pandas-on-spark dataframe directly, from its own fresh random draws
ps_df = ps.DataFrame({'A': rng.random(length), 'B': rng.random(length)}, index=np.arange(length))
display(ps_df)

# creates a pandas-on-spark df by passing a pandas df
ps_df = ps.DataFrame(pd_df)
display(ps_df)

# explicit conversion of the same pandas df; this ps_df (holding pd_df's data)
# is the binding left in place when the cell finishes
ps_df = ps.from_pandas(pd_df)
display(ps_df)

Unnamed: 0,A,B
0,0.423531,0.783631
1,0.340575,0.553951
2,0.987265,0.335101
3,0.132176,0.307459
4,0.456404,0.438583
5,0.55924,0.058604
6,0.556088,0.642942
7,0.91878,0.557511
8,0.906561,0.211067
9,0.395394,0.651483


Unnamed: 0,A,B
0,0.481022,0.596343
1,0.462077,0.057934
2,0.049212,0.678762
3,0.471712,0.742645
4,0.549631,0.755667
5,0.024443,0.262763
6,0.252163,0.289928
7,0.462819,0.000982
8,0.66297,0.931346
9,0.982996,0.391246


Unnamed: 0,A,B
0,0.423531,0.783631
1,0.340575,0.553951
2,0.987265,0.335101
3,0.132176,0.307459
4,0.456404,0.438583
5,0.55924,0.058604
6,0.556088,0.642942
7,0.91878,0.557511
8,0.906561,0.211067
9,0.395394,0.651483


Unnamed: 0,A,B
0,0.423531,0.783631
1,0.340575,0.553951
2,0.987265,0.335101
3,0.132176,0.307459
4,0.456404,0.438583
5,0.55924,0.058604
6,0.556088,0.642942
7,0.91878,0.557511
8,0.906561,0.211067
9,0.395394,0.651483


In [8]:
# order the rows by the values in column A (ascending)
ps_df.sort_values('A')

Unnamed: 0,A,B
3,0.132176,0.307459
1,0.340575,0.553951
9,0.395394,0.651483
0,0.423531,0.783631
4,0.456404,0.438583
6,0.556088,0.642942
5,0.55924,0.058604
8,0.906561,0.211067
7,0.91878,0.557511
2,0.987265,0.335101


In [9]:
# peek at the first three rows
ps_df.head(n=3)

Unnamed: 0,A,B
0,0.423531,0.783631
1,0.340575,0.553951
2,0.987265,0.335101


In [10]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
# NOTE(review): the 25%/50%/75% values in the saved output coincide with actual
# entries of each column rather than interpolated values — presumably Spark's
# approximate percentiles; confirm before comparing against plain pandas.
ps_df.describe()

                                                                                

Unnamed: 0,A,B
count,10.0,10.0
mean,0.567601,0.454033
std,0.282539,0.224911
min,0.132176,0.058604
25%,0.395394,0.307459
50%,0.456404,0.438583
75%,0.906561,0.642942
max,0.987265,0.783631


In [11]:
# Cumulative sum down each column, using the built-in rather than
# apply(np.cumsum): pandas-on-Spark's apply may call the function on separate
# batches of a column, so a running total would restart in every batch once
# the data grows beyond a small frame.
ps_df.cumsum()



Unnamed: 0,A,B
0,0.423531,0.783631
1,0.764106,1.337582
2,1.751371,1.672682
3,1.883547,1.980141
4,2.339951,2.418724
5,2.899191,2.477328
6,3.455279,3.12027
7,4.374059,3.677781
8,5.28062,3.888848
9,5.676014,4.54033


In [12]:
# square every value, column by column
ps_df.apply(lambda col: col ** 2)



Unnamed: 0,A,B
0,0.179378,0.614077
1,0.115991,0.306861
2,0.974692,0.112292
3,0.017471,0.094531
4,0.208304,0.192355
5,0.31275,0.003434
6,0.309234,0.413374
7,0.844157,0.310819
8,0.821852,0.044549
9,0.156337,0.42443
