In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext
import pyspark
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
import time
import numpy as np

In [2]:
spark = SparkSession\
    .builder\
    .appName("example-spark").getOrCreate()
    #.config("spark.sql.crossJoin.enabled","true")\

In [4]:
mylist = [[1,2,3], [4,5,6], [7,8,9]]
col = ['a','b','c']

In [6]:
data = spark.createDataFrame(mylist, col).show()

+---+---+---+
|  a|  b|  c|
+---+---+---+
|  1|  2|  3|
|  4|  5|  6|
|  7|  8|  9|
+---+---+---+



In [3]:
d = {'A': [0, 1, 0],
'B': [1, 0, 1],
'C': [1, 0, 0]}

In [7]:
spark.createDataFrame(np.array(list(d.values())).T.tolist(),list(d.keys())).show()

+---+---+---+
|  A|  B|  C|
+---+---+---+
|  0|  1|  1|
|  1|  0|  0|
|  0|  1|  0|
+---+---+---+



In [11]:
data = spark.read.csv('/Users/xue/Desktop/Farrago/Datasets/NormalDatasets/StudentsPerformance.csv',
                     header=True,
                     inferSchema=True)

In [12]:
data.show(4)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
+------+--------------+---------------------------+------------+-----------------------+----------+-----

In [13]:
data.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [14]:
data.dtypes

[('gender', 'string'),
 ('race/ethnicity', 'string'),
 ('parental level of education', 'string'),
 ('lunch', 'string'),
 ('test preparation course', 'string'),
 ('math score', 'int'),
 ('reading score', 'int'),
 ('writing score', 'int')]

In [19]:
data.fillna(99).show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [28]:
data.na.replace(['standard', 'free/reduced'], ['1', '0']).show(5)

+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+-----+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    1|                   none|        72|           72|           74|
|female|       group C|               some college|    1|              completed|        69|           90|           88|
|female|       group B|            master's degree|    1|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|    0|                   none|        47|           57|           44|
|  male|       group C|               some college|    1|                   none|        76|           78|           75|
+------+--------------+---------

In [29]:
data.dtypes

[('gender', 'string'),
 ('race/ethnicity', 'string'),
 ('parental level of education', 'string'),
 ('lunch', 'string'),
 ('test preparation course', 'string'),
 ('math score', 'int'),
 ('reading score', 'int'),
 ('writing score', 'int')]

In [30]:
mapping = {'gender':'male/female', 'lunch':'dinner'}

In [31]:
new_names = [mapping.get(col,col) for col in data.columns]

In [32]:
new_names 

['male/female',
 'race/ethnicity',
 'parental level of education',
 'dinner',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [41]:
for col in data.columns:
    print(mapping.get(col, col))


male/female
race/ethnicity
parental level of education
dinner
test preparation course
math score
reading score
writing score


In [43]:
data.withColumnRenamed('male/female', 'gender').show(5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

In [44]:
dropname = ['reading score', 'writing score']

In [48]:
data.drop(*['gender', 'lunch']).show(5)

+--------------+---------------------------+-----------------------+----------+-------------+-------------+
|race/ethnicity|parental level of education|test preparation course|math score|reading score|writing score|
+--------------+---------------------------+-----------------------+----------+-------------+-------------+
|       group B|          bachelor's degree|                   none|        72|           72|           74|
|       group C|               some college|              completed|        69|           90|           88|
|       group B|            master's degree|                   none|        90|           95|           93|
|       group A|         associate's degree|                   none|        47|           57|           44|
|       group C|               some college|                   none|        76|           78|           75|
+--------------+---------------------------+-----------------------+----------+-------------+-------------+
only showing top 5 rows



In [50]:
data[(data['math score']>50) & (data['writing score']>80)].show(5)

+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|   lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+--------+-----------------------+----------+-------------+-------------+
|female|       group C|               some college|standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|standard|                   none|        90|           95|           93|
|female|       group B|               some college|standard|              completed|        88|           95|           92|
|  male|       group C|                high school|standard|                   none|        88|           89|           86|
|  male|       group E|               some college|standard|                   none|        97|           87|           82|
+------+

In [51]:
data.count()

1000

In [55]:
from pyspark.sql import functions as F

In [56]:
data.withColumn('cond',F.when((data.gender =='female'),1)\
.when(data['reading score']>90, 2)\
.otherwise(3)).show(4)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|cond|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+----+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|   1|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|   1|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|   1|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|   3|
+------+--------------+---------------------------+------------+-----

In [None]:
from similarity.levenshtein import Levenshtein

levenshtein = Levenshtein()
print(levenshtein.distance('My string', 'My $string'))
print(levenshtein.distance('My string', 'My $string'))
print(levenshtein.distance('My string', 'My $string'))