In [17]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Start (or reuse) the notebook's Spark session using the builder's default settings.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Smoke test: a one-row, one-column SQL query shows the session can execute work.
df = spark.sql("select 'spark' as hello ")
df.show()


+-----+
|hello|
+-----+
|spark|
+-----+



In [None]:
# Rebind `spark` through a builder that also sets the application name "Practice".
# NOTE(review): an earlier cell already called getOrCreate(), so this presumably
# returns that existing session and the new app name may not take effect — confirm.
spark = SparkSession.builder.appName("Practice").getOrCreate()

In [None]:
# Read iris.csv with the reader's default options (no header or schema options set here).
csv_reader = spark.read
df_pyspark = csv_reader.csv("iris.csv")
df_pyspark.show()

In [None]:
# Same file, but with the "header" option switched on before reading.
# The result is only displayed; df_pyspark is not reassigned here.
header_reader = spark.read.option("header", "true")
header_reader.csv("iris.csv").show()

In [None]:
# Display the Python class of the object that spark.read.csv returned.
type(df_pyspark)

In [None]:
# Fetch the first 3 rows and display them as the cell's output value
# (unlike show(), which prints a formatted table and returns nothing).
df_pyspark.head(3)

In [None]:
# Print the frame's column names and types. No inferSchema option was passed
# when df_pyspark was read above, so this shows what the default read produced.
df_pyspark.printSchema()

In [None]:
# Re-read iris.csv with header parsing and schema inference both enabled,
# replacing the earlier default-options frame.
read_options = {"header": True, "inferSchema": True}
df_pyspark = spark.read.csv("iris.csv", **read_options)
df_pyspark.show()

In [None]:
# Project the frame down to two columns and display them.
# NOTE(review): assumes the file loaded above really has 'Name' and 'Experience'
# columns — confirm against iris.csv.
wanted_columns = ['Name', 'Experience']
df_pyspark.select(wanted_columns).show()

In [None]:
# Print the summary table that describe() builds for the current frame.
df_pyspark.describe().show()

In [None]:
# Add a derived column: each row's Experience value plus 10.
# The result is assigned back to df_pyspark, so later cells see the extra column.
experience_plus_ten = df_pyspark['Experience'] + 10
df_pyspark = df_pyspark.withColumn("Exp after 10 years", experience_plus_ten)
df_pyspark.show()

In [None]:
# Remove the derived column again; the trimmed frame replaces df_pyspark.
derived_column = "Exp after 10 years"
df_pyspark = df_pyspark.drop(derived_column)

In [None]:
# Show a version of the frame with 'Experience' renamed to 'Work exp'.
# df_pyspark is not reassigned, so the rename is for display only.
renamed_view = df_pyspark.withColumnRenamed('Experience', 'Work exp')
renamed_view.show()

In [None]:
# Switch df_pyspark to the missing-values dataset, read with default reader options.
missing_values_path = "book2missing_values.csv"
df_pyspark = spark.read.csv(missing_values_path)
df_pyspark.show()

In [None]:
# Show the rows that remain after null-dropping with the default arguments.
df_pyspark.dropna().show()

In [None]:
# Keep only rows that have at least 2 non-null values.
# `how` is deliberately not passed: when `thresh` is given, Spark ignores `how`,
# so the previous how="any" had no effect and only obscured what this cell does.
df_pyspark.na.drop(thresh=2).show()

In [None]:
# Re-read the missing-values file, this time with header parsing and schema inference.
reader_kwargs = dict(header=True, inferSchema=True)
df_pyspark = spark.read.csv("book2missing_values.csv", **reader_kwargs)

In [None]:
# Drop rows based on nulls in the 'Experience' column only; other columns are not checked.
df_pyspark.dropna(how="any", subset=['Experience']).show()

In [None]:
# Filling the missing values: show the frame with nulls replaced by 0.
# df_pyspark itself is left unchanged.
df_pyspark.fillna(0).show()

In [None]:
# Fill nulls with 0, restricted to the listed columns.
# The column is spelled 'Age' everywhere else in this notebook; the previous
# lowercase 'age' relied on Spark's default case-insensitive column matching
# and would stop matching if case-sensitive resolution were enabled.
df_pyspark.na.fill(0, ['Experience', 'Age']).show()

In [None]:
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a companion "<name>_imputed" output column.
columns_to_impute = ['Age', 'Experience', 'Salary']
imputed_names = ["{}_imputed".format(name) for name in columns_to_impute]

# Build the imputer, then select the "mean" strategy on it.
imputer = Imputer(inputCols=columns_to_impute, outputCols=imputed_names)
imputer = imputer.setStrategy("mean")


In [None]:
# Fit the imputer on the frame, then apply the fitted model to that same frame
# and display the result (the original columns plus the *_imputed ones).
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

In [None]:
# Filtering operations: the condition is given as a SQL expression string.
salary_cap = 'Salary<=50000'
df_pyspark.filter(salary_cap).show()

In [None]:
# Apply the same salary filter, then keep only the Name and Age columns for display.
low_paid = df_pyspark.filter('Salary<=50000')
low_paid.select(['Name', 'Age']).show()

In [None]:
# Column-expression form of a filter: keep rows with 30000 <= Salary <= 50000.
salary = df_pyspark['Salary']
in_band = (salary >= 30000) & (salary <= 50000)
df_pyspark.filter(in_band).show()

In [None]:
# Group by and aggregate functions
# Load the department dataset (header row + inferred types) into df_pyspark.
dept_options = {"header": True, "inferSchema": True}
df_pyspark = spark.read.csv("book3dept.csv", **dept_options)
df_pyspark.show()

In [None]:
# Per-department totals: group rows by Department, then call sum() with no column list.
df_pyspark.groupBy("Department").sum().show()

In [None]:
# Per-department averages: same grouping, aggregated with mean().
df_pyspark.groupBy("Department").mean().show()

In [None]:
# Number of rows in each Department group.
df_pyspark.groupBy("Department").count().show()

In [None]:
# Whole-frame aggregate (no grouping): the dict maps column name -> aggregate function.
salary_total = {'Salary': 'sum'}
df_pyspark.agg(salary_total).show()

In [None]:
# Group rows by Name and show the max() aggregate for each group.
per_name = df_pyspark.groupBy('Name')
per_name.max().show()

In [None]:
# Point df_pyspark at Book1.csv (header row + inferred types) and display it.
book1_options = {"header": True, "inferSchema": True}
df_pyspark = spark.read.csv("Book1.csv", **book1_options)
df_pyspark.show()

In [None]:
# Load Book1.csv under the name `training`; the modelling cells below start from this frame.
training_path = "Book1.csv"
training = spark.read.csv(training_path, header=True, inferSchema=True)
training.show()

In [None]:
# Display the training frame's column names as the cell's output value.
training.columns

In [None]:
# age, exp ----> new feature -----> independent
# Configure an assembler that combines the two predictor columns into a single
# vector column named "Independent features".
from pyspark.ml.feature import VectorAssembler

predictor_columns = ["Age", "Experience"]
featureassembler = VectorAssembler(inputCols=predictor_columns, outputCol="Independent features")

In [None]:
# Run the assembler over the training frame; `output` is the transformed frame,
# which the next cell selects "Independent features" from.
output = featureassembler.transform(training)
output.show()

In [None]:
# Keep only what the regression needs: the assembled feature vector and the Salary label.
model_columns = ["Independent features", "Salary"]
finalized_data = output.select(model_columns)
finalized_data.show()

In [None]:
from pyspark.ml.regression import LinearRegression

# train test split
# A fixed seed makes the roughly 75/25 split repeatable across kernel restarts;
# without it, every run trains and evaluates on a different random subset.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# `regressor` must end up bound to the *fitted* model, because later cells read
# its coefficients and intercept and call evaluate() on it. The unfitted
# estimator gets its own name instead of being overwritten by its own fit result.
linear_regression = LinearRegression(featuresCol="Independent features", labelCol="Salary")
regressor = linear_regression.fit(train_data)

In [None]:
# Display the fitted model's coefficients
# (presumably one per assembled input column, i.e. Age then Experience — confirm).
regressor.coefficients

In [None]:
# Display the fitted model's intercept term.
regressor.intercept

In [None]:
# prediction
# Evaluate the fitted model on the held-out split, then show the predictions
# frame carried by the evaluation result.
results = regressor.evaluate(test_data)
predictions = results.predictions
predictions.show()