In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql.functions import col,corr
from pyspark.sql.types import IntegerType
import  pyspark.sql.functions as F
from pyspark.sql.functions import col, count,when


local variable 'spark' referenced before assignment


24/01/19 15:43:32 WARN Utils: Your hostname, Jacobs-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.156 instead (on interface en0)
24/01/19 15:43:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/19 15:43:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/19 15:43:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/01/19 15:43:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Spark Session

In [None]:


def create_spark_session(app_name="Sparkify", default_settings = True ,  total_physical_cores=16,driver_memory = 8,executor_memory = 8):
# Calculate available cores for Spark
    try:
        spark.shutdown()
    except  Exception as e:
        print(e)

    if default_settings == False:
        total_physical_cores = input(" Available Cores")
        driver_memory =  input(" Driver Memory Allowance")
        executor_memory = input("Executor Memory Allowance")

    available_cores_for_spark =int( total_physical_cores - 2)
    # Configure Spark session
    spark = (
        SparkSession.builder.appName(app_name)
        .config("spark.driver.memory", str(int(driver_memory)) + "g")
        .config("spark.executor.memory", str(int(executor_memory)) + "g")
        .config("spark.executor.cores", available_cores_for_spark)
        .getOrCreate()
    )

    return  spark

spark = create_spark_session()

## Data Load

In [24]:
# Read CSV file into DataFrame
file_path ="../data/lg_all_features.csv"
df = spark.read.option("header","True").csv(file_path)
# Drop the "userID" column
df = df.drop("userID")
df = df.drop("_c0")

df = df.select(*(col(c).cast("double").alias(c) for c in df.columns))

df.printSchema()

root
 |-- label: double (nullable = true)
 |-- count: double (nullable = true)
 |-- avg_daily_listens: double (nullable = true)
 |-- level_flag: double (nullable = true)
 |-- pos_interactions: double (nullable = true)
 |-- neg_interactions: double (nullable = true)
 |-- distinct_artist: double (nullable = true)
 |-- About: double (nullable = true)
 |-- Add Friend: double (nullable = true)
 |-- Add to Playlist: double (nullable = true)
 |-- Downgrade: double (nullable = true)
 |-- Error: double (nullable = true)
 |-- Help: double (nullable = true)
 |-- Home: double (nullable = true)
 |-- Login: double (nullable = true)
 |-- Logout: double (nullable = true)
 |-- Register: double (nullable = true)
 |-- Roll Advert: double (nullable = true)
 |-- Save Settings: double (nullable = true)
 |-- Settings: double (nullable = true)
 |-- Submit Downgrade: double (nullable = true)
 |-- Submit Registration: double (nullable = true)
 |-- Submit Upgrade: double (nullable = true)
 |-- Thumbs Down: doubl

## Null Verification

In [25]:

null_values = df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns])
selected_columns = [column for column in null_values.columns if null_values.select(column).first()[0] > 1]
null_values.select(*selected_columns).columns

[]

## MinMax Scaling

In [26]:
def minMaxScalar(df,label):
    df_without_label = df.drop(label)
    # Get the list of numeric columns
    numeric_columns = [col_name for col_name, data_type in df_without_label .dtypes if data_type in ["int", "double"]]

    # Assemble features into a vector column
    assembler = VectorAssembler(inputCols=numeric_columns, outputCol="features")
    assembled_data = assembler.transform(df_without_label )

    # Check if the vector dimension is larger than zero
    if assembled_data.first()["features"].size == 0:
        raise ValueError("Vector dimension is zero. Please check your input data.")

    # Apply Min-Max scaling
    scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
    scaler_model = scaler.fit(assembled_data)
    scaled_data = scaler_model.transform(assembled_data)

    return scaled_data

scaled_features = minMaxScalar(df,"label")

scaled_data = df[['label']].join(scaled_features.select("scaled_features"), how='inner')
scaled_data.show()     

+-----+--------------------+
|label|     scaled_features|
+-----+--------------------+
|  1.0|[0.09025015468929...|
|  1.0|[0.09025015468929...|
|  1.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
|  0.0|[0.09025015468929...|
+-----+--------------------+
only showing top 20 rows



## Pyspark ML Logistic Regression

In [27]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol="scaled_features", labelCol="label", family="binomial")

# Fit the model
lr_model = lr.fit(scaled_data)

# Make predictions
predictions = lr_model.transform(scaled_data)


evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(predictions)

accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())

print("F1 Score: {:.4f}".format(f1_score))
print("Accuracy: {:.4f}".format(accuracy))

[Stage 461:>                                                        (0 + 1) / 1]