<a href="https://colab.research.google.com/github/erickodeny-gif/datacamp-data-analyst-with-python/blob/master/pyspark_linear_regression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


In [None]:

# Obtain a SparkSession for this notebook; getOrCreate() reuses an already
# active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("WineQualityRegression")
    .getOrCreate()
)

In [None]:

# Read the wine-quality CSV: the first row is the header, column types are
# inferred from the values, and fields are comma-separated.
data = (
    spark.read
    .options(header=True, inferSchema=True, sep=",")
    .csv("winequality-red.csv")
)

In [None]:
# Remove every row that contains at least one null value
# (the rows are dropped, not merely inspected).
data = data.dropna()


In [None]:

# Define feature columns: every column except the target 'quality'.
# The assembled 'features' vector column is excluded as well, so that re-running
# this cell after the assembler cell has already added 'features' to `data`
# does not feed the output column back in as an input.
feature_columns = [
    column_name
    for column_name in data.columns
    if column_name not in ("quality", "features")
]


In [None]:
# Combine the individual feature columns into one vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_columns)
# If a previous run of this cell already added "features", remove it first so
# the assembler does not find its output column already present.
data = assembler.transform(
    data.drop("features") if "features" in data.columns else data
)

In [None]:
# Keep only what the regressor needs: the assembled feature vector and the label.
final_data = data.select(["features", "quality"])


In [None]:


# 80/20 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = final_data.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)


In [None]:

# Configure a linear regression that predicts "quality" from the "features"
# vector, then fit it on the training split.
lr = LinearRegression(
    labelCol="quality",
    featuresCol="features",
)
lr_model = lr.fit(dataset=train_data)


In [None]:
# Score the held-out split; the fitted model's transform() returns the test
# rows with the model's prediction column added.
predictions = lr_model.transform(dataset=test_data)


In [None]:
# Build one evaluator per metric (RMSE first, then R2); both compare the
# "quality" label against the "prediction" column.
rmse_evaluator, r2_evaluator = (
    RegressionEvaluator(
        labelCol="quality",
        predictionCol="prediction",
        metricName=metric,
    )
    for metric in ("rmse", "r2")
)

# Compute both metrics on the test-set predictions, in the same order.
rmse, r2 = (
    evaluator.evaluate(predictions)
    for evaluator in (rmse_evaluator, r2_evaluator)
)


In [None]:

# Report both test-set metrics, one per line, rounded to four decimals.
print(
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R-squared (R2): {r2:.4f}",
    sep="\n",
)


Root Mean Squared Error (RMSE): 0.6747
R-squared (R2): 0.3496


In [None]:
# Show the fitted weights: pair each input column name with the coefficient at
# the same position, then print the intercept.
print("\nModel Coefficients:")
named_coefficients = zip(feature_columns, lr_model.coefficients)
for feature, coef in named_coefficients:
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {lr_model.intercept:.4f}")



Model Coefficients:
fixed acidity: 0.0394
volatile acidity: -1.0257
citric acid: -0.2147
residual sugar: 0.0088
chlorides: -1.5606
free sulfur dioxide: 0.0058
total sulfur dioxide: -0.0032
density: -15.6626
pH: -0.3384
sulphates: 0.9071
alcohol: 0.2881
Intercept: 19.2050


In [None]:

# Stop the Spark session now that the analysis is finished.
# NOTE: cells above that use `spark` need a new session before they can be re-run.
spark.stop()