In [None]:
# Part 1: Spark MLlib examples (linear regression, logistic regression, KMeans)

In [5]:
# Example: fit a one-feature linear regression with Spark MLlib
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Obtain a Spark session (getOrCreate reuses one if it is already running)
spark = SparkSession.builder.appName('MLlib Example').getOrCreate()

# Toy dataset: one (ID, Feature, Target) tuple per row
columns = ['ID', 'Feature', 'Target']
data = [
    (1, 5.0, 20.0),
    (2, 10.0, 25.0),
    (3, 15.0, 30.0),
    (4, 20.0, 35.0),
]
df = spark.createDataFrame(data, columns)

# Pack the numeric 'Feature' column into the vector column 'Features',
# which is the column the regression below is configured to read
assembler = VectorAssembler(inputCols=['Feature'], outputCol='Features')
df_transformed = assembler.transform(df)

# Fit Target ~ Features
lr = LinearRegression(labelCol='Target', featuresCol='Features')
model = lr.fit(df_transformed)

# Report the fitted parameters
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')

25/11/16 20:07:13 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/11/16 20:07:14 WARN Instrumentation: [1ac15d80] regParam is zero, which might cause numerical instability and overfitting.


Coefficients: [0.9999999999999992]
Intercept: 15.000000000000009


In [6]:
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

# Obtain a Spark session (getOrCreate reuses one if it is already running)
spark = SparkSession.builder.appName('LogisticRegressionExample').getOrCreate()

# Toy dataset: (ID, two-element feature list, binary label).
# The feature lists are wrapped in dense MLlib vectors below.
raw_rows = [
    (1, [2.0, 3.0], 0),
    (2, [1.0, 5.0], 1),
    (3, [2.5, 4.5], 1),
    (4, [3.0, 6.0], 0),
]
data = [
    (row_id, Vectors.dense(values), label)
    for row_id, values, label in raw_rows
]
columns = ['ID', 'Features', 'Label']
df = spark.createDataFrame(data, columns)

# Fit the classifier on the vector column 'Features' against 'Label'
lr = LogisticRegression(labelCol='Label', featuresCol='Features')
model = lr.fit(df)

# Report the fitted parameters
print(f'Coefficients: {model.coefficients}')
print(f'Intercept: {model.intercept}')


25/11/16 20:07:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Coefficients: [-12.262057936833582,4.087352269037336]
Intercept: 11.568912734308249


In [8]:
# Practice: KMeans Clustering
from pyspark.ml.clustering import KMeans
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

# Obtain a Spark session (getOrCreate reuses one if it is already running)
spark = SparkSession.builder.appName('KMeansExample').getOrCreate()

# Toy dataset: four 2-D points given as dense MLlib vectors
data = [
    (1, Vectors.dense([1.0, 1.0])),
    (2, Vectors.dense([5.0, 5.0])),
    (3, Vectors.dense([10.0, 10.0])),
    (4, Vectors.dense([15.0, 15.0]))
]
columns = ['ID', 'Features']
df = spark.createDataFrame(data, columns)

# Train KMeans with k=2.
# KMeans initialisation is randomised; pin the seed so the fitted centers
# (and the order in which they are printed) are the same on every run.
kmeans = KMeans(featuresCol='Features', k=2, seed=42)
model = kmeans.fit(df)

# Show cluster centers (a list with one array per cluster)
centers = model.clusterCenters()
print(f'Cluster Centers: {centers}')

25/11/16 20:09:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Cluster Centers: [array([12.5, 12.5]), array([3., 3.])]


In [None]:
# HW: Twitter entity sentiment classification with Spark MLlib

In [10]:
import kagglehub
import os
import pandas as pd

DATASET_HANDLE = "jp797498e/twitter-entity-sentiment-analysis"
TRAIN_FILE = "twitter_training.csv"

# Download the dataset through kagglehub; the call returns the local folder
# that holds the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

# List the files that came with the dataset
files = os.listdir(path)
print("Files in dataset folder:", files)

# Load the training split into pandas.
# NOTE(review): read_csv uses its default header handling here, so the first
# line of the file becomes the column names. The printed head suggests that
# line is actually a data row (columns end up named "2401", "Borderlands",
# "Positive", ...). Later cells reference those exact column names, so any
# switch to header=None / names=[...] must be made together with them.
csv_file = os.path.join(path, TRAIN_FILE)
df_pd = pd.read_csv(csv_file, encoding="utf-8")
print(df_pd.head())
print("Dataset shape:", df_pd.shape)


Path to dataset files: /home/vboxuser/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2
Files in dataset folder: ['twitter_validation.csv', 'twitter_training.csv']
   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     
Dataset shape: (74681, 4)


In [11]:
from pyspark.sql import SparkSession

# Obtain a Spark session (getOrCreate reuses one if it is already running,
# in which case only the app name requested here may not take effect)
session_builder = SparkSession.builder.appName("TwitterSentimentAnalysis")
spark = session_builder.getOrCreate()

# Convert the pandas frame into a Spark DataFrame, then inspect the first
# rows and the inferred schema
df = spark.createDataFrame(df_pd)
df.show(5)
df.printSchema()


25/11/16 20:14:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
25/11/16 20:14:13 WARN TaskSetManager: Stage 88 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.


+----+-----------+--------+-----------------------------------------------------+
|2401|Borderlands|Positive|im getting on borderlands and i will murder you all ,|
+----+-----------+--------+-----------------------------------------------------+
|2401|Borderlands|Positive|                                 I am coming to th...|
|2401|Borderlands|Positive|                                 im getting on bor...|
|2401|Borderlands|Positive|                                 im coming on bord...|
|2401|Borderlands|Positive|                                 im getting on bor...|
|2401|Borderlands|Positive|                                 im getting into b...|
+----+-----------+--------+-----------------------------------------------------+
only showing top 5 rows
root
 |-- 2401: long (nullable = true)
 |-- Borderlands: string (nullable = true)
 |-- Positive: string (nullable = true)
 |-- im getting on borderlands and i will murder you all ,: string (nullable = true)



In [19]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.ml import Pipeline

# Column names as they exist in `df`: the sentiment column is literally
# called "Positive" and the tweet-text column is named after a whole tweet.
LABEL_COL = "Positive"
TEXT_COL = "im getting on borderlands and i will murder you all ,"

# Stage 1: map the sentiment strings to numeric class indices in "label"
label_indexer = StringIndexer(inputCol=LABEL_COL, outputCol="label")
# Stage 2: split the tweet text into word tokens
tokenizer = Tokenizer(inputCol=TEXT_COL, outputCol="words")
# Stage 3: drop stop words from the token list
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
# Stage 4: hash the remaining tokens into a 10000-dimensional term-frequency vector
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=10000)
# Stage 5: rescale term frequencies by inverse document frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")

preprocessing_pipeline = Pipeline(
    stages=[label_indexer, tokenizer, remover, hashingTF, idf]
)

# NOTE(review): the pipeline is fitted on the full DataFrame `df`, i.e. the
# label index and IDF weights are learned from every row, including rows that
# are later held out for testing. Consider splitting first and fitting on the
# training part only.
fitted_preprocessing = preprocessing_pipeline.fit(df)
preprocessed_data = fitted_preprocessing.transform(df)

preprocessed_data.select("label", "words", "features").show(5, truncate=False)


25/11/16 20:23:32 WARN TaskSetManager: Stage 211 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:23:33 WARN TaskSetManager: Stage 217 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+-----+----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|label|words                                                                 |features                                                                                                                                           |
+-----+----------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|1.0  |[i, am, coming, to, the, borders, and, i, will, kill, you, all,]      |(10000,[1161,4409,5201,6586],[6.586265389999397,5.131949502782186,7.110120514055721,4.912895936719502])                                            |
|1.0  |[im, getting, on, borderlands, and, i, will, kill, you, all,]         |(10000,[1161,2

25/11/16 20:23:34 WARN TaskSetManager: Stage 218 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.


In [20]:
# Random 80/20 split of the preprocessed rows; the fixed seed keeps the
# partition repeatable for the same input DataFrame.
split_parts = preprocessed_data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = split_parts


In [22]:
from pyspark.ml.classification import LogisticRegression

# Baseline classifier with default hyper-parameters, trained on the
# "features" vectors against the indexed "label" column of the training split
lr = LogisticRegression(labelCol="label", featuresCol="features")
lr_model = lr.fit(train_data)

# Inspect the learned parameters
print("Coefficient Matrix:\n", lr_model.coefficientMatrix)
print("Intercept Vector:\n", lr_model.interceptVector)

25/11/16 20:25:22 WARN TaskSetManager: Stage 325 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:23 WARN TaskSetManager: Stage 326 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:24 WARN TaskSetManager: Stage 327 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:24 WARN TaskSetManager: Stage 328 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:24 WARN TaskSetManager: Stage 329 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:24 WARN TaskSetManager: Stage 330 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:25:24 WARN TaskSetManager: Stage 331 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.

Coefficient Matrix:
 DenseMatrix([[ 2.00487113,  0.33944019, -0.2729883 , ...,  1.21396079,
              -3.13027651, -1.70424896],
             [ 2.38081376,  0.8466788 , -0.09375376, ..., -1.96510747,
               2.6203698 , -0.62964589],
             [-1.02306007, -0.14641187, -1.60616593, ...,  0.79172071,
               2.77850813,  2.34836429],
             [-3.36262481, -1.03970712,  1.97290799, ..., -0.04057402,
              -2.26860142, -0.01446945]])
Intercept Vector:
 [0.3348234890201146,0.43736223539597874,0.00804616614561851,-0.7802318905617119]


In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy scorer comparing the "prediction" column against "label"
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Score the held-out split with the baseline model and eyeball a few rows
predictions = lr_model.transform(test_data)
predictions.select("label", "prediction", "probability").show(20, truncate=False)

accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.4f}")


25/11/16 20:26:50 WARN TaskSetManager: Stage 437 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+-----+----------+----------------------------------------------------------------------------------------+
|label|prediction|probability                                                                             |
+-----+----------+----------------------------------------------------------------------------------------+
|0.0  |0.0       |[0.9992311223406702,3.9058940474305846E-8,7.688095237406911E-4,2.9076648536000017E-8]   |
|0.0  |0.0       |[0.9999999997324449,2.6755510901890877E-10,5.64872460014064E-34,1.4180765190674046E-24] |
|0.0  |0.0       |[1.0,3.98432189300851E-31,5.086878362468411E-45,2.3578402872212698E-30]                 |
|2.0  |2.0       |[5.692487560192432E-10,0.039315091187276836,0.9606849082434745,7.833585793008625E-20]   |
|0.0  |0.0       |[0.9999999999999534,4.206130140397243E-14,4.610244930975534E-15,1.6844030267513463E-25] |
|0.0  |0.0       |[0.9999999999999836,1.637574754478724E-14,1.5389534021770488E-25,2.5112041503808037E-28]|
|3.0  |3.0       |[2.8169971

25/11/16 20:26:50 WARN TaskSetManager: Stage 438 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
[Stage 438:>                                                        (0 + 4) / 4]

Test Accuracy: 0.6837


                                                                                

In [26]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Hyper-parameter grid for the logistic regression estimator `lr`:
# 3 regParam x 3 elasticNetParam x 2 maxIter = 18 candidate settings.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 0.5])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .addGrid(lr.maxIter, [50, 100])
    .build()
)

# Candidates are ranked by accuracy of "prediction" against "label"
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy"
)

# 3-fold cross-validation over the grid. The seed pins the random fold
# assignment so the selected hyper-parameters are reproducible across runs.
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=42,
)

# Fit on the training split only; the test split stays untouched until the end
cv_model = crossval.fit(train_data)

best_model = cv_model.bestModel

# Read the winning hyper-parameters through the model's public getters rather
# than reaching into the private `_java_obj` handle.
print("Best regParam:", best_model.getRegParam())
print("Best elasticNetParam:", best_model.getElasticNetParam())
print("Best maxIter:", best_model.getMaxIter())

# Evaluate the selected model on the held-out test split
predictions = best_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy after CV: {accuracy:.4f}")


25/11/16 20:28:03 WARN TaskSetManager: Stage 440 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 441 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 442 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 443 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 444 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 445 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
25/11/16 20:28:05 WARN TaskSetManager: Stage 446 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.

Best regParam: 0.01
Best elasticNetParam: 0.0
Best maxIter: 50


25/11/16 20:31:03 WARN TaskSetManager: Stage 2314 contains a task of very large size (2027 KiB). The maximum recommended task size is 1000 KiB.
[Stage 2314:>                                                       (0 + 4) / 4]

Test Accuracy after CV: 0.7293


                                                                                