In [1]:
# Create a folder called dataset
!mkdir -p dataset

# Download the cleaned AG news file
!curl -o dataset/agnews_clean.csv https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews_clean.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 33.2M  100 33.2M    0     0   218M      0 --:--:-- --:--:-- --:--:--  219M


In [6]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create a SparkSession to run PySpark commands
# the .master() command tells Spark to run locally
# the .appName() names it AG news
# the .getOrCreate() starts a new session
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("AG news")
    .getOrCreate()
)

In [7]:
# Load the AG News dataset into a Spark data frame
agnews = spark.read.csv("dataset/agnews_clean.csv", header=True, inferSchema=True)

# Import F and ArrayType and StringType
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType

# The filtered column is currently a string that looks like a list. 
# This converts it into an actual array of strings to work with individual words
agnews = agnews.withColumn("filtered", F.from_json("filtered", ArrayType(StringType())))

# Show the first 5 rows to see if everything looks okay
agnews.show(5, truncate=30)

+---+------------------------------+
|_c0|                      filtered|
+---+------------------------------+
|  0|[wall, st, bears, claw, bac...|
|  1|[carlyle, looks, toward, co...|
|  2|[oil, economy, cloud, stock...|
|  3|[iraq, halts, oil, exports,...|
|  4|[oil, prices, soar, time, r...|
+---+------------------------------+
only showing top 5 rows


25/05/24 22:09:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv


In [8]:
agnews.columns

['_c0', 'filtered']

In [18]:
# Break apart each array of words in the filtered column so each word gets its own row
# I create a new column called word with one word per row per document
exploded = agnews.withColumn("word", F.explode("filtered"))

# Rename the default column name "_c0" to "id"
exploded = exploded.withColumnRenamed("_c0", "id")

# Show the first 20 rows to check that there is one word per row and an id for each doc
exploded.select("id", "word").show(20)

+---+---------+
| id|     word|
+---+---------+
|  0|     wall|
|  0|       st|
|  0|    bears|
|  0|     claw|
|  0|     back|
|  0|    black|
|  0|  reuters|
|  0|  reuters|
|  0|    short|
|  0|  sellers|
|  0|     wall|
|  0|   street|
|  0|dwindling|
|  0|     band|
|  0|    ultra|
|  0|   cynics|
|  0|   seeing|
|  0|    green|
|  1|  carlyle|
|  1|    looks|
+---+---------+
only showing top 20 rows


25/05/24 22:36:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv


In [11]:
# Count how many times each word appears in each document
word_counts = exploded.groupBy("id", "word").count().withColumnRenamed("count", "word_count")

# Count how many total words are in each document
total_words = exploded.groupBy("id").count().withColumnRenamed("count", "total_words")

# Join the word counts and total word counts on the document ID
tf = word_counts.join(total_words, on="id")

# Calculate tf by dividing word count by the total words
tf = tf.withColumn("tf", F.col("word_count") / F.col("total_words"))

# Check the result to see if it looks alright
tf.select("id", "word", "tf").show(5)

25/05/24 22:11:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
25/05/24 22:11:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
[Stage 12:>                                                         (0 + 1) / 1]

+---+----------+--------------------+
| id|      word|                  tf|
+---+----------+--------------------+
| 10|    stocks|                0.04|
| 21|    nation| 0.05555555555555555|
| 36|      news| 0.07692307692307693|
| 44|     salem| 0.02631578947368421|
| 48|government|0.047619047619047616|
+---+----------+--------------------+
only showing top 5 rows


                                                                                

In [12]:
# Count how many documents each word appears in
# Duplicates are dropped so each document and word pair is unique
# The document frequency is counted by how many documents each word shows up in
doc_freq = exploded.select("id", "word").distinct().groupBy("word").count().withColumnRenamed("count", "df")

# Get the total number of documents in the dataset
num_docs = agnews.count()

# Calculate the inverse document frequency
# The log of the total documents divided by document frequency
idf = doc_freq.withColumn("idf", F.log(F.lit(num_docs) / F.col("df")))

# Preview the IDF values
idf.select("word", "idf").show(5)

25/05/24 22:11:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv

+---------+------------------+
|     word|               idf|
+---------+------------------+
|    still|4.0242864276084385|
|arguments| 7.245796143375976|
|   doubts| 6.609161173079373|
|   online|3.9552643296013406|
|   filing| 5.930655542512376|
+---------+------------------+
only showing top 5 rows


                                                                                

In [16]:
# Join the tf and idf table together on "word"
# Each word now has its term frequency and inverse document frequency
tf_idf = tf.join(idf, on="word")

# Multiply the tf and idf together to get the final tf*idf score for each word in each document
tf_idf = tf_idf.withColumn("tf_idf", F.col("tf") * F.col("idf"))

# Filtering the first few documents with id less than 5, sorting by document ID, and then showing the first 50 results
tf_idf.filter(F.col("id") < 5).select("id", "word", "tf_idf").orderBy("id", ascending=True).show(50, truncate=False)

25/05/24 22:13:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
25/05/24 22:13:11 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
25/05/24 22:13:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
[Stage 69:>                                                         (0 + 2) / 2]

+---+-------------+-------------------+
|id |word         |tf_idf             |
+---+-------------+-------------------+
|0  |wall         |0.5115985326511431 |
|0  |cynics       |0.563734318747707  |
|0  |green        |0.2877107940095433 |
|0  |ultra        |0.4125512394225831 |
|0  |claw         |0.499114829314058  |
|0  |back         |0.1892216338539946 |
|0  |st           |0.2584728642725166 |
|0  |sellers      |0.4468379768438066 |
|0  |dwindling    |0.4572386180709258 |
|0  |band         |0.3643421454792778 |
|0  |reuters      |0.24754017186645658|
|0  |bears        |0.3372044607529448 |
|0  |black        |0.2953171727366614 |
|0  |short        |0.2773120373951269 |
|0  |seeing       |0.37743394553516213|
|0  |street       |0.24678348986493034|
|1  |industry     |0.15043731768548949|
|1  |aerospace    |0.2581171817448437 |
|1  |toward       |0.1898997183872362 |
|1  |carlyle      |0.7168306746824437 |
|1  |timed        |0.324478643568105  |
|1  |investment   |0.1890771769001148 |


                                                                                

In [17]:
# Save the final results (with document ID, word, and tf*idf score) to a CSV file
tf_idf.select("id", "word", "tf_idf").write.csv("tfidf_output", header=True)

25/05/24 22:15:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
25/05/24 22:15:28 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
25/05/24 22:15:30 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , filtered
 Schema: _c0, filtered
Expected: _c0 but found: 
CSV file: file:///home/ubuntu/DE300_HW3/dataset/agnews_clean.csv
                                                                                

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/ubuntu/DE300_HW3/tfidf_output already exists. Set mode as "overwrite" to overwrite the existing path. SQLSTATE: 42K04

In [None]:
# PART 2

In [3]:
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/data_for_svm.csv
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/w.csv
!curl -O https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/bias.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 61.9M  100 61.9M    0     0   208M      0 --:--:-- --:--:-- --:--:--  209M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1391  100  1391    0     0  14588      0 --:--:-- --:--:-- --:--:-- 14642
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100    22  100    22    0     0    227      0 --:--:-- --:--:-- --:--:--   229


In [5]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m133.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading numpy-2.2.6-cp312-cp312-manylinux

In [6]:
# Import pands and numpy
import pandas as pd
import numpy as np

# Load the main dataset
X_y = pd.read_csv('data_for_svm.csv', header=None)

# Load the weight vector dataset, reshaping it from a 2D to 1D array
w = pd.read_csv('w.csv', header=None).values.flatten()

# Load the bias dataset
b = pd.read_csv('bias.csv', header=None).values[0][0]

# Split the dataset into features X and labels Y
# This selects all rows and columns except the last
X = X_y.iloc[:, :-1].values

# This selects all rows and the last column
y = X_y.iloc[:, -1].values

In [7]:
def loss_SVM(w, b, X, y, lam=0.01):
    """
    Computes the soft margin SVM objective function
    """

    # Compute the regularization term
    reg_term = lam * np.linalg.norm(w) ** 2

    # Compute hinge losses for all samples
    margins = 1 - y * (X @ w + b)
    hinge_losses = np.maximum(0, margins)

    # Average hinge loss
    avg_hinge = np.mean(hinge_losses)

    # Total loss
    total_loss = reg_term + avg_hinge
    return total_loss

In [8]:
loss = loss_SVM(w, b, X, y)
print("SVM Objective Loss:", loss)

# We can see the SVM objective loss is printed below.

SVM Objective Loss: 0.9997559286225162


In [9]:
def predict_SVM(w, b, X):
    """
    Predict labels for input features using SVM decision rule.
    """
    scores = X @ w + b
    return np.sign(scores)

In [10]:
y_pred = predict_SVM(w, b, X)
print(y_pred[:10])

[-1. -1. -1.  1. -1.  1. -1. -1.  1. -1.]
