In [1]:
# Imports
import os  # filenames
from time import time  # timing
import numpy as np  # arrays
import pandas as pd  # dataframes
from tqdm import tqdm  # for loop "loading bars"
from bs4 import BeautifulSoup  # xml/sgm parsing
from sklearn.feature_extraction.text import TfidfVectorizer  # scikit learn tfidf

In [2]:
# Spark imports
import findspark
findspark.init("/Users/elliot/spark")  # get spark here
from pyspark.sql import SparkSession  # session to run spark
from pyspark.ml.feature import HashingTF, IDF, Tokenizer  # feature extractor
from pyspark.ml.classification import NaiveBayes  # classifier
from pyspark.sql.functions import udf  # user defined function
from pyspark.sql.types import *  # work with various types in the rdd

In [3]:
# NLTK imports
import nltk  # natural language toolkit
from nltk.stem import PorterStemmer  # stem words
nltk.download('punkt')
from nltk.tokenize import word_tokenize  # tokenize sentences
from nltk.corpus import stopwords  # remove stopwords
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/elliot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elliot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Select the sgm files
data_dir = "data"
data_files = os.listdir(data_dir)
sgm_files = [file for file in data_files if file.endswith('.sgm')]
sgm_files.sort()  # in place sort for sanity

In [5]:
# Create a dataframe of each non-empty topic and body pair, parsed from the reuters xml/sgm data
all_list = []

# Deal with each file
for sgm_file in sgm_files:
    with open(os.path.join(data_dir, sgm_file), 'r') as f:
        xml_data = f.read()  
        document = BeautifulSoup(xml_data, 'html.parser')
        articles = document.find_all('reuters')
        
        # Deal with each article
        for article in articles:
            
            # Extract the body of the article
            body = article.find('body')
            
            topics = []
            
            if body is not None:
                # Extract the topics from the article
                for topic in article.topics.find_all('d'):
                    topic = topic.text.split('-')  # Split dashed topics into two separate categories
                    
                    if topic[0] not in topics:
                        topics.append(topic[0])
                    if len(topic) == 2:
                        if topic[1] not in topics:
                            topics.append(topic[1]) 
                        
                for topic in topics:
                    all_list.append((topic, body.text))
                
                                         
all_articles = pd.DataFrame.from_records(all_list, columns=['topic', 'body'])

# print(all_articles)
# print(all_articles.shape)

In [6]:
# Filter on the selected topics [10 points]

topics = ["money", 
          "fx", 
          "crude", 
          "grain", 
          "trade", 
          "interest", 
          "wheat", 
          "ship", 
          "corn", 
          "oil", 
          "dlr", 
          "gas", 
          "oilseed", 
          "supply", 
          "sugar", 
          "gnp", 
          "coffee", 
          "veg", 
          "gold", 
          "soybean", 
          "bop", 
          "livestock", 
          "cpi"]

topic_filter = all_articles['topic'].isin(topics)
filtered_articles = all_articles[topic_filter]

# print(filtered_articles)
# print(filtered_articles.shape)

In [7]:
# Character removal improvements
clean_list = []

for index, row in tqdm(filtered_articles.iterrows()):
    body = row['body']
    body = body.replace('\n', ' ')
    body = body.replace('/', ' ')
    body = body.replace('-', ' ')
    
    clean_list.append((row['topic'], body))
    
clean_articles = pd.DataFrame.from_records(clean_list, columns=['topic', 'body'])

# print(clean_articles)
# print(clean_articles.shape)

6325it [00:00, 11073.86it/s]


In [8]:
# Tokenize, remove stop words, and stem [10 points]

stop_words = set(stopwords.words('english'))  # use nltk english stopwords
ps = PorterStemmer()  # use porterstemmer

stem_list = []

# Iterate over every article iterations
for index, row in tqdm(filtered_articles.iterrows()):
    
    words = word_tokenize(row['body']) # tokenize the article
    
    # For each word in the article
    stemmed = ""
    for word in words:
        if word.lower() not in stop_words:  # remove stop words
            stemmed += ps.stem(word.lower()) + " "  # stem the word
        
    stem_list.append((row['topic'], stemmed))
    
stemmed_articles = pd.DataFrame.from_records(stem_list, columns=['topic', 'body'])

# print(stemmed_articles)
# print(stemmed_articles.shape)

6325it [00:37, 170.08it/s]


In [9]:
# Write dataframe to a file training_test_data.txt 
# and print the first ten tuples to another file (example_data.txt) [5 points]

stemmed_articles.to_csv("training_test_data.txt", index=False)

print_articles = pd.read_csv("training_test_data.txt")
print_articles.loc[0:9].to_csv("example_data.txt", index=False)

In [10]:
# Non Spark TF-IDF

# read in
doc_data = pd.read_csv("training_test_data.txt", dtype={'topic': str, 'body': str})

start = time()

#tfidf
tfidf_vectorizer = TfidfVectorizer()  # min_df = 1, ngram_range=(1, 1)
x = tfidf_vectorizer.fit_transform(doc_data['body'])

end = time()

print("The non-spark tfidf took {0:.2f} seconds".format(end - start))

x = x.toarray()

# print(x)
# print(x.shape)

The non-spark tfidf took 0.78 seconds


In [11]:
# Spark TF-IDF

# start spark session
spark = SparkSession.builder.appName("NewsClassification").getOrCreate()

# read in
df = spark.read.csv("training_test_data.txt", header=True) 

start = time()

# re-tokenize
tokenizer = Tokenizer(inputCol="body", outputCol="tokens")
df = tokenizer.transform(df)

# tf
hashingTF = HashingTF(inputCol='tokens', outputCol='term_frequency')
df = hashingTF.transform(df)

# idf
idf = IDF(inputCol='term_frequency', outputCol='tfidf', minDocFreq=1)
idfModel = idf.fit(df)
df = idfModel.transform(df)

end = time()

print("The spark tfidf took {0:.2f} seconds".format(end - start))

# df.show()
# df.printSchema()

The spark tfidf took 1.52 seconds


How long do the non-spark and spark versions take to create the tfidf representation? [20 points]

How do the two representations compare? [5 points]


In [12]:
# Fix the labels to be integers
topics = ["money", 
          "fx", 
          "crude", 
          "grain", 
          "trade", 
          "interest", 
          "wheat", 
          "ship", 
          "corn", 
          "oil", 
          "dlr", 
          "gas", 
          "oilseed", 
          "supply", 
          "sugar", 
          "gnp", 
          "coffee", 
          "veg", 
          "gold", 
          "soybean", 
          "bop", 
          "livestock", 
          "cpi"]

# Encode the labels
def label_encoder(topic):
    if topic in topics:
        return topics.index(topic)
    else:
        return -1
        
label_encoder_udf = udf(label_encoder, IntegerType())
df = df.withColumn('topic_int', label_encoder_udf('topic'))

# df.show()
# df.printSchema()

In [13]:
# Split the data into Training-Validation, and Testing Data
df = df.select('tfidf', 'topic_int')

training_validating, testing = df.randomSplit([0.9, 0.1], seed=1937)

# training_validating.show()
# testing.show()

In [14]:
# Split the rest of the data 3 different ways (56/44, 67/33, and 78/22)
# Train each split 10 times using NaiveBayes.train(), and save the 30 models. 
# Report the average accuracy per split [20 points]
# Report the model with the best accuracy of the 30 on the testing dataset, and what that accuracy is [15 points]

avg = 0
models_50 = []
for i in range(0, 10):
    
    # Split the data into training and validating Data
    training, validating = training_validating.randomSplit([0.56, 0.44])

    # Apply naive bayes
    nb = NaiveBayes(featuresCol='tfidf', labelCol='topic_int', predictionCol="NB_pred", probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
    nbModel = nb.fit(training)
    models_50.append(nbModel)

    # Get validation accuracy
    predicting = nbModel.transform(validating)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    avg += correct/total
    
    # Get testing accuracy
    predicting = nbModel.transform(testing)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    print("50% data testing accuracy:", correct/total)
    
print("50% data average validation accuracy: {}".format(avg / 10.0))

avg = 0
models_60 = []
for i in range(0, 10):
    
    # Split the data into training and validating Data
    training, validating = training_validating.randomSplit([0.67, 0.33])
    
    # Apply naive bayes
    nb = NaiveBayes(featuresCol='tfidf', labelCol='topic_int', predictionCol="NB_pred", probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
    nbModel = nb.fit(training)
    models_60.append(nbModel)

    # Get validation accuracy
    predicting = nbModel.transform(validating)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    avg += correct/total
    
    # Get testing accuracy
    predicting = nbModel.transform(testing)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    print("60% training data accuracy:", correct/total)

print("60% data average validation accuracy: {}".format(avg / 10.0))
    
avg = 0
models_70 = []
for i in range(0, 10):
    
    # Split the data into training and validating
    training, validating = training_validating.randomSplit([0.78, 0.22])

    # Apply naive bayes
    nb = NaiveBayes(featuresCol='tfidf', labelCol='topic_int', predictionCol="NB_pred", probabilityCol="NB_prob", rawPredictionCol="NB_rawPred")
    nbModel = nb.fit(training)
    models_70.append(nbModel)

    # Get validation accuracy
    predicting = nbModel.transform(validating)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    avg += correct/total
    
    # Get testing accuracy
    predicting = nbModel.transform(testing)
    total = predicting.count()
    correct = predicting.where(predicting['topic_int'] == predicting['NB_pred']).count()
    print("70% training data accuracy:", correct/total)

print("70% data average validation accuracy: {}".format(avg / 10.0))
    

50% data testing accuracy: 0.328
50% data testing accuracy: 0.3392
50% data testing accuracy: 0.3568
50% data testing accuracy: 0.3376
50% data testing accuracy: 0.344
50% data testing accuracy: 0.3296
50% data testing accuracy: 0.3184
50% data testing accuracy: 0.312
50% data testing accuracy: 0.3312
50% data testing accuracy: 0.3232
50% data average validation accuracy: 0.3386784109632125
60% training data accuracy: 0.3104
60% training data accuracy: 0.32
60% training data accuracy: 0.3184
60% training data accuracy: 0.3088
60% training data accuracy: 0.304
60% training data accuracy: 0.3184
60% training data accuracy: 0.312
60% training data accuracy: 0.3152
60% training data accuracy: 0.312
60% training data accuracy: 0.3248
60% data average validation accuracy: 0.32674124798596893
70% training data accuracy: 0.3088
70% training data accuracy: 0.3008
70% training data accuracy: 0.3216
70% training data accuracy: 0.3104
70% training data accuracy: 0.312
70% training data accuracy: 0

What trends occur when the amount of training data increases? [15 points]

In [15]:
spark.stop()