In [1]:
# importing required libraries
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.streaming import StreamingContext
import pyspark.sql.types as SQLTypes
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row, Column
import sys

# Create (or reuse) the Spark context and wrap it in a Spark session.
# getOrCreate() keeps this cell re-runnable: a bare SparkContext() raises
# "Cannot run multiple SparkContexts at once" if the cell is executed a
# second time in the same kernel.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [2]:
# Schema of the tweets dataset as (column name, Spark SQL type) pairs.
# Every column is declared nullable.
_column_types = [
    ('id', SQLTypes.IntegerType()),
    ('label', SQLTypes.IntegerType()),
    ('tweet', SQLTypes.StringType()),
]
my_schema = SQLTypes.StructType([
    SQLTypes.StructField(name=col_name, dataType=col_type, nullable=True)
    for col_name, col_type in _column_types
])

# Read the CSV with the explicit schema, treating its first line as a header.
my_data = (
    spark.read
    .schema(my_schema)
    .option('header', True)
    .csv('Twitter_HateSpeech.csv')
)

# Preview the first five rows.
my_data.show(5)

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+
only showing top 5 rows



In [3]:
# ML pipeline that streamed tweets are passed through to generate a prediction.
# Stage order: tokenize -> remove stop words -> word vector -> logistic regression.

# Stage 1: tokenize the 'tweet' column with a regex tokenizer.
stage1 = RegexTokenizer(
    inputCol='tweet',
    outputCol='tokens',
    pattern='\\W',
)

# Stage 2: remove stop words from the tokens.
stage2 = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered_words',
)

# Stage 3: build a word vector from the filtered words.
# (Author's rationale for size 100: tweets can only contain 140 characters max.)
stage3 = Word2Vec(
    inputCol='filtered_words',
    outputCol='vector',
    vectorSize=100,
)

# Stage 4: logistic regression on the word vector, trained against 'label'.
model = LogisticRegression(
    featuresCol='vector',
    labelCol='label',
)

# Assemble the pipeline from the four stages, in order.
pipeline_stages = [stage1, stage2, stage3, model]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training data loaded above.
pipelineFit = pipeline.fit(my_data)

In [4]:
def get_prediction(tweet_text):
    """Score one batch of tweets with the fitted pipeline and forward the result.

    Blank entries are dropped, each remaining text becomes a one-column
    ('tweet') DataFrame row, the batch is run through ``pipelineFit``, and the
    resulting ('tweet', 'prediction') DataFrame is printed and handed to
    ``send_df_to_dashboard``.

    Parameters
    ----------
    tweet_text : RDD
        Batch of tweet texts. Presumably an RDD of strings supplied by
        ``foreachRDD`` (see the TODO in this notebook) -- verify against caller.

    Returns
    -------
    None
        Any exception raised while processing the batch is caught and printed
        so that one bad batch does not stop the caller.
    """
    try:
        # Remove blank tweets.
        non_blank = tweet_text.filter(lambda text: len(text) > 0)

        # Nothing to score in this batch. Returning here also avoids
        # createDataFrame failing schema inference on an RDD with no rows.
        if non_blank.isEmpty():
            return

        # Create the DataFrame with each row being a tweet text.
        row_rdd = non_blank.map(lambda text: Row(tweet=text))
        words_df = spark.createDataFrame(row_rdd)

        # Get the prediction for each row. Keep the DataFrame itself:
        # DataFrame.show() only prints and returns None, so its result must
        # not be what gets passed on to the dashboard.
        df = pipelineFit.transform(words_df).select('tweet', 'prediction')
        df.show()

        send_df_to_dashboard(df)

    except Exception as ex:
        print(ex)

TODO
1. Alter send_df_to_dashboard() to handle the new dataframe schema coming in. (See the output of the 2nd print() statement below for schema details.)
2. Alter main(): change 'words.foreachRDD(process)' to 'words.foreachRDD(get_prediction)'.

In [5]:
# Testing only -- this section is not needed by the rest of the notebook.
# Scores a separate test CSV with the fitted pipeline and reports accuracy.

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# NOTE(review): this file is read with header=False, whereas the training CSV
# is read with header=True -- confirm the test file really has no header row.
test_data = (
    spark.read
    .schema(my_schema)
    .option('header', False)
    .csv('Twitter_HateSpeech_test.csv')
)

evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)

test_predictions = pipelineFit.transform(test_data)
accuracy = evaluator.evaluate(test_predictions)
print("Accuracy:", accuracy)

# Printing the DataFrame object shows its column names and types, not its rows.
print(test_predictions)

Accuracy: 0.9389920424403183
DataFrame[id: int, label: int, tweet: string, tokens: array<string>, filtered_words: array<string>, vector: vector, rawPrediction: vector, probability: vector, prediction: double]
