In [1]:
import pandas as pd
import numpy as np
import io
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
!pip install pyspark
from pyspark.sql import SparkSession
!pip install pyyaml h5py  # Required to save models in HDF5 format

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/89/db/e18cfd78e408de957821ec5ca56de1250645b05f8523d169803d8df35a64/pyspark-3.1.2.tar.gz (212.4MB)
[K     |████████████████████████████████| 212.4MB 63kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 50.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=bedc6bb84307a6079651f6e262e26a0843ae73e80a27bf6f2f929e8f395b8420
  Stored in directory: /root/.cache/pip/wheels/40/1b/2c/30f43be2627857ab80062bef1527c0128f7b4070b6b2d02139
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
from google.colab import files
uploaded = files.upload()
df = pd.read_csv(io.StringIO(uploaded['sentiment-analysis-dataset.csv'].decode('utf-8')))


Saving sentiment-analysis-dataset.csv to sentiment-analysis-dataset.csv


  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
#show data
df.head()


Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,1,0,Sentiment140,is so sad for my APL frie...,,,
1,2,0,Sentiment140,I missed the New Moon trail...,,,
2,3,1,Sentiment140,omg its already 7:30 :O,,,
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,,,
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,,,


In [4]:
#Cleaning Dataframe data
cleandata = df[['ItemID', 'Sentiment', 'SentimentText']]
cleandata.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [5]:
type(cleandata)

pandas.core.frame.DataFrame

In [6]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.appName('pandasToSparkDF').getOrCreate()
df = spark.createDataFrame(cleandata)

In [7]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['SentimentText']))
data_df.show()

+------+---------+--------------------+------+
|ItemID|Sentiment|       SentimentText|length|
+------+---------+--------------------+------+
|     1|        0|                 ...|    61|
|     2|        0|                 ...|    51|
|     3|        1|              omg...|    37|
|     4|        0|          .. Omga...|   132|
|     5|        0|         i think ...|    53|
|     6|        0|         or i jus...|    42|
|     7|        1|       Juuuuuuuuu...|    41|
|     8|        0|       Sunny Agai...|    61|
|     9|        1|      handed in m...|    53|
|    10|        1|      hmmmm.... i...|    46|
|    11|        0|      I must thin...|    35|
|    12|        1|      thanks to a...|    61|
|    13|        0|      this weeken...|    36|
|    14|        0|     jb isnt show...|    43|
|    15|        0|     ok thats it ...|    25|
|    16|        0|    &lt;-------- ...|    52|
|    17|        0|    awhhe man.......|   101|
|    18|        1|    Feeling stran...|    82|
|    19|     

In [8]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='Sentiment',outputCol='label')
tokenizer = Tokenizer(inputCol="SentimentText", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')


In [9]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [10]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [11]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [12]:
# Show label and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|(262145,[23825,74...|
|  1.0|(262145,[89833,16...|
|  0.0|(262145,[2306,809...|
|  1.0|(262145,[12524,22...|
|  1.0|(262145,[95227,15...|
|  1.0|(262145,[117975,1...|
|  0.0|(262145,[18073,18...|
|  1.0|(262145,[27139,29...|
|  0.0|(262145,[2306,454...|
|  0.0|(262145,[99648,13...|
|  1.0|(262145,[153423,1...|
|  0.0|(262145,[61094,64...|
|  1.0|(262145,[138837,1...|
|  1.0|(262145,[2284,283...|
|  1.0|(262145,[10345,18...|
|  1.0|(262145,[51471,61...|
|  1.0|(262145,[17715,71...|
|  0.0|(262145,[15392,16...|
|  1.0|(262145,[45190,12...|
|  1.0|(262145,[12905,16...|
+-----+--------------------+
only showing top 20 rows



In [13]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [14]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(10)

+------+---------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|ItemID|Sentiment|       SentimentText|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+------+---------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     6|        0|         or i jus...|    42|  1.0|[, , , , , , , , ...|[, , , , , , , , ...|(262144,[117975,1...|(262144,[117975,1...|(262145,[117975,1...|[-238.40334989398...|[0.67939809780290...|       0.0|
|     9|        1|      handed in m...|    53|  0.0|[, , , , , , hand...|[, , , , , , hand...|(262144,[2306,454...|(262144,[2306,454...|(262145,[2306,454...

In [15]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.688897


+------+---------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|ItemID|Sentiment|       SentimentText|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+------+---------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     6|        0|         or i jus...|    42|  1.0|[, , , , , , , , ...|[, , , , , , , , ...|(262144,[117975,1...|(262144,[117975,1...|(262145,[117975,1...|[-238.40334989398...|[0.67939809780290...|       0.0|
|     9|        1|      handed in m...|    53|  0.0|[, , , , , , hand...|[, , , , , , hand...|(262144,[2306,454...|(262144,[2306,454...|(262145,[2306,454...

NameError: ignored