BDDA Assignment question 1

Divik Mathur; 015031
BDA-01

## Text Classification of Corona Tweets - NLP

In [29]:
from pyspark import SparkContext

In [30]:
#Creating Spark Session
from pyspark.sql import SparkSession

In [31]:
#Initializing the TextClassifier app
spark= SparkSession.builder.appName('TextClassifierApp').getOrCreate()

In [32]:
#Reading file
df = spark.read.csv('Corona_NLP_train.csv', header=True, inferSchema=True)

### About the data

In [33]:
df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|            UserName|          ScreenName|            Location|             TweetAt|       OriginalTweet|Sentiment|
+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|                3799|               48751|              London|          16-03-2020|@MeNyrbie @Phil_G...|  Neutral|
|                3800|               48752|                  UK|          16-03-2020|advice Talk to yo...| Positive|
|                3801|               48753|           Vagabonds|          16-03-2020|Coronavirus Austr...| Positive|
|                3802|               48754|                null|          16-03-2020|My food stock is ...|     null|
|              PLEASE|         don't panic| THERE WILL BE EN...|                null|                null|     null|
|           Stay calm|          stay safe.|                null|

In [34]:
df.columns

['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

In [35]:
df.printSchema()

root
 |-- UserName: string (nullable = true)
 |-- ScreenName: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- TweetAt: string (nullable = true)
 |-- OriginalTweet: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [36]:
#dropping the duplicate values
df = df.dropDuplicates()
print(df.count())

65274


In [37]:
df.toPandas()['Sentiment'].isnull().sum()

36657

In [38]:
df = df.dropna(subset=('Sentiment'))

In [39]:
#filtering out data in the dataset
df = df.filter(df.Sentiment.isin(sentiments))

In [40]:
#defining the list of different sentiments
sentiments = ['Positive','Negative','Neutral','Extremely Positive','Extremely Negative']

### Data Preparation

In [41]:
#Computes the character length of string data
from pyspark.sql.functions import length

In [42]:
#Viewing the length of each and every tweet
df=df.withColumn('length', length(df['OriginalTweet']))

In [43]:
#Viewing the length of each and every tweet
df.show()

+--------+----------+-------------------+----------+--------------------+------------------+------+
|UserName|ScreenName|           Location|   TweetAt|       OriginalTweet|         Sentiment|length|
+--------+----------+-------------------+----------+--------------------+------------------+------+
|    4754|     49706|     In the kitchen|17-03-2020|Going to the groc...|Extremely Positive|   182|
|    4894|     49846|                 ??|17-03-2020|Two grocery store...|          Positive|   185|
|    4927|     49879|     Washington, DC|17-03-2020|For more updates,...|           Neutral|    71|
|    5331|     50283|    London, England|17-03-2020|I think peoples g...|Extremely Negative|   258|
|    5674|     50626|     Pozna?, Polska|17-03-2020|Not only does uni...|Extremely Positive|   240|
|    6238|     51190|                 DC|17-03-2020|Any news on mortg...|          Positive|   199|
|    6392|     51344|               null|17-03-2020|@agirlmegan Know ...|          Negative|   255|


In [44]:
#renaming the column
df=df.withColumnRenamed("Sentiment","sentiment")

In [45]:
#calculating the mean length of different sentiments
df.groupby('Sentiment').mean().show()

+------------------+------------------+
|         Sentiment|       avg(length)|
+------------------+------------------+
|Extremely Negative| 209.6656891495601|
|          Positive|193.66195905675045|
|           Neutral| 151.2949846860643|
|          Negative| 189.6651596908269|
|Extremely Positive| 215.0605167724388|
+------------------+------------------+



### Processing

In [46]:
#Performing Tokeization
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer=Tokenizer(inputCol="OriginalTweet", outputCol="token_text")

In [48]:
#Removing Stopwords from the comments using count vectorizer, inverse document frequency and string indexer to transform the text
stopremove=StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
count_vec=CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf=IDF(inputCol="c_vec", outputCol="tf_idf")

In [49]:
# converting the labels in numbers
label_to_num = StringIndexer(inputCol="sentiment", outputCol='label')

In [50]:
#feature transformer that merges multiple columns into a vector column.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [51]:
cleaned = VectorAssembler(inputCols=['tf_idf','length'], outputCol='features')

### Creating Model 

In [52]:
from pyspark.ml.classification import DecisionTreeClassifier
#instantitating decision tree classifier

dtc=DecisionTreeClassifier()

### Creating Pipeline

In [53]:
from pyspark.ml import Pipeline

#building a pieline for pre-processing the text 

pipeline= Pipeline(stages=[label_to_num, tokenizer, stopremove, count_vec, idf, cleaned])

In [56]:
#fitting the model

cleaned_data = pipeline.fit(df)

In [57]:
#transforming the data

cleaned_data = cleaned_data.transform(df)

In [58]:
#extracting labels and features 

cleaned_data = cleaned_data.select(['label', 'features'])

In [59]:
#viewing the selected columns

cleaned_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  3.0|(78306,[0,3,6,12,...|
|  0.0|(78306,[0,3,6,51,...|
|  2.0|(78306,[398,10236...|
|  4.0|(78306,[4,7,33,34...|
|  3.0|(78306,[7,10,36,5...|
|  0.0|(78306,[0,1,10,23...|
|  1.0|(78306,[1,5,8,21,...|
|  2.0|(78306,[3,6,88,99...|
|  0.0|(78306,[1,7,9,11,...|
|  0.0|(78306,[13,27,48,...|
|  1.0|(78306,[8,21,28,3...|
|  2.0|(78306,[0,3,57,71...|
|  4.0|(78306,[4,8,17,21...|
|  3.0|(78306,[0,7,17,19...|
|  0.0|(78306,[0,3,6,70,...|
|  3.0|(78306,[0,3,6,16,...|
|  2.0|(78306,[0,3,6,72,...|
|  2.0|(78306,[16,17,117...|
|  1.0|(78306,[10,96,98,...|
|  1.0|(78306,[5,7,9,11,...|
+-----+--------------------+
only showing top 20 rows



### Training the Model

In [60]:
#Spilting the data into train and test

(training, testing)=cleaned_data.randomSplit([0.7,0.3])

### Decision Tree

In [61]:
#fitting the model on the dataset

dtpredictor = dtc.fit(training)

In [62]:
#transforming the model

test_results = dtpredictor.transform(testing)

In [63]:
#viewing the results of the test data
test_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(78306,[0,1,5,7,1...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[0,3,6,51,...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[0,4,16,31...|[186.0,51.0,8.0,2...|[0.37575757575757...|       3.0|
|  0.0|(78306,[0,18,51,5...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[2,18,92,2...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[5,16,19,1...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[5,16,50,1...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[36,102,13...|[3008.0,2728.0,13...|[0.28710508733416...|       0.0|
|  0.0|(78306,[45,262,26...|[1521.0,1356.0,22...|[0.25575920632251...|       2.0|
|  1.0|(78306,[0

### Importing Evaluator 

In [64]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 

In [65]:
#instantiating the classfication evaluator
acc_eval=MulticlassClassificationEvaluator()

In [66]:
#evaluating the accuracy of the model
acc=acc_eval.evaluate(test_results)

In [67]:
print ("Accuracy of the model is::", acc)

Accuracy of the model is:: 0.2921386154999538
