# Investigating bias in sentiment analysis using Twitter data
### Team member- Divya Damahe, Avani Patel, Andrew Cummings, Bruno De Paula Luiz

### Sentiment Analysis Using TFIDF for Elastic Net Regularized Logistic Regression

#### For this notebook the file "'fulldatasetT.csv'" was used as the input file, already cleaned. If you were to download the data from the original site, go to https://ieee-dataport.org/open-access/coronavirus-covid-19-geo-tagged-tweets-dataset, hydrate the tweets using a Tweet Hydrator, and pass them through the 'tweetProcess' file, which will join the polarity scores with the tweet text. Please refer to tweetProcess for the dates of the tweets used as well

In [23]:
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
import numpy as np
# dataframe functions
from pyspark.sql import functions as fn
from __future__ import division
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
import pandas as pd
from pyspark.ml.feature import RegexTokenizer
# we obtain the stop words from a website
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
stop_words[0:10]
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import ParamGridBuilder
import seaborn
from pyspark.ml import feature
from pyspark.sql import types

In [24]:
#additional stop words to add
stop_words.append('https')
stop_words.append('t')

## Turn all scores into positive and negative

In [25]:
#scenario 1: turn all scores into positive and negative

In [26]:
fullDFP=pd.read_csv('fulldatasetT.csv',dtype={'polarity': float})

In [27]:
fullDFP=fullDFP.loc[:,['id','text','place','polarity']]

In [28]:
fullDFP['score']=0

In [29]:
#data_df.loc[data_df["mean radius"] > 12.0, "mean radius"] = 0
fullDFP.loc[fullDFP["polarity"] > 0.0, 'score'] = 1

In [30]:
fullDFP

Unnamed: 0,id,text,place,polarity,score
0,1.29E+18,Yoga Instructor🧘‍♀️ @bare_table rocks it Frida...,"Kentucky, USA",0.1767,1
1,1.29E+18,Coz a nice stroll in the city is like a ghost ...,"Sydney, New South Wales",0.3333,1
2,1.29E+18,NEW! #Facemasks in my Society6 #Shop 👉 https:/...,Martinique,0.1705,1
3,1.29E+18,"I’m not saying I’m bored, but I have invented ...","Boise, ID",-0.1356,0
4,1.29E+18,Fighting Stigma: NPHET discuss regional lockdo...,"Mascouche, Québec",0.0000,0
...,...,...,...,...,...
151575,1.37E+18,Call us. We’ll keep you safe. Always wear a ma...,"Boca Raton, FL",0.5000,1
151576,1.37E+18,What is #themoment you knew the pandemic was c...,"Manhattan, NY",-0.1519,0
151577,1.37E+18,I’ve had just about every post-vaccine side ef...,"Sapulpa, OK",0.0958,1
151578,1.37E+18,Updated: Closure on #SouthernStateParkway EB a...,"North Valley Stream, NY",0.0000,0


In [32]:
#schema for dataset from pandas
from pyspark.sql.types import *

mySchema = StructType([StructField("id", StringType(), True)\

                       ,StructField("text", StringType(), True)\

                       ,StructField("place", StringType(), True)\

                       ,StructField("polarity", FloatType(), True)\
                       
                       ,StructField("score", IntegerType(), True)])

In [34]:
fullDf=spark.createDataFrame(fullDFP,schema=mySchema)

In [36]:
#displaying positive tweets
fullDf.where(fn.col('score')==1).show(10)

+--------+--------------------+--------------------+--------+-----+
|      id|                text|               place|polarity|score|
+--------+--------------------+--------------------+--------+-----+
|1.29E+18|Yoga Instructor🧘...|       Kentucky, USA|  0.1767|    1|
|1.29E+18|Coz a nice stroll...|Sydney, New South...|  0.3333|    1|
|1.29E+18|NEW! #Facemasks i...|          Martinique|  0.1705|    1|
|1.29E+18|Fighting Stigma: ...|   Mascouche, Québec|  0.1238|    1|
|1.29E+18|BON APPETITE

Eva...|     Bal Harbour, FL|  0.0111|    1|
|1.29E+18|We are probably m...|      Tennessee, USA|  0.0606|    1|
|1.29E+18|Great #summer nig...|        Pasadena, CA|     0.6|    1|
|1.29E+18|Waited all day to...|         Phoenix, AZ|  0.2964|    1|
|1.29E+18|Another new month...|  Navi Mumbai, India|  0.1811|    1|
|1.29E+18|@ChelseaFC

let's...|       Ogun, Nigeria|     0.5|    1|
+--------+--------------------+--------------------+--------+-----+
only showing top 10 rows



## Displaying counts of positive and negative tweets 

In [37]:
fullDf.groupBy('score').agg(fn.count('score')).show()

+-----+------------+
|score|count(score)|
+-----+------------+
|    1|       87090|
|    0|       64490|
+-----+------------+



## Perform Sentiment Analysis WITHOUT Elastic Net Regularization LR

In [38]:
imdb_reviews_df = spark.read.parquet('imdb_reviews_preprocessed.parquet')
sentiments_df=spark.read.parquet('sentiments.parquet')

In [39]:
#split into words 
tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("text")\
  .setOutputCol("words")
review_words_df = tokenizer.transform(fullDf)


In [40]:
review_words_df.show()

+--------+--------------------+--------------------+--------+-----+--------------------+
|      id|                text|               place|polarity|score|               words|
+--------+--------------------+--------------------+--------+-----+--------------------+
|1.29E+18|Yoga Instructor🧘...|       Kentucky, USA|  0.1767|    1|[yoga, instructor...|
|1.29E+18|Coz a nice stroll...|Sydney, New South...|  0.3333|    1|[coz, a, nice, st...|
|1.29E+18|NEW! #Facemasks i...|          Martinique|  0.1705|    1|[new, facemasks, ...|
|1.29E+18|I’m not saying I’...|           Boise, ID| -0.1356|    0|[i, m, not, sayin...|
|1.29E+18|Fighting Stigma: ...|   Mascouche, Québec|     0.0|    0|[fighting, stigma...|
|1.29E+18|Fighting Stigma: ...|   Mascouche, Québec| -0.3125|    0|[fighting, stigma...|
|1.29E+18|Fighting Stigma: ...|   Mascouche, Québec|  0.1238|    1|[fighting, stigma...|
|1.29E+18|Them COVID nights...|Vandenberg Villag...|     0.0|    0|[them, covid, nig...|
|1.29E+18|Shoot after 

In [41]:
tweet_words_sentiment_df = review_words_df.\
    select('id', fn.explode('words').alias('word')).\
    join(sentiments_df, 'word')
tweet_words_sentiment_df.show(5)

+----------+--------+---------+
|      word|      id|sentiment|
+----------+--------+---------+
|confidence|1.29E+18|        1|
|    strong|1.29E+18|        1|
|      nice|1.29E+18|        1|
|      like|1.29E+18|        1|
|protection|1.29E+18|        1|
+----------+--------+---------+
only showing top 5 rows



In [45]:
simple_sentiment_prediction_df = tweet_words_sentiment_df.\
    groupBy('id').\
    agg(fn.avg('sentiment').alias('avg_sentiment')).\
    withColumn('predicted', fn.when(fn.col('avg_sentiment') > 0, 1.0).otherwise(0.))
simple_sentiment_prediction_df.show(5)

+--------+-------------------+---------+
|      id|      avg_sentiment|predicted|
+--------+-------------------+---------+
|1.31E+18|0.13998100664767332|      1.0|
|1.32E+18| 0.1479426096372496|      1.0|
|1.30E+18| 0.3137938922566625|      1.0|
|1.36E+18|0.12607260726072608|      1.0|
|1.37E+18|0.22395476353666896|      1.0|
+--------+-------------------+---------+
only showing top 5 rows



In [46]:
#display accuracy of the sentiment analysis 
fullDf.\
    join(simple_sentiment_prediction_df, 'id').\
    select(fn.expr('float(score = predicted)').alias('correct')).\
    select(fn.avg('correct')).\
    show()

+------------------+
|      avg(correct)|
+------------------+
|0.5745594648264578|
+------------------+



## Logistic Regression with elastic net regularization

In [47]:
#stop word remover
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")


# inital pipeline to look at the size of our vocabulary
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(fullDf)
# now we can make the transformation between the raw text and the counts
cv_pipeline.transform(fullDf)


DataFrame[id: string, text: string, place: string, polarity: float, score: int, words: array<string>, filtered: array<string>, tf: vector]

In [48]:
#size of the vocab
len(cv_pipeline.stages[-1].vocabulary)

31661

In [49]:
#main idf pipeline with LR
idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')
lr = LogisticRegression().\
    setLabelCol('score').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)
idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(fullDf)


In [50]:
#split set appropriately
training_df, validation_df, testing_df = fullDf.randomSplit([0.6, 0.3, 0.1], seed=0)

In [51]:
lr_pipeline = Pipeline(stages=[idf_pipeline, lr]).fit(training_df)

## Performance and evaluation 

In [52]:
lr_pipeline.transform(validation_df).\
    select(fn.expr('float(prediction = score)').alias('correct')).\
    select(fn.avg('correct')).show()

+------------------+
|      avg(correct)|
+------------------+
|0.7764438461368702|
+------------------+



In [53]:
vocabulary = idf_pipeline.stages[0].stages[-1].vocabulary
weights = lr_pipeline.stages[-1].coefficients.toArray()
coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': weights})

#### Most negatively weighted words, with large weighted values as we have not applied elastic net yet

In [54]:
coeffs_df.sort_values('weight').head(10)

Unnamed: 0,word,weight
5201,devastating,-24.53163
27370,hapiness,-22.211299
5072,awful,-21.70404
30500,shutterstock,-20.404987
24886,crooked,-19.937877
30434,burdens,-19.716803
21436,forecasting,-18.989855
17237,turtles,-18.900695
10792,terrifying,-18.582047
14375,fearful,-18.426585


#### Model with elastic net regularization

In [55]:
lambda_par = 0.02
alpha_par = 0.3
en_lr = LogisticRegression().\
        setLabelCol('score').\
        setFeaturesCol('tfidf').\
        setRegParam(lambda_par).\
        setMaxIter(100).\
        setElasticNetParam(alpha_par)
en_lr_estimator = Pipeline(
    stages=[tokenizer, sw_filter, cv, idf, en_lr])
en_lr_pipeline = en_lr_estimator.fit(training_df)

In [56]:
en_lr_pipeline.transform(validation_df).select(fn.avg(fn.expr('float(prediction = score)'))).show()

+--------------------------------+
|avg(float((prediction = score)))|
+--------------------------------+
|              0.8710965947961954|
+--------------------------------+



#### Most negatively weighted words after El-net

In [58]:
en_weights = en_lr_pipeline.stages[-1].coefficients.toArray()
en_coeffs_df = pd.DataFrame({'word': en_lr_pipeline.stages[2].vocabulary, 'weight': en_weights})
en_coeffs_df.sort_values('weight').head(15)

Unnamed: 0,word,weight
1317,worst,-0.2183
443,bad,-0.20135
1668,hate,-0.182759
496,crazy,-0.178315
2032,stupid,-0.170983
1136,sick,-0.170353
986,cold,-0.165276
441,soda,-0.162481
6755,goldkilos,-0.151609
997,sad,-0.127827


#### Most positively weighted words after El-net

In [59]:
en_coeffs_df.sort_values('weight', ascending=False).head(15)

Unnamed: 0,word,weight
5,new,0.648823
28,happy,0.604511
32,good,0.561014
25,love,0.544115
39,great,0.530618
88,best,0.527535
29,safe,0.512404
106,beautiful,0.476345
92,free,0.417571
146,available,0.409601


### Percentage of words that had 0 weight in our final model

In [60]:
en_coeffs_df.query('weight == 0.0').shape
en_coeffs_df.query('weight == 0.0').shape[0]/en_coeffs_df.shape[0]

0.9807051637978901

In [32]:
#display some of those 0 weight words
en_coeffs_df.query('weight == 0.0').head(15)

Unnamed: 0,word,weight
0,covid,0.0
4,coronavirus,0.0
10,home,0.0
11,m,0.0
13,quarantine,0.0
14,health,0.0
17,stigma,0.0
18,mask,0.0
20,just,0.0
21,like,0.0


In [61]:
##fit several models varying the elastic net parameters 
grid = ParamGridBuilder().\
    addGrid(en_lr.regParam, [0., 0.01, 0.02]).\
    addGrid(en_lr.elasticNetParam, [0., 0.2, 0.4]).\
    build()
all_models = []
for j in range(len(grid)):
    print("Fitting model {}".format(j+1))
    model = en_lr_estimator.fit(training_df, grid[j])
    all_models.append(model)

Fitting model 1
Fitting model 2
Fitting model 3
Fitting model 4
Fitting model 5
Fitting model 6
Fitting model 7
Fitting model 8
Fitting model 9


In [62]:
# estimate the accuracy of each of them:
accuracies = [m.\
    transform(validation_df).\
    select(fn.avg(fn.expr('float(score = prediction)')).alias('accuracy')).\
    first().\
    accuracy for m in all_models]
accuracies

[0.8185951051574604,
 0.8185951051574604,
 0.8185951051574604,
 0.8496899344558957,
 0.899499040010593,
 0.8864122878644097,
 0.8480568490278728,
 0.8866109063624126,
 0.8571270937699997]

### Select model with highest Accuracy

In [63]:
best_model_idx = np.argmax(accuracies)

In [64]:
#display the parameters of our best model
grid[best_model_idx]

{Param(parent='LogisticRegression_8cde316ffe0f', name='regParam', doc='regularization parameter (>= 0).'): 0.01,
 Param(parent='LogisticRegression_8cde316ffe0f', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.2}

In [65]:
best_model = all_models[best_model_idx]
# estimate generalization performance
best_model.\
    transform(testing_df).\
    select(fn.avg(fn.expr('float(score = prediction)')).alias('accuracy')).\
    show()

+------------------+
|          accuracy|
+------------------+
|0.9022833453971179|
+------------------+



## Evaluation Metrics

In [66]:
#eval portion

evaluator=evaluation.BinaryClassificationEvaluator(labelCol='score')
AUC1=evaluator.evaluate(best_model.transform(testing_df))
print("Model 1 :", AUC1)

Model 1 : 0.9503176134309154


In [67]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.evaluation import MulticlassMetrics

In [68]:
##create a variable that contains our eval metrics
predictions = best_model.transform(testing_df)
preds_and_labels = predictions.select(['prediction','score']).withColumn('label', fn.col('score').cast(FloatType())).orderBy('prediction')

#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','label'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Final Results

In [69]:
print("Summary Stats")
print(np.swapaxes(np.flip(metrics.confusionMatrix().toArray()),1,0))
print("Precision =",metrics.precision(1.0))
print("Recall =",metrics.recall(1.0))
print("F1 Score for positive tweets",metrics.fMeasure(1.0) )
print("F1 Score for negative tweets",metrics.fMeasure(0.0) )

Summary Stats
[[7750.  522.]
 [ 963. 5962.]]
Precision = 0.9368955512572534
Recall = 0.8894754963847125
F1 Score for positive tweets 0.9125699146305564
F1 Score for negative tweets 0.889253486464315


## EEC corpus as input for the Best LR model

In [70]:
#Loading Equity Evaluation Corpus to use as testing data
eec =pd.read_csv('Equity-Evaluation-Corpus.csv')

In [71]:
eec

Unnamed: 0,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word
0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed
...,...,...,...,...,...,...,...,...
8635,2018-En-mystery-12020,The conversation with my mom was funny.,The conversation with <person object> was <emo...,my mom,female,,joy,funny
8636,2018-En-mystery-14529,The conversation with my mom was hilarious.,The conversation with <person object> was <emo...,my mom,female,,joy,hilarious
8637,2018-En-mystery-16746,The conversation with my mom was amazing.,The conversation with <person object> was <emo...,my mom,female,,joy,amazing
8638,2018-En-mystery-00046,The conversation with my mom was wonderful.,The conversation with <person object> was <emo...,my mom,female,,joy,wonderful


In [72]:
#Dropping other columns
eec_test = eec.drop(["ID", "Template", "Person", "Gender", "Race", "Emotion", "Emotion word"], axis= 1)

In [73]:
#eec_test_spark= spark.createDataFrame(eec)


In [74]:
#Converting the pandas df to a spark df
eec_test_spark= spark.createDataFrame(eec_test)

#renaming the input column as "Sentence" from "text"
eec_test_spark = eec_test_spark.withColumnRenamed("sentence","text")

In [75]:
predictions_lr_t = best_model.transform(training_df)
predictions_lr_t.show()

+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      id|                text|               place|polarity|score|               words|            filtered|                  tf|               tfidf|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|1.29E+18|!! Khaddi Georget...|       Mumbai, India|     0.5|    1|[khaddi, georgett...|[khaddi, georgett...|(21612,[23,114,25...|(21612,[23,114,25...|[-0.5654271221116...|[0.36229266069669...|       1.0|
|1.29E+18|!! sigalert !! a ...|          Corona, CA|  0.1429|    1|[sigalert, a, cra...|[sigalert, crash,...|(21612,[16,47,117...|(21612,[16,47,117...|[-0.8939347319445...|[0.29029

In [76]:
#Tranforming the EEC corpus in the Best model
predictions_lr = best_model.transform(eec_test_spark)
predictions.show()

+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|      id|                text|               place|polarity|score|               words|            filtered|                  tf|               tfidf|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|1.29E+18|"Covid 19" came ....|      Trissur, India|     0.0|    0|[covid, came, is,...|[covid, came, ind...|(21612,[0,1,37,43...|(21612,[0,1,37,43...|[1.45590849081750...|[0.81090609133168...|       0.0|
|1.29E+18|"Due to the curre...| Carmona, Calabarzon| -0.0313|    0|[due, to, the, cu...|[current, covid, ...|(21612,[0,69,118,...|(21612,[0,69,118,...|[2.08600881844618...|[0.88953

In [77]:
lr_raw_predictions_df = predictions_lr.select("text", "rawPrediction","prediction")

In [78]:
#converting to pandas dataframe
lr_raw_predictions_df = lr_raw_predictions_df.toPandas()

### Random Forest

In [79]:
# Preload packages
from pyspark.sql import SparkSession
from pyspark.ml import feature, regression, evaluation, Pipeline
from pyspark.sql import functions as fn, Row
import matplotlib.pyplot as plt
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# dataframe functions
from pyspark.sql import functions as fn
import os
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
import requests
stop_words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words').text.split()
stop_words[0:10]
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import IDF
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pandas as pd
from pyspark.sql.types import *

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [80]:
###import dataset

fullDFP=pd.read_csv('fulldatasetT.csv',dtype={'polarity': float})
fullDFP=fullDFP.loc[:,['id','text','place','polarity']]
fullDFP['score']=0
fullDFP.loc[fullDFP["polarity"] > 0.0, 'score'] = 1

mySchema = StructType([StructField("id", StringType(), True)\

                       ,StructField("text", StringType(), True)\

                       ,StructField("place", StringType(), True)\

                       ,StructField("polarity", FloatType(), True)\
                       
                       ,StructField("score", IntegerType(), True)])
fullDf=spark.createDataFrame(fullDFP,schema=mySchema)

In [81]:
sentiments_df=spark.read.parquet('sentiments.parquet')

In [82]:
fullDf=fullDf.withColumnRenamed('score','label')

In [83]:
fullDf.describe().show()

+-------+-----------+--------------------+--------------------+--------+------------------+
|summary|         id|                text|               place|polarity|             label|
+-------+-----------+--------------------+--------------------+--------+------------------+
|  count|     151580|              151580|              151580|  151580|            151580|
|   mean|        NaN|                 NaN|                 NaN|     NaN| 0.574548093416018|
| stddev|        NaN|                 NaN|                 NaN|     NaN|0.4944129796127013|
|    min|   1.27E+18| https://t.co/Xjr...|'s-Hertogenbosch,...|    -1.0|                 0|
|    max|Rome, Lazio|🪂[Everything you...|Эльбрусский район...|     NaN|                 1|
+-------+-----------+--------------------+--------------------+--------+------------------+



In [84]:
training_df, validation_df, testing_df = fullDf.randomSplit([0.6, 0.3, 0.1])

In [85]:
tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("text")\
  .setOutputCol("words")
review_words_df = tokenizer.transform(fullDf)
tweet_words_sentiment_df = review_words_df.\
    select('id', fn.explode('words').alias('word')).\
    join(sentiments_df, 'word')
tweet_words_sentiment_df.show(5)

sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17)\
  .setInputCol("filtered")\
  .setOutputCol("tf")

# we now create a pipelined transformer
cv_pipeline = Pipeline(stages=[tokenizer, sw_filter, cv]).fit(fullDf)

idf = IDF().\
    setInputCol('tf').\
    setOutputCol('tfidf')
lr = LogisticRegression().\
    setLabelCol('label').\
    setFeaturesCol('tfidf').\
    setRegParam(0.0).\
    setMaxIter(100).\
    setElasticNetParam(0.)
idf_pipeline = Pipeline(stages=[cv_pipeline, idf]).fit(fullDf)

+----------+--------+---------+
|      word|      id|sentiment|
+----------+--------+---------+
|confidence|1.29E+18|        1|
|    strong|1.29E+18|        1|
|      nice|1.29E+18|        1|
|      like|1.29E+18|        1|
|protection|1.29E+18|        1|
+----------+--------+---------+
only showing top 5 rows



In [86]:
rf = RandomForestClassifier().setLabelCol('label').\
    setFeaturesCol('tfidf')
rf_pipeline = Pipeline(stages=[idf_pipeline, rf]).fit(training_df)

In [87]:
bce = BinaryClassificationEvaluator()
bce.evaluate(rf_pipeline.transform(validation_df))

0.6661050064488603

In [88]:
predictions = rf_pipeline.transform(testing_df)
predictions.filter(predictions['prediction'] == 0) \
    .select("text",'probability',"label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                          text|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save...|[0.6413883477019112,0.35861...|    0|       0.0|
|❤❤ I challenge you to #Save

In [90]:
predictions_rf = rf_pipeline.transform(eec_test_spark)

In [91]:
predictions_rf.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|                text|               words|            filtered|                  tf|               tfidf|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| Alonzo feels angry.|[alonzo, feels, a...|[alonzo, feels, a...|(31663,[564,5497]...|(31663,[564,5497]...|[8.58131452095296...|[0.42906572604764...|       1.0|
|Alonzo feels furi...|[alonzo, feels, f...|[alonzo, feels, f...|(31663,[564,23788...|(31663,[564,23788...|[8.58131452095296...|[0.42906572604764...|       1.0|
|Alonzo feels irri...|[alonzo, feels, i...|[alonzo, feels, i...|(31663,[564,21194...|(31663,[564,21194...|[8.58131452095296...|[0.42906572604764...|       1.0|
|Alonzo feels enra...|[alonzo, feels, e.

In [92]:
predictions.filter(predictions['prediction'] == 0) \
    .select ("filtered", "prediction", "probability", "rawPrediction", "text", "tf", "tfidf", "words")\
    .show(n = 10, truncate = 30)

+------------------------------+----------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|                      filtered|prediction|                   probability|                 rawPrediction|                          text|                            tf|                         tfidf|                         words|
+------------------------------+----------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+
|[angles, progress, covid, k...|       0.0|[0.5144604101287003,0.48553...|[10.289208202574008,9.71079...|Angles in progress. #covid1...|(31663,[0,1,2,13,106,124,21...|(31663,[0,1,2,13,106,124,21...|[angles, in, progress, covi...|
|[bridgepoint, health, film,...|       0.0|[0.5144604101287003,0.48553...|[10.28

In [93]:
predictions.describe().show()

+-------+--------+--------------------+--------------------+--------+------------------+-------------------+
|summary|      id|                text|               place|polarity|             label|         prediction|
+-------+--------+--------------------+--------------------+--------+------------------+-------------------+
|  count|   15078|               15078|               15078|   15078|             15078|              15078|
|   mean|     NaN|                 NaN|                 NaN|     NaN|0.5718928239819605|  0.990449661758854|
| stddev|     NaN|                 NaN|                 NaN|     NaN|0.4948208368650665|0.09726128041853342|
|    min|1.27E+18|!! Client Diaries...|'s-Hertogenbosch,...|    -1.0|                 0|                0.0|
|    max|     NaN|🧸 Teddy Bears’ P...|Кисловодск, Ставр...|     NaN|                 1|                1.0|
+-------+--------+--------------------+--------------------+--------+------------------+-------------------+



## Naive Bayes

In [158]:
#tokenizer
from pyspark.ml.feature import StringIndexer

tokenizer = RegexTokenizer().setGaps(False)\
  .setPattern("\\p{L}+")\
  .setInputCol("text")\
  .setOutputCol("words")

review_words_df = tokenizer.transform(fullDf)

tweet_words_sentiment_df = review_words_df.\
    select('id', fn.explode('words').alias('word')).\
    join(sentiments_df, 'word')
tweet_words_sentiment_df.show(5)

#stop words
sw_filter = StopWordsRemover()\
  .setStopWords(stop_words)\
  .setCaseSensitive(False)\
  .setInputCol("words")\
  .setOutputCol("filtered")

#convert label to numeric using binarizer

indexer = StringIndexer(inputCol="label", outputCol="label_numeric")


# we will remove words that appear in 5 docs or less
cv = CountVectorizer(minTF=1., minDF=5., vocabSize=2**17).setInputCol("filtered").setOutputCol("tf")

# 4. Vectorise features using vectorassembler
vecAssembler = VectorAssembler(inputCols=['label_numeric'], outputCol="features")


# we now create a pipelined transformer
nb_pipeline = Pipeline(stages=[tokenizer,sw_filter,cv,indexer,vecAssembler])

+----------+--------+---------+
|      word|      id|sentiment|
+----------+--------+---------+
|confidence|1.29E+18|        1|
|    strong|1.29E+18|        1|
|      nice|1.29E+18|        1|
|      like|1.29E+18|        1|
|protection|1.29E+18|        1|
+----------+--------+---------+
only showing top 5 rows



In [160]:
from pyspark.ml.classification import NaiveBayes
# Initialise the model
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
# Fit the model
model = nb.fit(training_df)
# Make predictions on test data
predictions = model.transform(training_df)
predictions.select("label", "prediction", "probability").show()

IllegalArgumentException: features does not exist. Available: id, text, place, polarity, label

In [97]:
#Fit the pipeline to transform the data
training_df_sp = nb_pipeline.transform(training_df)

AttributeError: 'Pipeline' object has no attribute 'transform'

In [174]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")

accuracy = evaluator.evaluate(predictions)

print ("Model AUC: ", accuracy)

Model AUC:  0.7862369298320915


In [175]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid and Evaluator for Cross Validation
paramGrid = ParamGridBuilder().addGrid(nb.smoothing, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0, 1.5, 2.0]).build()
cvEvaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")

In [203]:
training_df.show()

+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+-------------+--------------------+
|      id|                text|               place|polarity|label|               words|            filtered|                  tf|label_numeric|            features|
+--------+--------------------+--------------------+--------+-----+--------------------+--------------------+--------------------+-------------+--------------------+
|1.29E+18| https://t.co/Xjr...|        Evry, France|     0.0|    0|[https, t, co, xj...|[https, t, xjrcwj...|(31663,[0,1,2,28,...|          1.0|(31663,[0,1,2,28,...|
|1.29E+18|!! sigalert !! a ...|          Corona, CA|  0.1429|    1|[sigalert, a, cra...|[sigalert, crash,...|(31663,[0,1,18,53...|          0.0|(31663,[0,1,18,53...|
|1.29E+18|" Winner's are ne...|  Navi Mumbai, India|  0.1841|    1|[winner, s, are, ...|[winner, s, quit,...|(31663,[0,1,3,7,1...|          0.0|(31663,[0,1,3,7,1...|
|1.2

In [176]:
# Run Cross-validation
cv = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(training_df)

# Make predictions on testData. cvModel uses the bestModel.
cvPredictions = cvModel.transform(testing_df)

# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

0.7862369298320915

In [177]:
# Run Cross-validation
cv = CrossValidator(estimator=nb, estimatorParamMaps=paramGrid, evaluator=cvEvaluator)
cvModel = cv.fit(training_df)

In [178]:
# Make predictions on testData. cvModel uses the bestModel.

cvPredictions = cvModel.transform(testing_df)

In [None]:
# Evaluate bestModel found from Cross Validation
evaluator.evaluate(cvPredictions)

In [None]:
#confusion matrix

predictions= cvModel.transform(testing_df)

#predictions = best_model.transform(testing_df)

predictions.columns

In [None]:
#important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = predictions.select(['prediction','label']).withColumn('label', fn.col('label').cast(FloatType())).orderBy('prediction')

In [None]:
#select only prediction and label columns

from pyspark.ml.classification import DecisionTreeClassifier 
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

preds_and_labels = preds_and_labels.select(['prediction','label'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

In [None]:
print("Summary Stats")
print(np.swapaxes(np.flip(metrics.confusionMatrix().toArray()),1,0))
print("Precision =",metrics.precision(1.0))
print("Recall =",metrics.recall(1.0))
print("F1 Score for positive tweets",metrics.fMeasure(1.0) )
print("F1 Score for negative tweets",metrics.fMeasure(0.0) )

In [157]:
#End of Naive bayes

In [None]:
lr_raw_predictions_df = predictions_lr.select("text", "rawPrediction","prediction")

In [None]:
lr_raw_predictions_df = lr_raw_predictions_df.toPandas()

In [None]:
rf_raw_predictions_df["id"] = rf_raw_predictions_df.index +1

#### end of eec on naive bayes

#### Scaling the Raw prediction values and Creating a dataframe 

In [113]:
rf_raw_predictions_df = predictions_rf.select("text", "rawPrediction","prediction")

In [114]:
rf_raw_predictions_df = rf_raw_predictions_df.toPandas()

#### Adding index column to meger the dataframes in later cells

In [115]:
rf_raw_predictions_df["id"] = rf_raw_predictions_df.index +1

In [116]:
lr_raw_predictions_df["id"] = lr_raw_predictions_df.index +1

In [117]:
new_df=lr_raw_predictions_df.merge(rf_raw_predictions_df, on= "id")

In [118]:
new_df

Unnamed: 0,text_x,rawPrediction_x,prediction_x,id,text_y,rawPrediction_y,prediction_y
0,Alonzo feels angry.,"[1.758761681156161, -1.758761681156161]",0.0,1,Alonzo feels angry.,"[8.581314520952962, 11.418685479047037]",1.0
1,Alonzo feels furious.,"[1.1883767559140836, -1.1883767559140836]",0.0,2,Alonzo feels furious.,"[8.581314520952962, 11.418685479047037]",1.0
2,Alonzo feels irritated.,"[1.1883767559140836, -1.1883767559140836]",0.0,3,Alonzo feels irritated.,"[8.581314520952962, 11.418685479047037]",1.0
3,Alonzo feels enraged.,"[1.1883767559140836, -1.1883767559140836]",0.0,4,Alonzo feels enraged.,"[8.581314520952962, 11.418685479047037]",1.0
4,Alonzo feels annoyed.,"[1.1883767559140836, -1.1883767559140836]",0.0,5,Alonzo feels annoyed.,"[8.581314520952962, 11.418685479047037]",1.0
...,...,...,...,...,...,...,...
8635,The conversation with my mom was funny.,"[-0.26419630122863746, 0.26419630122863746]",1.0,8636,The conversation with my mom was funny.,"[8.581314520952962, 11.418685479047037]",1.0
8636,The conversation with my mom was hilarious.,"[0.47931264253825856, -0.47931264253825856]",0.0,8637,The conversation with my mom was hilarious.,"[8.581314520952962, 11.418685479047037]",1.0
8637,The conversation with my mom was amazing.,"[-1.2568624618888382, 1.2568624618888382]",1.0,8638,The conversation with my mom was amazing.,"[8.171052346622254, 11.828947653377746]",1.0
8638,The conversation with my mom was wonderful.,"[-1.1733344977347269, 1.1733344977347269]",1.0,8639,The conversation with my mom was wonderful.,"[8.581314520952962, 11.418685479047037]",1.0


In [119]:
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
from sklearn import preprocessing

x = new_df.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
new_df = pd.new_df(x_scaled)

In [121]:
new_df["new_val1"] = 0

In [122]:
for i in range(len(new_df)):
    new_df.iloc[i,7]= new_df["rawPrediction_x"][i][0]

In [123]:
new_df["new_val2"] = 0

In [124]:
for i in range(len(new_df)):
    new_df.iloc[i,8]= new_df["rawPrediction_y"][i][0]

In [125]:
new_df

Unnamed: 0,text_x,rawPrediction_x,prediction_x,id,text_y,rawPrediction_y,prediction_y,new_val1,new_val2
0,Alonzo feels angry.,"[1.758761681156161, -1.758761681156161]",0.0,1,Alonzo feels angry.,"[8.581314520952962, 11.418685479047037]",1.0,1.758762,8.581315
1,Alonzo feels furious.,"[1.1883767559140836, -1.1883767559140836]",0.0,2,Alonzo feels furious.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315
2,Alonzo feels irritated.,"[1.1883767559140836, -1.1883767559140836]",0.0,3,Alonzo feels irritated.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315
3,Alonzo feels enraged.,"[1.1883767559140836, -1.1883767559140836]",0.0,4,Alonzo feels enraged.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315
4,Alonzo feels annoyed.,"[1.1883767559140836, -1.1883767559140836]",0.0,5,Alonzo feels annoyed.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315
...,...,...,...,...,...,...,...,...,...
8635,The conversation with my mom was funny.,"[-0.26419630122863746, 0.26419630122863746]",1.0,8636,The conversation with my mom was funny.,"[8.581314520952962, 11.418685479047037]",1.0,-0.264196,8.581315
8636,The conversation with my mom was hilarious.,"[0.47931264253825856, -0.47931264253825856]",0.0,8637,The conversation with my mom was hilarious.,"[8.581314520952962, 11.418685479047037]",1.0,0.479313,8.581315
8637,The conversation with my mom was amazing.,"[-1.2568624618888382, 1.2568624618888382]",1.0,8638,The conversation with my mom was amazing.,"[8.171052346622254, 11.828947653377746]",1.0,-1.256862,8.171052
8638,The conversation with my mom was wonderful.,"[-1.1733344977347269, 1.1733344977347269]",1.0,8639,The conversation with my mom was wonderful.,"[8.581314520952962, 11.418685479047037]",1.0,-1.173334,8.581315


In [126]:
from sklearn.preprocessing import MinMaxScaler

In [127]:
scaler = MinMaxScaler()

In [128]:
new_df_to_be = new_df.loc[:, ["new_val1","new_val2"]]

In [129]:
new_df_to_be

Unnamed: 0,new_val1,new_val2
0,1.758762,8.581315
1,1.188377,8.581315
2,1.188377,8.581315
3,1.188377,8.581315
4,1.188377,8.581315
...,...,...
8635,-0.264196,8.581315
8636,0.479313,8.581315
8637,-1.256862,8.171052
8638,-1.173334,8.581315


In [130]:
df_scaled = pd.DataFrame(scaler.fit_transform(new_df_to_be),columns=new_df_to_be.columns )

In [131]:
df_scaled

Unnamed: 0,new_val1,new_val2
0,0.524358,1.000000
1,0.443084,1.000000
2,0.443084,1.000000
3,0.443084,1.000000
4,0.443084,1.000000
...,...,...
8635,0.236106,1.000000
8636,0.342049,1.000000
8637,0.094661,0.037955
8638,0.106563,1.000000


In [132]:
df_scaled["id"] = df_scaled.index +1

In [133]:
df_scaled

Unnamed: 0,new_val1,new_val2,id
0,0.524358,1.000000,1
1,0.443084,1.000000,2
2,0.443084,1.000000,3
3,0.443084,1.000000,4
4,0.443084,1.000000,5
...,...,...,...
8635,0.236106,1.000000,8636
8636,0.342049,1.000000,8637
8637,0.094661,0.037955,8638
8638,0.106563,1.000000,8639


In [134]:
final_df=new_df.merge(df_scaled, on= "id")

In [135]:
final_df

Unnamed: 0,text_x,rawPrediction_x,prediction_x,id,text_y,rawPrediction_y,prediction_y,new_val1_x,new_val2_x,new_val1_y,new_val2_y
0,Alonzo feels angry.,"[1.758761681156161, -1.758761681156161]",0.0,1,Alonzo feels angry.,"[8.581314520952962, 11.418685479047037]",1.0,1.758762,8.581315,0.524358,1.000000
1,Alonzo feels furious.,"[1.1883767559140836, -1.1883767559140836]",0.0,2,Alonzo feels furious.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,1.000000
2,Alonzo feels irritated.,"[1.1883767559140836, -1.1883767559140836]",0.0,3,Alonzo feels irritated.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,1.000000
3,Alonzo feels enraged.,"[1.1883767559140836, -1.1883767559140836]",0.0,4,Alonzo feels enraged.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,1.000000
4,Alonzo feels annoyed.,"[1.1883767559140836, -1.1883767559140836]",0.0,5,Alonzo feels annoyed.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,1.000000
...,...,...,...,...,...,...,...,...,...,...,...
8635,The conversation with my mom was funny.,"[-0.26419630122863746, 0.26419630122863746]",1.0,8636,The conversation with my mom was funny.,"[8.581314520952962, 11.418685479047037]",1.0,-0.264196,8.581315,0.236106,1.000000
8636,The conversation with my mom was hilarious.,"[0.47931264253825856, -0.47931264253825856]",0.0,8637,The conversation with my mom was hilarious.,"[8.581314520952962, 11.418685479047037]",1.0,0.479313,8.581315,0.342049,1.000000
8637,The conversation with my mom was amazing.,"[-1.2568624618888382, 1.2568624618888382]",1.0,8638,The conversation with my mom was amazing.,"[8.171052346622254, 11.828947653377746]",1.0,-1.256862,8.171052,0.094661,0.037955
8638,The conversation with my mom was wonderful.,"[-1.1733344977347269, 1.1733344977347269]",1.0,8639,The conversation with my mom was wonderful.,"[8.581314520952962, 11.418685479047037]",1.0,-1.173334,8.581315,0.106563,1.000000


In [136]:
eec["id"] = eec.index +1

In [148]:
final_df=final_df.merge(eec, on= "id")

In [154]:
final_df

Unnamed: 0,text_x,rawPrediction_x,prediction_x,id,text_y,rawPrediction_y,prediction_y,new_val1_x,new_val2_x,new_val1_y,...,Emotion_x,Emotion word_x,ID_y,Sentence_y,Template_y,Person_y,Gender_y,Race_y,Emotion_y,Emotion word_y
0,Alonzo feels angry.,"[1.758761681156161, -1.758761681156161]",0.0,1,Alonzo feels angry.,"[8.581314520952962, 11.418685479047037]",1.0,1.758762,8.581315,0.524358,...,anger,angry,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,Alonzo feels furious.,"[1.1883767559140836, -1.1883767559140836]",0.0,2,Alonzo feels furious.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,...,anger,furious,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,Alonzo feels irritated.,"[1.1883767559140836, -1.1883767559140836]",0.0,3,Alonzo feels irritated.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,...,anger,irritated,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,Alonzo feels enraged.,"[1.1883767559140836, -1.1883767559140836]",0.0,4,Alonzo feels enraged.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,...,anger,enraged,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,Alonzo feels annoyed.,"[1.1883767559140836, -1.1883767559140836]",0.0,5,Alonzo feels annoyed.,"[8.581314520952962, 11.418685479047037]",1.0,1.188377,8.581315,0.443084,...,anger,annoyed,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8635,The conversation with my mom was funny.,"[-0.26419630122863746, 0.26419630122863746]",1.0,8636,The conversation with my mom was funny.,"[8.581314520952962, 11.418685479047037]",1.0,-0.264196,8.581315,0.236106,...,joy,funny,2018-En-mystery-12020,The conversation with my mom was funny.,The conversation with <person object> was <emo...,my mom,female,,joy,funny
8636,The conversation with my mom was hilarious.,"[0.47931264253825856, -0.47931264253825856]",0.0,8637,The conversation with my mom was hilarious.,"[8.581314520952962, 11.418685479047037]",1.0,0.479313,8.581315,0.342049,...,joy,hilarious,2018-En-mystery-14529,The conversation with my mom was hilarious.,The conversation with <person object> was <emo...,my mom,female,,joy,hilarious
8637,The conversation with my mom was amazing.,"[-1.2568624618888382, 1.2568624618888382]",1.0,8638,The conversation with my mom was amazing.,"[8.171052346622254, 11.828947653377746]",1.0,-1.256862,8.171052,0.094661,...,joy,amazing,2018-En-mystery-16746,The conversation with my mom was amazing.,The conversation with <person object> was <emo...,my mom,female,,joy,amazing
8638,The conversation with my mom was wonderful.,"[-1.1733344977347269, 1.1733344977347269]",1.0,8639,The conversation with my mom was wonderful.,"[8.581314520952962, 11.418685479047037]",1.0,-1.173334,8.581315,0.106563,...,joy,wonderful,2018-En-mystery-00046,The conversation with my mom was wonderful.,The conversation with <person object> was <emo...,my mom,female,,joy,wonderful


In [140]:
eec.isnull().sum()

ID                 0
Sentence           0
Template           0
Person             0
Gender             0
Race            2880
Emotion          240
Emotion word     240
id                 0
dtype: int64

In [147]:
#final_df.to_csv("final_eec.csv")

In [161]:
final_df.groupby('Gender').agg({'new_val1_y': 'mean' , 'new_val2_y': 'mean'})
#the results show bias in gender

Unnamed: 0_level_0,new_val1_y,new_val2_y
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.449736,0.777712
male,0.447877,0.777712


In [162]:
final_df.groupby('Race').agg({'new_val1_y': 'mean' , 'new_val2_y': 'mean'})
#the results show bias in race
#the new_val1_y is avg scaled sentiment score from LR
#the new_val2_y is avg scaled sentiment score from random forest

Unnamed: 0_level_0,new_val1_y,new_val2_y
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
African-American,0.450365,0.777712
European,0.448027,0.777712


In [169]:
grouped_multiple = final_df.groupby(['Gender', 'Race']).agg({'new_val1_y': ['mean']})
grouped_multiple.columns = ['ag_mean']
grouped_multiple = grouped_multiple.reset_index()
grouped_multiple

Unnamed: 0,Gender,Race,ag_mean
0,female,African-American,0.453154
1,female,European,0.448027
2,male,African-American,0.447577
3,male,European,0.448027


In [165]:
grouped_multiple = final_df.groupby(['Gender', 'Race', 'Emotion']).agg({'new_val1_y': ['mean']})
grouped_multiple.columns = ['ag_mean']
grouped_multiple = grouped_multiple.reset_index()
grouped_multiple
#the below results show the gap between all emotions, race, and gender

Unnamed: 0,Gender,Race,Emotion,ag_mean
0,female,African-American,anger,0.482484
1,female,African-American,fear,0.575225
2,female,African-American,joy,0.199898
3,female,African-American,sadness,0.556666
4,female,European,anger,0.477358
5,female,European,fear,0.570099
6,female,European,joy,0.194771
7,female,European,sadness,0.55154
8,male,African-American,anger,0.476907
9,male,African-American,fear,0.569648


## End of code