In [1]:
#https://docs.python.org/3/library/math.html
import math

#https://numpy.org/doc/stable/reference/
import numpy as np

from scipy.stats import ttest_ind
import researchpy as rp

#https://pandas.pydata.org/docs/reference/index.html
import pandas as pd

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

import swifter

import json

from IPython.display import HTML

#https://seaborn.pydata.org/api.html
import seaborn as sns

In [2]:
nltk.set_proxy('http://sia-lb.telekom.de:8080')
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\A783703\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [34]:
def get_sentiment(text: str) -> str:
    compound = sia.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return "positive"
    elif compound <= -0.05:
        return "negative"
    else: 
        return "neutral"

In [35]:
def get_compound(text: str) -> float:
    return sia.polarity_scores(text)["compound"]

In [36]:
df = pd.read_json('output.manifest', lines=True)
df.convert_dtypes()

Unnamed: 0,source,sentiment,sentiment-metadata
0,"""""""It has to be part of the menu,"""" says @JimC...",2,"{'class-name': 'negative', 'job-name': 'labeli..."
1,"""This week, @intotheblock takes a dive into th...",0,"{'class-name': 'positive', 'job-name': 'labeli..."
2,"""Novogratz Predicts, #Bitcoin Price Will Hit $...",1,"{'class-name': 'neutral', 'job-name': 'labelin..."
3,"""#BTC rallies to extend Elon Musk Tesla gains,...",0,"{'class-name': 'positive', 'job-name': 'labeli..."
4,Not everyone is impressed by the electric vehi...,0,"{'class-name': 'positive', 'job-name': 'labeli..."
...,...,...,...
495,"""Nigeria’s 🇳🇬 peer-to-peer #bitcoin trading v...",0,"{'class-name': 'positive', 'job-name': 'labeli..."
496,Dubai-based investment company IBC Group plans...,1,"{'class-name': 'neutral', 'job-name': 'labelin..."
497,Are you all in on the #NBAChampionship? Get in...,0,"{'class-name': 'positive', 'job-name': 'labeli..."
498,"""New airdrop: Icarus Network (ICA) Total Rewar...",0,"{'class-name': 'positive', 'job-name': 'labeli..."


In [37]:
df_metadata=pd.json_normalize(df['sentiment-metadata'])

In [38]:
df_normal = pd.concat([df, df_metadata], axis = 1).convert_dtypes() #Zusammenführen der Daten

In [39]:
df_normal.head()

Unnamed: 0,source,sentiment,sentiment-metadata,class-name,job-name,confidence,type,human-annotated,creation-date
0,"""""""It has to be part of the menu,"""" says @JimC...",2,"{'class-name': 'negative', 'job-name': 'labeli...",negative,labeling-job/tweets,0.55,groundtruth/text-classification,yes,2022-09-09T16:49:52.147291
1,"""This week, @intotheblock takes a dive into th...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.56,groundtruth/text-classification,yes,2022-09-09T16:48:11.991534
2,"""Novogratz Predicts, #Bitcoin Price Will Hit $...",1,"{'class-name': 'neutral', 'job-name': 'labelin...",neutral,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:48:38.131990
3,"""#BTC rallies to extend Elon Musk Tesla gains,...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:53:23.023535
4,Not everyone is impressed by the electric vehi...,0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.53,groundtruth/text-classification,yes,2022-09-09T16:50:38.609277


In [40]:
df_normal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   source              500 non-null    string 
 1   sentiment           500 non-null    Int64  
 2   sentiment-metadata  500 non-null    object 
 3   class-name          500 non-null    string 
 4   job-name            500 non-null    string 
 5   confidence          500 non-null    float64
 6   type                500 non-null    string 
 7   human-annotated     500 non-null    string 
 8   creation-date       500 non-null    string 
dtypes: Int64(1), float64(1), object(1), string(6)
memory usage: 35.8+ KB


In [41]:
df_normal['vader-compound'] = df_normal.swifter.apply(lambda x: get_compound(x['source']), axis = 1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=500.0, style=ProgressStyle(description…




In [42]:
df_normal['vader-sentiment'] = df_normal.swifter.apply(lambda x: get_sentiment(x['source']), axis = 1)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=500.0, style=ProgressStyle(description…




In [43]:
df_normal.head()

Unnamed: 0,source,sentiment,sentiment-metadata,class-name,job-name,confidence,type,human-annotated,creation-date,vader-compound,vader-sentiment
0,"""""""It has to be part of the menu,"""" says @JimC...",2,"{'class-name': 'negative', 'job-name': 'labeli...",negative,labeling-job/tweets,0.55,groundtruth/text-classification,yes,2022-09-09T16:49:52.147291,0.0,neutral
1,"""This week, @intotheblock takes a dive into th...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.56,groundtruth/text-classification,yes,2022-09-09T16:48:11.991534,0.0,neutral
2,"""Novogratz Predicts, #Bitcoin Price Will Hit $...",1,"{'class-name': 'neutral', 'job-name': 'labelin...",neutral,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:48:38.131990,0.1779,positive
3,"""#BTC rallies to extend Elon Musk Tesla gains,...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:53:23.023535,0.4767,positive
4,Not everyone is impressed by the electric vehi...,0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.53,groundtruth/text-classification,yes,2022-09-09T16:50:38.609277,-0.3724,negative


In [44]:
sns.displot(df_normal, x="class-name")

AttributeError: module 'seaborn' has no attribute 'displot'

In [45]:
#Überprüfung der Ergebnisse aus dem Vader Sentiment Analyzer und der mit AWS Ground Truth gelabelten Daten
#Da es sich um zwei unabhängige Gruppen handelt => Ungepaarter t-Test

res = ttest_ind(df_normal['confidence'].values, df_normal['vader-compound'].values)

print(res)

Ttest_indResult(statistic=34.009126441624616, pvalue=5.637426042818657e-169)


In [10]:
summary, results = rp.ttest(group1= df_normal['confidence'][df_normal['class-name'] == 'positive'], group1_name= "groundtruth",
         group2= df_normal['vader-compound'][df_normal['vader-sentiment'] == 'positive'], group2_name= "vader",equal_variances = True, paired = False)

NameError: name 'df_normal' is not defined

In [58]:
print(summary)

      Variable      N      Mean        SD        SE  95% Conf.  Interval
0  groundtruth  266.0  0.691353  0.208565  0.012788   0.666175  0.716532
1        vader  191.0  0.447123  0.199694  0.014449   0.418621  0.475624
2     combined  457.0  0.589279  0.237565  0.011113   0.567440  0.611118


In [59]:
print(results)

                    Independent t-test   results
0  Difference (groundtruth - vader) =     0.2442
1                Degrees of freedom =   455.0000
2                                 t =    12.5673
3             Two side test p value =     0.0000
4            Difference < 0 p value =     1.0000
5            Difference > 0 p value =     0.0000
6                         Cohen's d =     1.1919
7                         Hedge's g =     1.1899
8                    Glass's delta1 =     1.1710
9                  Point-Biserial r =     0.5076


In [64]:
ttest_ind(df_normal['confidence'][df_normal['class-name'] == 'neutral'],
                df_normal['vader-compound'][df_normal['vader-sentiment'] == 'neutral'])

Ttest_indResult(statistic=56.16867677457272, pvalue=5.274211104566802e-179)

In [77]:
df_normal[df_normal['class-name'] != df_normal['vader-sentiment']]

Unnamed: 0,source,sentiment,sentiment-metadata,class-name,job-name,confidence,type,human-annotated,creation-date,vader-sentiment,vader-compound
0,"""""""It has to be part of the menu,"""" says @JimC...",2,"{'class-name': 'negative', 'job-name': 'labeli...",negative,labeling-job/tweets,-0.55,groundtruth/text-classification,yes,2022-09-09T16:49:52.147291,neutral,0.0000
1,"""This week, @intotheblock takes a dive into th...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.56,groundtruth/text-classification,yes,2022-09-09T16:48:11.991534,neutral,0.0000
2,"""Novogratz Predicts, #Bitcoin Price Will Hit $...",1,"{'class-name': 'neutral', 'job-name': 'labelin...",neutral,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:48:38.131990,positive,0.1779
4,Not everyone is impressed by the electric vehi...,0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.53,groundtruth/text-classification,yes,2022-09-09T16:50:38.609277,negative,-0.3724
5,"""#BTC rallies to extend Elon Musk Tesla gains,...",1,"{'class-name': 'neutral', 'job-name': 'labelin...",neutral,labeling-job/tweets,0.92,groundtruth/text-classification,yes,2022-09-09T16:50:21.849803,positive,0.4767
...,...,...,...,...,...,...,...,...,...,...,...
485,"""@PeterMcCormack @elonmusk @michael_saylor @Ba...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.44,groundtruth/text-classification,yes,2022-09-09T16:54:14.675203,negative,-0.2840
489,Should I do next giveaway in #Bitcoin ? Comment.,2,"{'class-name': 'negative', 'job-name': 'labeli...",negative,labeling-job/tweets,-0.95,groundtruth/text-classification,yes,2022-09-09T16:49:52.146579,neutral,0.0000
490,"""If all special funds were to allocate the ful...",2,"{'class-name': 'negative', 'job-name': 'labeli...",negative,labeling-job/tweets,-0.92,groundtruth/text-classification,yes,2022-09-09T16:48:38.132308,positive,0.4019
493,"""Latest for financial advisers: #ESG, #Bitcoin...",0,"{'class-name': 'positive', 'job-name': 'labeli...",positive,labeling-job/tweets,0.95,groundtruth/text-classification,yes,2022-09-09T16:52:55.709167,neutral,0.0000


In [76]:
print(df_normal[df_normal['class-name'] != df_normal['vader-sentiment']].source)

0      """It has to be part of the menu,"" says @JimC...
1      "This week, @intotheblock takes a dive into th...
2      "Novogratz Predicts, #Bitcoin Price Will Hit $...
4      Not everyone is impressed by the electric vehi...
5      "#BTC rallies to extend Elon Musk Tesla gains,...
                             ...                        
485    "@PeterMcCormack @elonmusk @michael_saylor @Ba...
489     Should I do next giveaway in #Bitcoin ? Comment.
490    "If all special funds were to allocate the ful...
493    "Latest for financial advisers: #ESG, #Bitcoin...
494    Gold Year To Date:  -6.2% #Bitcoin Year To Dat...
Name: source, Length: 306, dtype: string


In [81]:
HTML('<br />'.join(str(y) for y in df_normal[(df_normal['class-name'] == 'negative') & (df_normal['vader-sentiment'] == 'positive')].source))