# Baseline Modelle

In [1]:
#Laden der benötigten Bibliotheken
import pandas as pd
import numpy as np
import nltk   #Natural Language ToolKit
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Daten einlesen


In [2]:
# Load the tweet data set from a CSV file into a DataFrame.
# NOTE(review): the path is an empty string here — fill in the real file location before running.
data = pd.read_csv("")

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,id,created_at,username,name,user_id,body,basic_sentiment,symbol0,symbol1,symbol2,symbol3,symbol4
0,410821194,2021-11-28 12:28:21+00:00,chuckygorman,Chuck Gorman,1068147,$AAPL $BA $DIS,0,AAPL,BA,DIS,,
1,410821004,2021-11-28 12:25:13+00:00,Renohawk,jessie livermore,162218,$SPY $TSLA $MRNA $AAPL $NVDA \n\nMy thoughts o...,0,AAPL,MRNA,NVDA,SPY,TSLA
2,410820720,2021-11-28 12:19:36+00:00,newtwo99,PS not a bot!,813188,$AAPL $BA $DIS $UNH,0,AAPL,BA,DIS,UNH,
3,410819818,2021-11-28 12:02:00+00:00,ChartMill,ChartMill,47688,The industry average Profit Margin is 4.39%. $...,0,AAPL,,,,
4,410819568,2021-11-28 11:56:23+00:00,OptionsPlayers,Steve /OptionsDayTrader77,619769,$AAPL $BA $DIS On the new Covid variant -\n\n&...,1,AAPL,BA,DIS,,


## Pre-Processing

Da Vader kaum Pre-Processing benötigt und der sklearn TfidfVectorizer den Großteil des Pre-Processings auch erledigt, entfallen viele Pre-Processing Schritte.

In [4]:
# Remove HTML tags and URLs from the raw message text.
from bs4 import BeautifulSoup
import re
# Name the parser explicitly: without it bs4 uses whichever parser happens to be
# installed (lxml, html5lib or html.parser), which can change the extracted text
# between machines and triggers a GuessedAtParserWarning.
data["tweets_preproc"]=data["body"].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
# Delete everything from "http" up to the next whitespace character.
data["tweets_preproc"]=data["tweets_preproc"].apply(lambda x: re.sub(r"http\S+", "", x))

In [5]:
# Remove special characters: inside every token only the letters A-Z / a-z are kept.
# The tokenizer needs the NLTK "punkt" resources; download once with nltk.download('punkt').
def _letters_only(text):
    """Tokenize ``text`` with NLTK and rejoin the tokens after deleting all non-letter characters."""
    tokens = nltk.word_tokenize(text)
    stripped = [re.sub('[^A-Za-z]+','', token) for token in tokens]
    # Tokens made up solely of non-letters turn into empty strings and still take part in the join.
    return " ".join(stripped)

data["tweets_preproc"] = data["tweets_preproc"].apply(_letters_only)

In [6]:
# Collapse every run of consecutive spaces into a single space.
_space_run = re.compile(' +')
data["tweets_preproc"] = data["tweets_preproc"].apply(lambda text: _space_run.sub(' ', text))

In [7]:
# Check that the preprocessing steps produced the expected tweets_preproc column.
data.head()

Unnamed: 0,id,created_at,username,name,user_id,body,basic_sentiment,symbol0,symbol1,symbol2,symbol3,symbol4,tweets_preproc
0,410821194,2021-11-28 12:28:21+00:00,chuckygorman,Chuck Gorman,1068147,$AAPL $BA $DIS,0,AAPL,BA,DIS,,,AAPL BA DIS
1,410821004,2021-11-28 12:25:13+00:00,Renohawk,jessie livermore,162218,$SPY $TSLA $MRNA $AAPL $NVDA \n\nMy thoughts o...,0,AAPL,MRNA,NVDA,SPY,TSLA,SPY TSLA MRNA AAPL NVDA My thoughts on the st...
2,410820720,2021-11-28 12:19:36+00:00,newtwo99,PS not a bot!,813188,$AAPL $BA $DIS $UNH,0,AAPL,BA,DIS,UNH,,AAPL BA DIS UNH
3,410819818,2021-11-28 12:02:00+00:00,ChartMill,ChartMill,47688,The industry average Profit Margin is 4.39%. $...,0,AAPL,,,,,The industry average Profit Margin is AAPL out...
4,410819568,2021-11-28 11:56:23+00:00,OptionsPlayers,Steve /OptionsDayTrader77,619769,$AAPL $BA $DIS On the new Covid variant -\n\n&...,1,AAPL,BA,DIS,,,AAPL BA DIS On the new Covid variant It prese...


In [9]:
# Split the frame on the label column; this notebook treats basic_sentiment == 0 as "no label".
# A boolean mask selects rows by position of the condition, so the split stays correct even
# if the index contains duplicate labels (dropping by index label would remove all of them).
is_unlabeled = data["basic_sentiment"] == 0
# Data set containing only the labeled tweets; .copy() makes it an independent frame
# so that new columns can be added to it later without touching `data`.
data_labeled = data[~is_unlabeled].copy()
# Unlabeled tweets, kept for later use.
data_unlabeled = data[is_unlabeled].copy()

## Sentiment Analysis

### 1 - VADER

Vorteile: <br>
- Braucht kein Training
- Funktioniert ohne gelabelte Daten
- speziell für Social Media (unterscheidet z.B. TOLL - toll - toll!)

Nachteile: <br>
- Liefert teilweise keine optimalen Ergebnisse
- Hat drei Klassen (neg/neutr/pos), Tweets haben nur zwei

In [10]:
# VADER needs its lexicon resource; if it is missing, download it once with
# nltk.download('vader_lexicon').
sid = SentimentIntensityAnalyzer()
# Score every preprocessed tweet; each entry of v_scores is the dict returned by polarity_scores.
# NOTE(review): tweets_preproc no longer contains punctuation or other non-letter symbols,
# so signals such as "!" never reach VADER — consider scoring a less aggressively cleaned text.
data_labeled["v_scores"] = data_labeled["tweets_preproc"].apply(sid.polarity_scores)

In [11]:
# Pull the 'compound' value out of every score dict into a column of its own.
data_labeled['compound'] = [scores['compound'] for scores in data_labeled['v_scores']]

In [12]:
# Predictions: map the VADER compound score onto three classes
# (1 = positive, -1 = negative, 0 = neutral).
# The cut-offs are inclusive (>= 0.05 and <= -0.05), matching the thresholds documented by
# the VADER authors; with strict comparisons a score of exactly +/-0.05 was counted as neutral.
data_labeled['v_sentiment'] = (
    np.where(
        data_labeled['compound'] >= 0.05, 1,
        np.where(data_labeled['compound'] <= -0.05, -1, 0)))

In [14]:
# Confusion matrix of the labels against the VADER predictions
# (rows: true basic_sentiment, columns: predicted v_sentiment).
from sklearn.metrics import confusion_matrix
confusion_matrix(
    y_true=data_labeled["basic_sentiment"],
    y_pred=data_labeled["v_sentiment"],
)

array([[1355, 1999, 1340],
       [   0,    0,    0],
       [2715, 8743, 6748]], dtype=int64)

In [13]:
# Number of tweets per predicted VADER class.
data_labeled["v_sentiment"].value_counts()

 0    10742
 1     8088
-1     4070
Name: v_sentiment, dtype: int64

### 2 - TF - IDF

Term frequency – inverse document frequency: $\text{tf-idf}(t,d) = \text{tf}(t,d) \cdot \text{idf}(t)$ <br>
$\text{tf}(t,d)$ = Häufigkeit eines Wortes $t$ in einem Tweet/Dokument $d$ <br>
$\text{idf}(t) = \log\frac{n}{1+\text{df}(t)}$ (Standardformulierung; sklearn weicht leicht davon ab und verwendet standardmäßig $\text{idf}(t) = \ln\frac{1+n}{1+\text{df}(t)} + 1$) <br>
mit $n$ = Anzahl aller Tweets im Datensatz, $\text{df}(t)$ = Anzahl aller Tweets, die Wort $t$ enthalten. <br>
Tf-idf wird zusätzlich normalisiert: <br>
$v_{norm} = \frac{v}{\lVert v \rVert_{2}} = \frac{v}{\sqrt{v_{1}^{2} + v_{2}^{2} + \dots + v_{n}^{2}}}$ <br>

In [19]:
# Stratified train/test split: training and test data keep the same class proportions.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    data_labeled["tweets_preproc"],
    data_labeled["basic_sentiment"],
    test_size=0.25,
    random_state=42,
    stratify=data_labeled["basic_sentiment"],
)
# Report the resulting sizes of both parts.
print("Training Set: ", X_train.shape, Y_train.shape, "Test Set: ", (X_test.shape, Y_test.shape))

Training Set:  (17175,) (17175,) Test Set:  ((5725,), (5725,))


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with scikit-learn's default settings.
vectorizer= TfidfVectorizer()
# Fit on the training tweets only, then reuse the fitted vectorizer to transform the test tweets.
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [21]:
# Display the summary of the sparse TF-IDF training matrix.
tf_x_train

<17175x13974 sparse matrix of type '<class 'numpy.float64'>'
	with 232550 stored elements in Compressed Sparse Row format>

In [46]:
#pd.DataFrame.sparse.from_spmatrix(tf_x_train)

In [22]:
# Display the summary of the sparse TF-IDF test matrix.
tf_x_test

<5725x13974 sparse matrix of type '<class 'numpy.float64'>'
	with 74767 stored elements in Compressed Sparse Row format>

In [39]:
# SVM classifier with default hyperparameters (no grid search).
# NOTE(review): the assignment below rebinds the name `svm` from the sklearn.svm module
# to an SVC instance, so a later `svm.SVC()` in the same kernel raises AttributeError.
from sklearn import svm
svm = svm.SVC()

In [40]:
# Fit the SVM on the TF-IDF training features.
svm.fit(tf_x_train, Y_train)

SVC()

In [41]:
# Predict the labels of the test tweets with the default SVM.
y_fit_svm_base = svm.predict(tf_x_test)

In [42]:
# Results / model quality: classification report returned as a dict (output_dict = True).
from sklearn.metrics import classification_report
svm_results_base = classification_report(Y_test, y_fit_svm_base, output_dict = True)

In [43]:
# Show the report of the default SVM.
svm_results_base

{'-1': {'precision': 0.7736318407960199,
  'recall': 0.2651321398124467,
  'f1-score': 0.39492063492063495,
  'support': 1173},
 '1': {'precision': 0.8380612436595904,
  'recall': 0.9800087873462214,
  'f1-score': 0.9034936708860759,
  'support': 4552},
 'accuracy': 0.8335371179039301,
 'macro avg': {'precision': 0.8058465422278052,
  'recall': 0.6225704635793341,
  'f1-score': 0.6492071529033554,
  'support': 5725},
 'weighted avg': {'precision': 0.8248602498501636,
  'recall': 0.8335371179039301,
  'f1-score': 0.7992917195869559,
  'support': 5725}}

In [34]:
# Grid search over SVM hyperparameters (linear kernel: C; RBF kernel: C and gamma).
from sklearn.model_selection import GridSearchCV
# Import the class directly instead of calling svm.SVC(): the earlier baseline cell rebinds
# `svm` to an SVC instance, so `svm.SVC()` fails when the notebook is run top to bottom.
from sklearn.svm import SVC

model_svm = SVC()
param_grid = [
  {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-3, 2, 6), 'kernel': ['rbf']},
 ]

# n_jobs = -1 uses all available cores; verbose = 2 prints progress for each fit.
grid_search = GridSearchCV(model_svm, param_grid, n_jobs = -1, verbose = 2)
grid_search.fit(tf_x_train, Y_train)

Fitting 5 folds for each of 41 candidates, totalling 205 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                          'kernel': ['rbf']}],
             verbose=2)

In [35]:
# Print the hyperparameter combination selected by the grid search.
print(grid_search.best_params_)

{'C': 10.0, 'gamma': 1.0, 'kernel': 'rbf'}


In [36]:
# Apply the best model found by the grid search to the test data and store the predictions.
model_svm = grid_search.best_estimator_
y_fit_svm = model_svm.predict(tf_x_test)

In [37]:
# Results / model quality of the tuned SVM, returned as a dict (output_dict=True).
svm_results=classification_report(Y_test, y_fit_svm,output_dict=True)

In [38]:
# Show the report of the tuned SVM.
svm_results

{'-1': {'precision': 0.7271341463414634,
  'recall': 0.40664961636828645,
  'f1-score': 0.5215965008201203,
  'support': 1173},
 '1': {'precision': 0.8626948115999211,
  'recall': 0.960676625659051,
  'f1-score': 0.9090531129820185,
  'support': 4552},
 'accuracy': 0.8471615720524017,
 'macro avg': {'precision': 0.7949144789706923,
  'recall': 0.6836631210136688,
  'f1-score': 0.7153248069010694,
  'support': 5725},
 'weighted avg': {'precision': 0.8349196744212014,
  'recall': 0.8471615720524017,
  'f1-score': 0.8296668062456157,
  'support': 5725}}

   precision = tp / (tp + fp) <br>
   recall / sensitivity = tp / (tp + fn) <br>
   F1 = 2 * (precision * recall) / (precision + recall)

In [29]:
# Random forest classifier.
from sklearn.ensemble import RandomForestClassifier
# Base model with default hyperparameters; random_state is fixed for reproducibility.
rf = RandomForestClassifier(random_state = 42)

In [30]:
# Fit the base random forest on the TF-IDF training features.
rf.fit(tf_x_train, Y_train)

RandomForestClassifier(random_state=42)

In [31]:
# Predict the labels of the test tweets with the random forest.
y_fit_rf = rf.predict(tf_x_test)

In [32]:
# Results / model quality of the random forest, returned as a dict (output_dict=True).
rf_results=classification_report(Y_test, y_fit_rf,output_dict=True)

In [33]:
# Show the report of the random forest.
rf_results

{'-1': {'precision': 0.7417061611374408,
  'recall': 0.2668371696504689,
  'f1-score': 0.3924764890282132,
  'support': 1173},
 '1': {'precision': 0.8378276447293984,
  'recall': 0.9760544815465729,
  'f1-score': 0.9016742770167427,
  'support': 4552},
 'accuracy': 0.8307423580786026,
 'macro avg': {'precision': 0.7897669029334196,
  'recall': 0.6214458255985209,
  'f1-score': 0.647075383022478,
  'support': 5725},
 'weighted avg': {'precision': 0.8181332342047931,
  'recall': 0.8307423580786026,
  'f1-score': 0.797344319757259,
  'support': 5725}}

### Zusammenfassung:

    VADER: funktioniert nicht gut
    TF_IDF: liefert für positive Tweets gute Ergebnisse; negative Tweets werden nicht gut erkannt

ToDo: <br>
pre-processing korrekt? <br>
Ergebnisse schöner darstellen


