In [37]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.stem import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re

from gensim.corpora import Dictionary
from gensim.models import LdaModel
from sklearn.model_selection import train_test_split
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

## Task 1

In [3]:
# Load the review dataset; the "review" and "sentiment" columns are used for the split below.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exact file exists.
data = pd.read_csv('C:/Users/felix/Documents/TU Dortmund/Text as Data/Sheet7/data.csv')

In [5]:
# Hold out 25 % of the reviews for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["review"],
    data["sentiment"],
    test_size=0.25,
    random_state=42,
)

In [51]:
# Sanity check: the held-out texts are still a pandas Series after the split.
print(type(X_test))

<class 'pandas.core.series.Series'>


## Task 2

In [7]:
# Cache for the cleaned + lemmatized stop words. They never change between documents,
# so they are built on the first call only instead of once per document.
_PREPROCESSED_STOP_WORDS = None


def _keep_letters_and_spaces(raw):
    """Lower-case `raw` and drop every character that is neither alphabetic nor whitespace."""
    return "".join(char.lower() for char in raw if char.isalpha() or char.isspace())


def _get_preprocessed_stop_words(lemmatizer):
    """Return the English NLTK stop words, cleaned and lemmatized the same way as the documents."""
    global _PREPROCESSED_STOP_WORDS
    if _PREPROCESSED_STOP_WORDS is None:
        # frozenset: only membership is needed, and hashing makes each lookup O(1)
        _PREPROCESSED_STOP_WORDS = frozenset(
            lemmatizer.lemmatize(_keep_letters_and_spaces(word))
            for word in stopwords.words('english')
        )
    return _PREPROCESSED_STOP_WORDS


def single_preprocessing(document):
    """
    This function performs all preprocessing steps for a single text:
    lower-casing, removal of non-alphabetical characters, whitespace
    tokenization, WordNet lemmatization and stop-word removal.

    parameter: text as string
    output: list that contains the preprocessed tokens (in their original order)

    Note: non-alphabetical characters are deleted, not replaced by a space,
    so words separated only by punctuation are merged into one token and
    markup such as "<br />" leaves the token "br" behind.
    """
    lemmatizer = WordNetLemmatizer()

    # stop words go through the same cleaning as the text (e.g. "don't" -> "dont"),
    # so that they can match the cleaned tokens
    stop_words = _get_preprocessed_stop_words(lemmatizer)

    # remove non-alphabetical chars except whitespace and split the text into tokens
    tokens = _keep_letters_and_spaces(document).split()

    # lemmatize each token and keep it only if it is not a stop word
    preprocessed_document = []
    for token in tokens:
        lemma = lemmatizer.lemmatize(token)
        if lemma not in stop_words:
            preprocessed_document.append(lemma)

    return preprocessed_document

In [8]:
def nested_preprocessing(corpus):
    """
    Apply `single_preprocessing` to every text of a corpus.

    parameter: list of texts (strings)
    output: list with one entry per text, in input order; each entry is
            the list of preprocessed tokens of that text
    """
    preprocessed_corpus = []
    for text in corpus:
        preprocessed_corpus.append(single_preprocessing(text))
    return preprocessed_corpus

In [9]:
# Preprocess every training review: one list of tokens per review, in the order of X_train.
X_train_preprocessed = nested_preprocessing(X_train.to_list())

In [10]:
# Preprocess every test review the same way: one list of tokens per review, in the order of X_test.
X_test_preprocessed = nested_preprocessing(X_test.to_list())

## Task 3

In [11]:
# Wrap each training token list in a TaggedDocument whose single tag is its position in the corpus.
X_train_tagged = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(X_train_preprocessed)
]

In [12]:
# Peek at the first ten tagged documents to check the words/tags structure.
print(X_train_tagged[:10])

[TaggedDocument(words=['figure', 'alternate', 'reality', 'teen', 'flickmore', 'precisely', 'ferris', 'bueller', 'type', 'character', 'leader', 'cheat', 'ring', 'yeah', 'know', 'meant', 'compared', 'ferris', 'bueller', 'least', 'orangestooranges', 'way', 'nonethelessbr', 'br', 'bottomline', 'galaxy', 'away', 'even', 'even', 'minor', 'classic', 'watchable', 'though', 'expecting', 'much', 'said', 'main', 'character', 'charm', 'premise', 'wear', 'thin', 'writing', 'clever', 'movie', 'deliver', 'enough', 'laugh', 'twist', 'tension', 'keep', 'interest', 'br', 'br', 'honest', 'continue', 'watchingwatching', 'hope', 'see', 'anything', 'suddenly', 'clicked', 'stylish', 'recommend', 'movie', 'btw', 'seems', 'odd', 'see', 'mary', 'tyler', 'moore', 'principal', 'truly', 'miscast', 'hope', 'paycheck', 'inordinately', 'big'], tags=[0]), TaggedDocument(words=['kind', 'movie', 'want', 'good', 'suck', 'first', 'thing', 'hell', 'punk', 'trying', 'school', 'think', 'kid', 'seem', 'realize', 'gravity', 's

In [13]:
# Doc2Vec model producing 100-dimensional document vectors with a context window of 5.
# NOTE(review): only one training epoch — presumably to keep the runtime short; confirm this is intended.
model = Doc2Vec(vector_size=100, window=5, epochs=1)

In [14]:
# Build the model vocabulary from the tagged training documents (must happen before training).
model.build_vocab(X_train_tagged)

In [15]:
# Train on the tagged training documents, using the document count and epoch number stored on the model.
model.train(X_train_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [16]:
# Infer the embedding of the first preprocessed training review and inspect it.
vector = model.infer_vector(X_train_preprocessed[0])
print(vector)

[ 0.04145692 -0.00565632  0.02075837  0.02224232 -0.01171277 -0.0307511
  0.03952111  0.0463496  -0.00518927 -0.01438788  0.0129357  -0.00716421
 -0.00222433  0.00144624  0.00865692  0.0129415   0.01924692 -0.01864611
  0.01343995 -0.05659904  0.03036925 -0.01120047  0.02603097  0.0167676
  0.0020928   0.0243952  -0.0316448  -0.03207208 -0.01274072 -0.00356369
 -0.00885206 -0.00875334  0.04403827 -0.01760861  0.0308297   0.01579852
 -0.00995274 -0.008655   -0.01449026 -0.01495527 -0.00687278  0.03317825
  0.00052388 -0.01839344  0.01201198 -0.01223186 -0.0439086  -0.00844268
  0.01295803  0.03021282  0.0280003   0.03259393 -0.00434075 -0.01242853
 -0.00465453 -0.00948307  0.01433225 -0.02618631 -0.03318497 -0.01225872
 -0.01180308  0.00276462  0.00507419 -0.00729723 -0.02002504  0.0210344
  0.00888676  0.03964304 -0.01209769  0.00197799 -0.00405097  0.03038548
  0.01193575 -0.0134522   0.02560717 -0.00772716  0.02457949 -0.00699022
 -0.02493479 -0.02401142 -0.00986655  0.01176244  0.01

In [17]:
# Infer one Doc2Vec vector per preprocessed training review (same order as X_train / y_train).
# The token lists are iterated directly; the raw X_train texts are not needed here.
X_train_embedding = [model.infer_vector(tokens) for tokens in X_train_preprocessed]

In [18]:
# Stack the per-review vectors into a single 2-D array (one row per training review).
X_train_final = np.stack(X_train_embedding)

In [19]:
# Expected: (number of training reviews, vector_size).
X_train_final.shape

(37500, 100)

In [20]:
# Fit a logistic-regression classifier on the training embeddings and their sentiment labels.
clf = LogisticRegression(random_state=0).fit(X_train_final, y_train)

In [21]:
# Infer one Doc2Vec vector per preprocessed test review (same order as X_test / y_test).
# The token lists are iterated directly; the raw X_test texts are not needed here.
X_test_embedding = [model.infer_vector(tokens) for tokens in X_test_preprocessed]

In [22]:
# Stack the per-review vectors into a single 2-D array (one row per test review).
X_test_final = np.stack(X_test_embedding)

In [23]:
# Predict a sentiment label for every test embedding.
y_pred = clf.predict(X_test_final)

In [24]:
# Per-class precision / recall / F1 of the Doc2Vec + logistic-regression classifier on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.75      0.73      0.74      6157
    positive       0.75      0.76      0.75      6343

    accuracy                           0.75     12500
   macro avg       0.75      0.75      0.75     12500
weighted avg       0.75      0.75      0.75     12500



## Task 5

In [29]:
# Load the WKWSCI sentiment lexicon (columns term, POS, sentiment — see the preview printed below).
# NOTE(review): absolute, machine-specific path — the notebook only runs where this exact file exists.
wkwsci = pd.read_excel('C:/Users/felix/Documents/TU Dortmund/Text as Data/Sheet7/WKWSCI.xlsx')

In [30]:
# Preview the first rows of the lexicon.
print(wkwsci.head())

     term  POS  sentiment
0       a  det          0
1    a.d.  adv          0
2  a.k.a.  adv          0
3    a.m.  adj          0
4    a.m.  adv          0


In [31]:
# Plain Python list of all lexicon terms, in row order.
wkwsci_word_list = wkwsci['term'].to_list()

In [None]:
# `word_preprocessed` is not defined in any visible cell, so this cell raised a NameError on a
# fresh kernel. It is derived here from the lexicon terms with the same preprocessing as the reviews.
# NOTE(review): presumably this is what the missing cell did — confirm. str() guards against
# terms that Excel/pandas parsed as non-strings (numbers, booleans, NaN).
word_preprocessed = nested_preprocessing([str(term) for term in wkwsci_word_list])
wkwsci["term_preprocessed"] = word_preprocessed

In [47]:
# term -> sentiment lookup table.
# Terms listed more than once (e.g. 'a.m.' with different POS) keep the sentiment of their last row.
wkwsci_dict = dict(zip(wkwsci.term, wkwsci.sentiment))

In [None]:
# Lexicon-based score per test review: the sum of the WKWSCI sentiment values of all
# whitespace-separated tokens that appear in the lexicon (tokens not in the lexicon count as 0).
# The Series is converted to a list once, and each token is looked up directly in the dict
# instead of scanning every lexicon entry per token.
# NOTE(review): tokens come from the raw text, so case and attached punctuation must match the
# lexicon term exactly (e.g. "Good" or "good," will not match "good") — confirm this is intended.
sentiment_scores = [
    sum(wkwsci_dict.get(token, 0) for token in review.split())
    for review in X_test.to_list()
]


In [None]:
# Map scores to labels: strictly positive totals become 'positive'; zero or negative totals become 'negative'.
y_pred_sent = ['positive' if score >0 else 'negative' for score in sentiment_scores]

In [None]:
# Per-class precision / recall / F1 of the lexicon-based classifier on the test set.
print(classification_report(y_test, y_pred_sent))