# Anwendungsbeispiel: Import von Textdaten mit Sentiment-Klassifikation

In diesem Beispiel werden wir Textdaten behandeln und versuchen, die Stimmung eines kurzen Textstücks zu bestimmen. Damit können zum Beispiel E-Mails oder Social-Media-Beiträge gefiltert werden.





- [2] [https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/](https://stackabuse.com/python-for-nlp-movie-sentiment-analysis-using-deep-learning-in-keras/)
- [3] [https://gdcoder.com/sentiment-clas/](https://gdcoder.com/sentiment-clas/)
- [4] [https://nlp.stanford.edu/pubs/glove.pdf](https://nlp.stanford.edu/pubs/glove.pdf)


Zitierungen:
```
Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. GloVe: Global Vectors for Word Representation.

```









# Import der Module

In [164]:
#
# Import der Module
#
import os
import re
import string
from urllib.request import urlretrieve
import tarfile
import zipfile
from glob import glob

import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense, SpatialDropout1D
from keras.layers import Flatten, LSTM
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

In [165]:
#
# Silence warnings
#
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
# ignore every Warning category; FutureWarning is a subclass of Warning,
# so this second filter also covers the one above
simplefilter(action='ignore', category=Warning)

In [166]:
#
# GPU support
#
import tensorflow as tf
print ( tf.__version__ ) 

# Restrict TensorFlow logging to ERROR level.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR )
physical_devices = tf.config.experimental.list_physical_devices('GPU')
# Enable memory growth on the first listed GPU only (if there is one).
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
# Additionally hand the Keras backend a TF1-compat session with allow_growth set.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

2.0.0


# Konstanten

In [167]:
#
# Constants for the movie review data files
#
# Remote archive to download.
urlDataSource = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
# Local folder that receives both the archive and its extracted content.
localExtractionFolder = 'data/moviereviews'
localDataArchive = localExtractionFolder + '/aclImdb_v1.tar.gz'
# Base path (with trailing slash) that 'train' / 'test' are appended to later.
textData = localExtractionFolder + '/aclImdb/'

# Hilfsfunktionen

In [168]:
#
# Laden der Daten von einer URL
#
def download_dataset(url,dataset_file_path,extraction_directory):
    """Download ``url`` to ``dataset_file_path`` unless that file already exists.

    Args:
        url: Location of the archive to fetch.
        dataset_file_path: Local path the archive is stored under.
        extraction_directory: Directory that is created when it is missing.

    Returns:
        None. Progress is reported on stdout.
    """
    directory_missing = not os.path.exists(extraction_directory)
    if directory_missing:
        os.makedirs(extraction_directory)

    # Nothing to fetch when a previous run already stored the archive.
    if os.path.exists(dataset_file_path):
        print("archive already downloaded.")
        return

    print("started loading archive from url {}".format(url))
    stored_path, _ = urlretrieve(url, dataset_file_path)
    print("finished loading archive from url {} to {}".format(url,stored_path))

def extract_dataset(dataset_file_path, extraction_directory):
    """Extract a tar archive into ``extraction_directory``.

    Supported file name endings are ``tar.gz`` / ``.tgz`` (gzip compressed)
    and ``tar`` (uncompressed). For any other ending nothing is extracted and
    a message saying so is printed instead of the success message.

    Args:
        dataset_file_path: Path of the archive file.
        extraction_directory: Target directory; created when missing.

    Returns:
        None. The outcome is reported on stdout.
    """
    os.makedirs(extraction_directory, exist_ok=True)

    if dataset_file_path.endswith(("tar.gz", ".tgz")):
        mode = "r:gz"
    elif dataset_file_path.endswith("tar"):
        mode = "r:"
    else:
        # Do not claim success when no branch handled the file.
        print("unsupported archive type, nothing extracted from {}.".format(dataset_file_path))
        return

    # NOTE(review): extractall trusts the member paths inside the archive;
    # only use this with archives from a trusted source.
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(dataset_file_path, mode) as tar:
        tar.extractall(path=extraction_directory)
    print("extraction of dataset from {} to {} done.".format(dataset_file_path,extraction_directory) )


# Laden und erster Check

In [169]:
#
# Run the download of the review archive
#
download_dataset(urlDataSource,localDataArchive,localExtractionFolder)

archive already downloaded.


In [170]:
#
# Extract the downloaded archive
#
extract_dataset(localDataArchive,localExtractionFolder)

extraction of dataset from data/moviereviews/aclImdb_v1.tar.gz to data/moviereviews done.


# Wie sehen die Daten auf dem Filesystem aus?

In [None]:
#
# Sammeln der Daten aus den Files
#
def load_texts_labels_from_folders(path, folders):
    """Read all files below ``path/<folder>`` and label them by folder position.

    Args:
        path: Base directory containing one sub-directory per class.
        folders: Sub-directory names; the position of a name in this sequence
            becomes the numeric label of every file found in it.

    Returns:
        tuple: ``(texts, labels)`` where ``texts`` is a list of file contents
        and ``labels`` is a numpy ``int8`` array of the same length.
    """
    print('scanning path {}'.format(path))
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        print('scanning {}'.format(idx))
        # sorted(): glob order is arbitrary; sorting makes sample indices reproducible.
        for fname in sorted(glob(os.path.join(path, label, '*.*'))):
            # Explicit encoding avoids platform-dependent decoding, and the
            # context manager closes each of the many file handles right away.
            with open(fname, 'r', encoding='utf-8') as text_file:
                texts.append(text_file.read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int8)

In [None]:
#
# Load the negative and positive examples
#
# The list position is the label: 'neg' -> 0, 'pos' -> 1.
classes = ['neg','pos']
x_train,y_train = load_texts_labels_from_folders( textData + 'train', classes)
x_test,y_test = load_texts_labels_from_folders( textData + 'test', classes)

In [None]:
# Number of texts and labels in the train and test sets (shown as cell output).
len(x_train),len(y_train),len(x_test),len(y_test)

In [None]:
#
# Check the data types of texts and labels
#
(type(x_train),type(y_train))

In [None]:
#
# Check which label values occur
#
np.unique(y_train)

In [None]:
#
# Negative examples
#
# Prints the first training text; 'neg' is loaded first, so it carries label 0.
for index in range (0,1):
    print(x_train[index])
    print("label {}".format(y_train[index]))
    print()

In [None]:
#
# Positive examples
#
# NOTE(review): index 13001 presumably lies in the 'pos' part because the
# 'neg' files are loaded first - verify against the size of the 'neg' folder.
for index in range (13001,13002):
    print(x_train[index])
    print("label {}".format(y_train[index]))
    print()


# Zerlegen der Texte in Worte und Reinigung

In [None]:
#
# Text cleaning helpers
#
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    """Return ``text`` with every ``<...>`` markup tag removed."""
    return TAG_RE.sub('', text)


def preprocess_text(sen):
    """Reduce a raw review to letters separated by single spaces.

    Steps: strip HTML tags, replace every non-letter by a space, drop
    single-letter words that are surrounded by whitespace, collapse runs of
    whitespace into one space. Leading/trailing spaces are not stripped.

    Args:
        sen: Raw text of one review.

    Returns:
        str: The cleaned text.
    """
    # Removing html tags
    sentence = remove_tags(sen)
    # Remove punctuations and numbers
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)
    # Single character removal. The trailing whitespace is only looked at,
    # not consumed, so it can serve as the leading whitespace of the next
    # match; otherwise every second letter of a run like " a b c " survives.
    sentence = re.sub(r"\s+[a-zA-Z](?=\s)", '', sentence)
    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)
    return sentence

In [None]:
# Clean every review; the cleaned lists keep their own names because
# x_test_clean is read again further down.
x_train_clean = [preprocess_text(raw_review) for raw_review in x_train]
x_test_clean = [preprocess_text(raw_review) for raw_review in x_test]

# From here on x_train / x_test refer to the cleaned texts.
x_train, x_test = x_train_clean, x_test_clean

In [None]:
# Show the first training text again, now after cleaning.
for index in range (0,1):
    print(x_train[index])
    print("label {}".format(y_train[index]))
    print()

In [None]:
#
# Count the character lengths of the texts
#
textLength = [len(text) for text in x_train]

plt.hist(textLength)
lengthArray = np.array(textLength)
print('text character length mean {}'.format(np.mean(lengthArray)))

# Umwandeln der Worte in Vektoren

In [None]:
#
# Split the sentences into words
#
# The word index is fitted on the training texts only; words are lower-cased
# and 'unknwn' is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=5000, lower=True, oov_token='unknwn')
tokenizer.fit_on_texts(x_train)

In [None]:
# Convert both text sets to integer sequences with the tokenizer fitted above.
x_train_v = tokenizer.texts_to_sequences(x_train)
x_test_v = tokenizer.texts_to_sequences(x_test)

In [None]:
# Show the integer sequence of the first training text.
print(x_train_v[0])

In [None]:
#
# Count the lengths of the vectorized texts
#
textLength = [len(sequence) for sequence in x_train_v]

plt.hist(textLength)
lengthArray = np.array(textLength)
print('vectorized length mean {}'.format(np.mean(lengthArray)))

In [None]:
#
# Size of the vocabulary used for the embedding matrix
#
# NOTE(review): the +1 presumably accounts for index 0, which the word index
# does not assign to a word - confirm against the Keras Tokenizer docs.
# NOTE(review): this counts every word in word_index, apparently independent
# of num_words=5000 given to the Tokenizer - verify.
vocab_size = len(tokenizer.word_index) + 1
print('count of words {}'.format(vocab_size))

In [None]:
# Fixed sequence length fed to the models.
maxlen = 200

# Padding is appended at the end ('post'); x_train / x_test are rebound from
# the cleaned texts to the padded integer sequences here.
x_train = pad_sequences(x_train_v, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test_v, padding='post', maxlen=maxlen)

# Umrechnung in einen dichten Vektorraum (glove)

In [None]:
# Download location and local paths of the GloVe word vectors.
gloveUrl = 'http://nlp.stanford.edu/data/glove.6B.zip'
gloveExtractionFolder = 'data/glove'
gloveDataArchive = gloveExtractionFolder + '/glove.6B.zip'
gloveData = gloveExtractionFolder + '/' + 'glove.6B.100d.txt'

# Vector length; must match the file chosen in gloveData (the 100d file).
gloveDims = 100

In [None]:
def unzip_dataset(dataset_file_path, extraction_directory):
    """Extract a zip archive into ``extraction_directory``.

    Args:
        dataset_file_path: Path of the zip file.
        extraction_directory: Target directory; created when missing.

    Returns:
        None. Completion is reported on stdout.
    """
    os.makedirs(extraction_directory, exist_ok=True)
    # Context manager closes the archive; the local name no longer shadows
    # the builtin zip().
    with zipfile.ZipFile(dataset_file_path) as archive:
        archive.extractall(path=extraction_directory)
    print("extraction of dataset from {} to {} done.".format(dataset_file_path,extraction_directory) )

In [None]:
#
# Run the download of the GloVe archive
#

# Skipped when the extracted vector file is already present.
if ( not os.path.exists(gloveData)):
    download_dataset(gloveUrl,gloveDataArchive,gloveExtractionFolder)

In [None]:
# Unpack the archive only when the extracted vector file is still missing.
if ( not os.path.exists(gloveData)):
    unzip_dataset(gloveDataArchive,gloveExtractionFolder)

In [None]:
# Read the GloVe file into a dict: word -> float32 vector.
# Each line holds the word followed by its vector components, whitespace separated.
embeddings_dictionary = dict()
# The context manager closes the file even when parsing a line fails.
with open(gloveData, encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        if not records:
            # Tolerate blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary [word] = vector_dimensions

In [None]:
# One row per tokenizer index; rows of words without a GloVe vector stay zero.
embedding_matrix = np.zeros((vocab_size, gloveDims))
for word, index in tokenizer.word_index.items():
    if word not in embeddings_dictionary:
        continue
    embedding_matrix[index] = embeddings_dictionary[word]

In [None]:
# Expected shape: (vocab_size, gloveDims).
print(embedding_matrix.shape)

# Erstellen eines Modells

In [None]:
def createNNModel():
    """Build the dense baseline classifier (not compiled).

    Layer stack: frozen embedding initialised from ``embedding_matrix``,
    Flatten, Dense(100, sigmoid), Dropout(0.5), Dense(1, sigmoid).
    Reads the globals ``vocab_size``, ``gloveDims``, ``embedding_matrix``
    and ``maxlen``.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    network = Sequential()
    frozen_embedding = Embedding(
        vocab_size,
        gloveDims,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )
    remaining_layers = (
        Flatten(),
        Dense(100, activation='sigmoid'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    )
    network.add(frozen_embedding)
    for layer in remaining_layers:
        network.add(layer)
    return network

In [None]:
# Build and compile the dense baseline model.
model = createNNModel()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

In [None]:
# Train the baseline; validation_split=0.2 holds out 20% of the training data for validation.
history = model.fit(x_train, y_train, batch_size=128, epochs=12, verbose=1, validation_split=0.2)

In [None]:
# Evaluate on the separate test set; with metrics=['acc'], score is [loss, accuracy].
score = model.evaluate(x_test, y_test, verbose=1)

In [None]:
# Report loss and accuracy from the evaluation above.
print("test loss:", score[0])
print("test accuracy:", score[1])

In [None]:
def _plot_training_curve(history, metric, label):
    """Plot one metric and its ``val_`` counterpart over the epochs."""
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    # The second curve comes from the validation split of fit(), not from the
    # test set, so it is labelled accordingly.
    plt.legend(['train','validation'], loc='upper left')
    plt.show()


def plotResults(history):
    """Show accuracy and loss curves of a training run.

    Args:
        history: Object whose ``history`` dict provides the keys ``'acc'``,
            ``'val_acc'``, ``'loss'`` and ``'val_loss'``, as returned by
            ``model.fit`` with ``metrics=['acc']`` and a validation split.
    """
    _plot_training_curve(history, 'acc', 'accuracy')
    _plot_training_curve(history, 'loss', 'loss')

In [None]:
# Plot the training curves of the baseline model.
plotResults(history)

# Verbessertes Modell

Hinweise für bessere Modelle gefunden auf Kaggle [https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras](https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras).

In [None]:
def createLSTMModel():
    """Build and compile the LSTM classifier.

    Layer stack: frozen embedding initialised from ``embedding_matrix``,
    SpatialDropout1D(0.4), LSTM(190) with dropout and recurrent dropout of
    0.2, Dense(1, sigmoid). Reads the globals ``vocab_size``, ``gloveDims``,
    ``embedding_matrix`` and ``maxlen``.

    Returns:
        The ``Sequential`` model, compiled with adam / binary cross-entropy
        and the ``'acc'`` metric.
    """
    lstm_network = Sequential()
    frozen_embedding = Embedding(
        vocab_size,
        gloveDims,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False,
    )
    lstm_network.add(frozen_embedding)
    lstm_network.add(SpatialDropout1D(0.4))
    lstm_network.add(LSTM(190, dropout=0.2, recurrent_dropout=0.2))
    lstm_network.add(Dense(1, activation='sigmoid'))

    compile_options = dict(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    lstm_network.compile(**compile_options)
    return lstm_network

In [None]:
# createLSTMModel() already returns a compiled model, so the former second
# compile() call with identical settings was redundant and is dropped.
model = createLSTMModel()
# summary() prints the table itself and returns None; no print() wrapper needed.
model.summary()

In [None]:
# Train the LSTM model (20% of the training data held out for validation),
# then evaluate it on the separate test set.
history = model.fit(x_train, y_train, batch_size=32, epochs=10, verbose=1, validation_split=0.2)
score = model.evaluate(x_test, y_test, verbose=1)

In [None]:
# Report loss and accuracy of the LSTM model on the test set.
print("test loss:", score[0])
print("test accuracy:", score[1])

In [None]:
# Plot the training curves of the LSTM model.
plotResults(history)

# Test mit neuen Daten

In [None]:
# Pick one cleaned test review (still text, not yet a sequence) and show it.
instance = x_test_clean[56]
print(instance)

In [None]:
def sentiment(text):
    """Predict the sentiment of a text with the global ``model``.

    The text is cleaned with ``preprocess_text`` (as the training data was),
    converted with the global ``tokenizer``, padded to ``maxlen`` and passed
    to ``model.predict``.

    Args:
        text: A single string. An iterable of strings is also accepted; its
            token sequences are concatenated into one input sequence.

    Returns:
        tuple: ``(prediction, comment)``; ``prediction`` is the raw result of
        ``model.predict`` for the one padded sequence, ``comment`` is
        ``'very good'`` (> 0.85), ``'good'`` (> 0.75), ``'moderate'``
        (> 0.50) or ``'meh'``.
    """
    # texts_to_sequences expects a list of texts. Passing the bare string made
    # it iterate over the characters, so the model saw single letters instead
    # of words.
    texts = [text] if isinstance(text, str) else list(text)
    sequences = tokenizer.texts_to_sequences([preprocess_text(item) for item in texts])
    flat_list = [token for sequence in sequences for token in sequence]

    instance = pad_sequences([flat_list], padding='post', maxlen=maxlen)
    prediction = model.predict(instance)
    # assumes the model has a single output unit, i.e. prediction is (1, 1)
    score = float(prediction[0][0])

    comment = 'meh'
    if score > 0.85:
        comment = 'very good'
    elif score > 0.75:
        comment = 'good'
    elif score > 0.50:
        comment = 'moderate'
    return prediction,comment

In [None]:
# Try the classifier on a mildly negative sentence.
test1 = "I simply don't like this film."
print ( sentiment(test1))

In [None]:
# Try the classifier on a strongly negative sentence.
test1 = "I hate this film."
print ( sentiment(test1))

# Weiterführende Schritte


Stimmungsanalyse für Deutsch [https://machine-learning-blog.de/2019/06/03/stimmungsanalyse-sentiment-analysis-auf-deutsch-mit-python/](https://machine-learning-blog.de/2019/06/03/stimmungsanalyse-sentiment-analysis-auf-deutsch-mit-python/)

Anleitung für den Zugriff auf die Twitter-API [https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/](https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/)

