<a href="https://colab.research.google.com/github/giopnd/notebooks/blob/master/textClsfLSTM1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!pip install tensorflow

import csv

# check english lexicon
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import (
    wordnet,
    stopwords
)

# handle regular expressions
import re

# handle data
import pandas as pd
import numpy as np

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, SpatialDropout1D, Activation;

#import libraries for visualization
import matplotlib.pyplot as plt

# Show which TensorFlow release this kernel imported.
print(tf.__version__)

In [0]:
# Start from two empty, independent containers: `articles` and `labels`.
articles, labels = [], []

In [0]:
import os
os.environ['KAGGLE_USERNAME'] = "georgiosgiotis"
os.environ['KAGGLE_KEY'] = "78e14d9a6090bb989f7240761e76185b"
# Colab library to upload files to notebook
from google.colab import files
# Install Kaggle library
!pip install -q kaggle
# Downlaod data
!kaggle datasets download -d kazanova/sentiment140
# unzip
!unzip "sentiment140.zip"

In [0]:
# Read the tweets: column 0 holds the polarity, column 5 the tweet text.
# The lists are rebuilt here so that re-running this cell does not append the
# whole file a second time.
articles = []
labels = []
# newline='' is what the csv module asks for. latin1 maps every byte to a
# character, so decoding cannot fail and there is no error to swallow.
with open("training.1600000.processed.noemoticon.csv", 'r', encoding="latin1", newline='') as csvfile:
  reader = csv.reader(csvfile, delimiter=',')
  # The Sentiment140 dump ships without a header row, so no row is skipped.
  # (If a header were present, int() below would fail loudly on it.)
  for row in reader:
    # polarity 0 / 2 / 4 -> integer class index 0 / 1 / 2
    labels.append(int(row[0]) // 2)
    articles.append(row[5])

print(len(articles))
print(len(labels))

In [0]:
# Keep one sample out of every `step`. The same stride is applied to texts and
# labels so the two frames stay row-aligned.
step = 4
articles_df = pd.DataFrame(articles[::step])
labels_df = pd.DataFrame(labels[::step])

print(len(articles_df), len(labels_df), sep='\n')

In [0]:
# data cleaning
def preprocessing_text(df):
  """Clean the text stored in column ``0`` of ``df``.

  Steps, in order: lowercase, drop the standalone retweet marker ``rt``,
  drop ``@mentions``, drop ``http...`` / ``www....`` links, drop digits and
  drop punctuation (the apostrophe and the backslash are kept).

  Args:
    df: DataFrame whose column ``0`` contains strings.

  Returns:
    The same ``df`` object; column ``0`` is overwritten in place.
  """
  text = df[0].str.lower()
  # remove retweets: only the whole token "rt". A plain substring replace
  # also mangled ordinary words ("party" -> "pay", "start" -> "sta").
  text = text.replace(r'\brt\b', '', regex=True)
  # remove mentions
  text = text.replace(r'@\w+', '', regex=True)
  # remove links (the dot after "www" is escaped so it only matches a dot)
  text = text.replace(r'http\S+', '', regex=True)
  text = text.replace(r'www\.[^ ]+', '', regex=True)
  # remove numbers
  text = text.replace(r'[0-9]+', '', regex=True)
  # remove special characters and punctuation marks; '-' and '[' are escaped
  # so the class no longer depends on an accidental ',-.' range
  text = text.replace(r'[!"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]', '', regex=True)
  df[0] = text
  return df

# Clean the tweet column, then preview the first ten rows.
articles_df = preprocessing_text(articles_df)
print(articles_df.head(10))

In [0]:
# Fetch the WordNet corpus queried through `wordnet.synsets` in this notebook.
nltk.download('wordnet')

# replace elongated words
def in_dict(word):
  """Return True when WordNet has at least one synset for ``word``.

  Always returns a real bool. Unknown words yield ``False`` rather than an
  implicit ``None``.
  """
  return bool(wordnet.synsets(word))

def replace_elongated_word(word):
  """Shrink repeated runs in ``word`` until it is a dictionary word.

  Each pass removes one copy of a doubled substring. The loop stops as soon
  as ``in_dict`` accepts the candidate or a pass changes nothing, and the
  candidate reached at that point is returned.
  """
  candidate = word
  while not in_dict(candidate):
    shortened = re.sub(r'(\w*)(\w+)\2(\w*)', r'\1\2\3', candidate)
    if shortened == candidate:
      # nothing left to collapse
      break
    candidate = shortened
  return candidate

def detect_elongated_words(row):
  """Replace the elongated words of ``row`` with their shortened form.

  Every word containing a doubled substring is a candidate. Candidates that
  ``in_dict`` rejects are rewritten with ``replace_elongated_word``.

  Args:
    row: text of one tweet.

  Returns:
    ``row`` with the rewritten words substituted.
  """
  regexrep = r'(\w*)(\w+)(\2)(\w*)'
  # dict.fromkeys de-duplicates while keeping first-seen order, so a word
  # repeated in the tweet is only processed once.
  candidates = dict.fromkeys(''.join(groups) for groups in re.findall(regexrep, row))
  for word in candidates:
    if in_dict(word):
      continue
    replacement = replace_elongated_word(word)
    if replacement != word:
      # Anchor on word boundaries: an unanchored substitution also rewrote
      # the word inside longer words ("helloo" inside "hellooo"), leaving
      # those only partially shortened.
      row = re.sub(r'\b' + re.escape(word) + r'\b', replacement, row)
  return row

# Shorten elongated words in every tweet, then preview the first ten rows.
articles_df[0] = articles_df[0].apply(detect_elongated_words)
print(articles_df.head(10))

In [0]:
# NLTK resources for the negation handling below, which calls `word_tokenize`
# and `nltk.pos_tag` (presumably backed by these two packages; confirm against
# the installed NLTK version).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# handle negation with antonyms
def replace_antonyms(word):
  """Return the first WordNet antonym found for ``word``.

  Synsets and their lemmas are scanned in the order WordNet returns them.
  The name of the first antonym of the first lemma that has any is returned.
  ``word`` itself is returned when no lemma has an antonym.
  """
  every_lemma = (lemma
                 for synset in wordnet.synsets(word)
                 for lemma in synset.lemmas())
  for lemma in every_lemma:
    opposites = lemma.antonyms()
    if opposites:
      return opposites[0].name()
  return word

def handling_negation(row):
  """Rewrite the first negation of ``row`` as an antonym.

  The first ``not`` / ``n't`` token is located. The first adjective, noun or
  verb that follows it is looked up with ``replace_antonyms``. When an
  antonym exists, the negation token is removed and the word is replaced by
  its antonym ("not happy" -> "unhappy"). Every other token is kept, in order.

  The tokens are returned unchanged (joined with single spaces) when there is
  no negation, no candidate word after it, or no antonym for the candidate.
  Earlier behaviour differed in three ways that this version corrects:
    * the rest of the tweet was discarded when no candidate word was found;
    * the token following the replaced word was silently dropped;
    * the negation was kept in front of the antonym, inverting the meaning.

  Args:
    row: text of one tweet.

  Returns:
    The rewritten tweet as a space-joined string of tokens.
  """
  negations = ("n't", "not")
  speech_tags = ['JJ', 'JJR', 'JJS', 'NN', 'VB', 'VBD', 'VBG', 'VBN', 'VBP']
  words = word_tokenize(row)
  neg_index = next((i for i, token in enumerate(words) if token in negations), None)
  if neg_index is None:
    # No negation: skip the (expensive) POS tagging altogether.
    return ' '.join(words)
  # obtain the type of words
  tags = nltk.pos_tag(words)
  # Only tokens after the negation are candidates for replacement.
  for position in range(neg_index + 1, len(tags)):
    word, tag = tags[position]
    if tag in speech_tags:
      antonym = replace_antonyms(word)
      if antonym != word:
        words = (words[:neg_index]                 # text before the negation
                 + words[neg_index + 1:position]   # modifiers in between
                 + [antonym]
                 + words[position + 1:])           # remainder of the tweet
      break
  return ' '.join(words)

# Run the negation rewrite over every tweet.
articles_df[0] = articles_df[0].apply(handling_negation)

In [0]:
# Fetch the stop-word lists read through `stopwords.words('english')`.
nltk.download('stopwords')

# remove stop words
def stop_words(df):
  """Remove English stop words from the text in column ``0`` of ``df``.

  Each text is split on whitespace, tokens found in NLTK's English stop-word
  list are dropped, and the remaining tokens are re-joined with single spaces.

  Args:
    df: DataFrame whose column ``0`` contains strings.

  Returns:
    The same ``df`` object; column ``0`` is overwritten in place.
  """
  # A set gives constant-time membership tests, where the list was scanned
  # once per token of every tweet.
  stop_words_set = frozenset(stopwords.words('english'))
  df[0] = df[0].apply(
      lambda text: ' '.join(word for word in text.split() if word not in stop_words_set))
  return df

# Strip stop words from the tweet column, then preview the first ten rows.
articles_df = stop_words(articles_df)
print(articles_df.head(10))

In [0]:
# pretty print df
# `DataFrame.to_string()` does not honour `display.max_rows`: without an
# explicit `max_rows` it renders every row of the frame. The cap is therefore
# passed to `to_string` itself so the cell prints a truncated view.
with pd.option_context('display.max_columns', None, 'display.max_colwidth', 2000):
  print(articles_df.to_string(max_rows=10))

In [0]:
# split data into training and test dataset
def split(dfd, dfl, test_size=0.2, random_state=None):
  """Shuffle and split samples and labels into train and test parts.

  Args:
    dfd: samples (e.g. the text column).
    dfl: labels aligned with ``dfd``.
    test_size: share of the data held out for testing (default 0.2, the
      value that used to be hardcoded).
    random_state: optional seed forwarded to ``train_test_split``. Pass an
      int to get the same split on every run. ``None`` keeps the previous
      unseeded behaviour.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test)``.
  """
  return tuple(train_test_split(dfd, dfl, test_size=test_size, shuffle=True,
                                random_state=random_state))

# Split the tweet column and the label column into train and test parts.
x_train, x_test, y_train, y_test = split(articles_df[0], labels_df[0])

# size of the held-out part
print(x_test.shape[0])

In [0]:
# convert the collection of tweets to a matrix of tf/idf features
def tokenize_tweets(dataset, features, vectorizer=None):
  """Turn a collection of tweets into a dense TF-IDF matrix.

  Args:
    dataset: iterable of tweet strings.
    features: maximum vocabulary size, used only when a new vectorizer is
      created.
    vectorizer: optional, already fitted ``TfidfVectorizer``. When given it
      is used as-is and not refitted, so ``dataset`` is projected onto the
      vocabulary and IDF weights it was fitted with. Use this for
      validation or test data so the columns match the training matrix.
      When omitted, a new vectorizer is fitted on ``dataset`` itself (the
      original behaviour).

  Returns:
    Dense array with one row per tweet and one column per vocabulary term.
  """
  if vectorizer is None:
    vectorizer = TfidfVectorizer(max_features=features)
    vectorizer.fit(dataset)
  return vectorizer.transform(dataset).toarray()

In [0]:
# create the neural network model
def train(x_train_mod, y_train, features, shuffle, drop, layer1_input, layer2_input, epochs, lr, epsilon, validation):
  """Build, compile and fit the dense classifier.

  Architecture: Dense(relu) -> Dropout -> Dense(sigmoid) -> Dropout ->
  Dense(3, softmax), trained with Adam on sparse categorical cross-entropy.

  Args:
    x_train_mod: feature matrix with ``features`` columns.
    y_train: class index per row of ``x_train_mod``.
    features: width of the input layer.
    shuffle: forwarded to ``fit`` (shuffle the training data every epoch).
    drop: dropout rate applied after both hidden layers.
    layer1_input: number of units in the first hidden layer.
    layer2_input: number of units in the second hidden layer.
    epochs: number of training epochs.
    lr: Adam learning rate.
    epsilon: Adam epsilon; ``None`` means "use the optimizer's default".
    validation: share of the training data held out by ``fit`` for validation.

  Returns:
    The fitted Keras model.
  """
  model_nn = Sequential()
  model_nn.add(Dense(layer1_input, input_shape=(features,), activation='relu'))
  model_nn.add(Dropout(drop))
  model_nn.add(Dense(layer2_input, activation='sigmoid'))
  model_nn.add(Dropout(drop))
  model_nn.add(Dense(3, activation='softmax'))

  # `learning_rate` is the supported spelling; `lr` is a deprecated alias and
  # `decay` is rejected by recent optimizers (0.0 was a no-op anyway).
  adam_options = dict(learning_rate=lr, beta_1=0.9, beta_2=0.999, amsgrad=False)
  if epsilon is not None:
    # Only forward an explicit epsilon; leaving it out selects the default
    # instead of relying on the optimizer accepting None.
    adam_options['epsilon'] = epsilon
  optimizer = keras.optimizers.Adam(**adam_options)
  model_nn.compile(loss='sparse_categorical_crossentropy',
                   optimizer=optimizer,
                   metrics=['accuracy'])
  model_nn.fit(np.array(x_train_mod), y_train,
               batch_size=32,
               epochs=epochs,
               verbose=1,
               validation_split=validation,
               shuffle=shuffle)
  return model_nn

In [22]:
def model1(x_train, y_train):
  """Vectorise ``x_train`` and fit the first model configuration.

  Fixed settings: 3500 TF-IDF features, hidden layers of 512 and 256 units,
  dropout 0.5, 6 epochs, learning rate 0.001, default epsilon, 10% of the
  training data used for validation, shuffling enabled.

  Returns:
    The model returned by ``train``.
  """
  features = 3500
  x_train_mod = tokenize_tweets(x_train, features)
  return train(
      x_train_mod,
      y_train,
      features,
      shuffle=True,
      drop=0.5,
      layer1_input=512,
      layer2_input=256,
      epochs=6,
      lr=0.001,
      epsilon=None,
      validation=0.1,
  )

# Vectorise the training tweets and fit the model (see `model1` for the settings).
model = model1(x_train, y_train)

Train on 288000 samples, validate on 32000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [0]:
# Print the model summary. The `to_json()` result is the last expression of
# the cell, so the notebook shows it as the cell output.
model.summary()
model.to_json()

In [0]:
# Re-vectorise the training tweets and predict class probabilities for them.
x_new = tokenize_tweets(x_train, 3500)
new_prediction = model.predict(x_new)

# Named `sentiment_names` on purpose: rebinding the global `labels` here would
# overwrite the list of training labels and break a re-run of the cells that
# build `labels_df` from it.
sentiment_names = ['Negative', 'Neutral', 'Positive']
# most probable class per tweet
class_ids = np.argmax(new_prediction, axis=1)
sentiments = [sentiment_names[class_id] for class_id in class_ids]

# number of tweets per predicted class, in the order of `sentiment_names`
sizes = np.bincount(class_ids, minlength=len(sentiment_names))
explode = (0, 0, 0.1)
plt.figure(figsize=(5,5))
plt.pie(sizes, explode=explode, colors=['b', 'w', 'r'], labels=sentiment_names, autopct='%1.1f%%',
        shadow=True, startangle=90, wedgeprops={'alpha':0.8})
plt.title('Predicted sentiment of the training tweets')
plt.axis('equal')
plt.show()

In [0]:
# The test tweets have to be projected with the vocabulary and IDF weights
# learned from the training tweets. `tokenize_tweets(x_test, 3500)` fits a new
# vectorizer on x_test, so its columns do not correspond to the features the
# model was trained on. Fitting on x_train with the same `max_features`
# reproduces the training feature space.
train_vectorizer = TfidfVectorizer(max_features=3500).fit(x_train)
print(model.predict(train_vectorizer.transform(x_test).toarray()))