<a href="https://colab.research.google.com/github/giopnd/notebooks/blob/master/sentiment11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

pd.options.mode.chained_assignment = None
import numpy as np
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec  # the word2vec model gensim class

LabeledSentence = gensim.models.doc2vec.LabeledSentence

from tqdm import tqdm

tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
import os
os.environ['KAGGLE_USERNAME'] = "georgiosgiotis"
os.environ['KAGGLE_KEY'] = "78e14d9a6090bb989f7240761e76185b"
# Colab library to upload files to notebook
from google.colab import files
# Install Kaggle library
!pip install -q kaggle
# Downlaod data
!kaggle datasets download -d kazanova/sentiment140
# unzip
!unzip "sentiment140.zip"

In [5]:
def load_dataset(path='./training.1600000.processed.noemoticon.csv'):
  """Load the Sentiment140 training CSV as a two-column DataFrame.

  Only CSV columns 0 and 5 are read. The file is parsed without a header
  row, so the columns keep their integer positions (0 and 5) as names.
  Judging by how the rest of the notebook uses them, column 0 is the
  sentiment label and column 5 the tweet text.

  Args:
    path: location of the CSV file. Defaults to the file name the notebook
      expects in the working directory.

  Returns:
    DataFrame with columns 0 and 5, rows with a missing value in either
    column removed, column 0 converted with int(value) / 4, and a fresh
    0..n-1 index.
  """
  data = pd.read_csv(path, encoding='latin-1', usecols=[0, 5], header=None)
  # Drop rows that lack either of the two columns.
  data = data.dropna(subset=[0, 5])
  # int(label) / 4: a raw label of 4 becomes 1.0 and 0 stays 0.0.
  data[0] = data[0].map(int) / 4
  data = data.reset_index(drop=True)
  print('dataset loaded with shape', data.shape)
  return data


# Load the training CSV into the notebook-level `data` frame.
data = load_dataset()

dataset loaded with shape (1600000, 2)


In [0]:
def tokenize(tweet):
  """Split a tweet into tokens, dropping mentions, hashtags and links.

  Args:
    tweet: tweet text, handed to the module-level ``tokenizer``.

  Returns:
    List of tokens that do not start with '@', '#' or 'http'. If
    tokenizing or filtering raises (e.g. the input is not a string), the
    sentinel string 'NC' is returned so the caller can filter the row out.
  """
  try:
    tokens = tokenizer.tokenize(tweet)
    # One pass with a tuple of prefixes replaces three separate filters.
    return [t for t in tokens if not t.startswith(('@', '#', 'http'))]
  except Exception:
    # Exception (not a bare except) so that KeyboardInterrupt / SystemExit
    # still stop a long-running map instead of being recorded as 'NC'.
    return 'NC'


def postprocess(data, n=1600000):
  """Tokenize the tweet text and drop rows that could not be tokenized.

  Args:
    data: DataFrame whose column 5 holds the raw tweet text.
    n: maximum number of leading rows to process. The default equals the
      1,600,000 rows reported when the dataset is loaded.

  Returns:
    A new DataFrame (the input is left unmodified) limited to the first
    ``n`` rows, with an added 'tokens' column, without the rows for which
    tokenize() returned the 'NC' sentinel, and with a fresh 0..k-1 index.
  """
  # Copy the first n rows so the caller's frame is not mutated and
  # re-running the cell starts from the same input.
  data = data.head(n).copy()
  # progress_map is the tqdm-enabled Series.map registered by tqdm.pandas().
  data['tokens'] = data[5].progress_map(tokenize)
  # Token lists never compare equal to the string 'NC', so this keeps every
  # successfully tokenized row.
  keep = data['tokens'].map(lambda toks: toks != 'NC').astype(bool)
  data = data[keep].reset_index(drop=True)
  return data


# Add the 'tokens' column and drop rows whose tokenization failed; rebinds `data`.
data = postprocess(data)

In [0]:
# Hold out 20% of the tweets for evaluation. The fixed random_state makes the
# shuffle deterministic, so Restart & Run All reproduces the same partition.
x_train, x_test, y_train, y_test = train_test_split(np.array(data['tokens']),
                                                    np.array(data[0]),
                                                    test_size=0.2, shuffle=True,
                                                    random_state=42)

In [0]:
def labelizeTweets(tweets, label_type):
  """Wrap each token list in a LabeledSentence tagged '<label_type>_<index>'.

  Args:
    tweets: iterable of token lists, one per tweet.
    label_type: prefix for the tag, e.g. 'TRAIN' or 'TEST'.

  Returns:
    List of LabeledSentence objects in input order; item i is built from
    tweet i and the single-element tag list ['<label_type>_<i>'].
  """
  # tqdm wraps the sequence itself rather than the enumerate() generator, so
  # the bar can read the length and show a total instead of a bare counter.
  return [LabeledSentence(words, ['%s_%s' % (label_type, i)])
          for i, words in enumerate(tqdm(tweets))]

# Rebind the split arrays to lists of LabeledSentence objects; later cells read
# the token lists back through each item's `.words` attribute.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

In [0]:
import logging
# INFO-level logging with a timestamp — presumably so gensim's training
# progress is shown while the model is fitted.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

# Fit word embeddings on the training tweets only. size=200 has to agree with
# the n_dim (200) used when tweet vectors are built from these embeddings.
# NOTE(review): later cells rebind the name `model` to a Keras network, while
# buildWordVector reads this Word2Vec model through the same global name.
model = Word2Vec([x.words for x in x_train], min_count=1,size= 200,workers=3, window =3, sg = 1)

In [0]:
# Sanity check of the embeddings: words most similar to 'facebook'.
print(model.wv.most_similar('facebook'))

In [0]:
# The documents are already token lists, so the analyzer is the identity
# function; min_df=10 leaves rarely seen tokens out of the vocabulary.
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever the installed version provides.
if hasattr(vectorizer, 'get_feature_names_out'):
  feature_names = vectorizer.get_feature_names_out()
else:
  feature_names = vectorizer.get_feature_names()
# Map every vocabulary token to its inverse document frequency weight.
tfidf = dict(zip(feature_names, vectorizer.idf_))
print('vocab size :', len(tfidf))

def buildWordVector(tokens, size):
    """Average the TF-IDF-weighted word vectors of a tweet.

    Reads two notebook-level globals: ``model`` (the trained Word2Vec model)
    and ``tfidf`` (token -> idf weight). It must therefore be called while
    ``model`` still refers to the Word2Vec model; later cells rebind that
    name to a Keras network.

    Args:
        tokens: iterable of token strings.
        size: dimensionality of the word vectors.

    Returns:
        Array of shape (1, size): the sum of vector * idf over the tokens
        present in both the embedding vocabulary and ``tfidf``, divided by
        the number of such tokens. All zeros when no token qualifies.
    """
    vec = np.zeros((1, size))
    count = 0.
    for word in tokens:
        try:
            # Look the vector up through .wv, like the most_similar() call
            # elsewhere in the notebook, instead of the deprecated model[word].
            vec += model.wv[word].reshape((1, size)) * tfidf[word]
        except KeyError:
            # Token missing from the embeddings or from the TF-IDF
            # vocabulary (e.g. unseen test words): skip it.
            continue
        count += 1.
    if count != 0:
        vec /= count
    return vec

from sklearn.preprocessing import scale, StandardScaler

n_dim=200
# One (1, n_dim) averaged vector per tweet, stacked into an (n_tweets, n_dim) matrix.
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in map(lambda x: x.words, x_train)])
test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in map(lambda x: x.words, x_test)])

# Standardise both splits with the mean/variance of the TRAINING set. Scaling
# the test set with its own statistics would put train and test features on
# different scales and let test-set information shape the features.
scaler = StandardScaler().fit(train_vecs_w2v)
train_vecs_w2v = scaler.transform(train_vecs_w2v)
test_vecs_w2v = scaler.transform(test_vecs_w2v)

In [0]:
def train(x_train_mod, y_train, features, shuffle, drop, layer1_input, layer2_input, epochs, lr, epsilon, validation):
  """Build, compile and fit a dense classifier with two hidden layers.

  Architecture: Dense(layer1_input, relu) -> Dropout(drop) ->
  Dense(layer2_input, sigmoid) -> Dropout(drop) -> Dense(2, softmax),
  trained with Adam on sparse categorical cross-entropy.

  Args:
    x_train_mod: training features; converted with np.array before fitting.
    y_train: training labels.
    features: width of one input row (input_shape of the first layer).
    shuffle: forwarded to fit(shuffle=...).
    drop: dropout rate used after both hidden layers.
    layer1_input: number of units in the first hidden layer.
    layer2_input: number of units in the second hidden layer.
    epochs: number of training epochs.
    lr: Adam learning rate.
    epsilon: Adam epsilon.
    validation: forwarded to fit(validation_split=...).

  Returns:
    The fitted Sequential model.
  """
  stack = [
      Dense(layer1_input, input_shape=(features,), activation='relu'),
      Dropout(drop),
      Dense(layer2_input, activation='sigmoid'),
      Dropout(drop),
      Dense(2, activation='softmax'),
  ]
  model_nn = Sequential(stack)

  adam = keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=epsilon, decay=0.0, amsgrad=False)
  model_nn.compile(optimizer=adam,
                   metrics=['accuracy'],
                   loss='sparse_categorical_crossentropy')

  inputs = np.array(x_train_mod)
  fit_options = dict(batch_size=32, epochs=epochs, verbose=1,
                     validation_split=validation, shuffle=shuffle)
  model_nn.fit(inputs, y_train, **fit_options)
  return model_nn

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, SpatialDropout1D, Activation;

def model1(x_train, y_train):
  """Train the two-hidden-layer classifier with a fixed set of hyperparameters.

  Args:
    x_train: 2-D feature matrix, one row per sample.
    y_train: labels aligned with the rows of ``x_train``.

  Returns:
    The fitted model returned by train().
  """
  # The input width comes from the data actually passed in. The previous
  # hard-coded 2000 did not match the 200-dimensional tweet vectors, and the
  # function ignored its x_train argument in favour of a global.
  features = np.shape(x_train)[1]
  shuffle = True
  drop = 0.5
  layer1_input = 256
  layer2_input = 128
  epochs = 10
  lr = 0.001
  epsilon = None
  validation = 0.1
  return train(x_train, y_train, features, shuffle, drop, layer1_input, layer2_input, epochs, lr, epsilon, validation)

# NOTE(review): this rebinds the global `model`, which until now referred to
# the Word2Vec model that buildWordVector reads; re-running the feature cells
# after this line would look words up on the Keras network instead.
model = model1(train_vecs_w2v, y_train)

In [0]:
# Smaller baseline: one 128-unit ReLU hidden layer and a single sigmoid output
# trained with binary cross-entropy. input_dim=200 matches n_dim, the width of
# the tweet vectors. Rebinds `model` again, replacing the network from model1.
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=3, batch_size=32, verbose=2)

In [0]:
# Evaluate the most recently fitted `model` on the held-out split.
# score[1] is presumably the accuracy, the only metric passed to compile() — TODO confirm.
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print(score[1])

In [0]:
# Number of feature rows (tweets) in the train and test matrices.
print(len(train_vecs_w2v))
print(len(test_vecs_w2v))