In [None]:
%matplotlib inline

import pandas as pd
import numpy as np
import nltk
import sys

In [None]:
df1 = pd.read_csv('data/tweetset.csv', encoding="windows-1254")
df1.head()

In [None]:
df1.shape

In [None]:
df2 = pd.read_csv('data/TurkishTweets.csv')
df2.head()

# Prepare & Concatenate Datasets

In [None]:
df2.shape

In [None]:
df1.isnull().sum()

### Drop the unnecessary columns in the first dataset

In [None]:
df1 = df1.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5'], axis = 1)
df1.isnull().sum()

In [None]:
df1.shape

In [None]:
df2.isnull().sum()

In [None]:
df2 = df2.dropna()
df2.isnull().sum()

### Change labels in datasets

In [None]:
df2['Etiket'].unique()

In [None]:
# Collapse the five Turkish emotion labels into two sentiment classes.
# The result is assigned back to the column rather than using inplace=True on
# df2["Etiket"]: that form is chained assignment, which pandas warns about and
# which does not modify df2 when Copy-on-Write is active.
emotion_to_sentiment = {"kızgın": "Negative", "korku": "Negative", "mutlu": "Positive",
                        "surpriz": "Positive", "üzgün": "Negative"}
df2["Etiket"] = df2["Etiket"].replace(emotion_to_sentiment)
df2['Etiket'].unique()

In [None]:
df2.columns = ['Tweets', 'Sentiment']
df2.head(5)

In [None]:
# Translate the Turkish labels to English; " Negatif" (with a leading space)
# is a variant spelling mapped by the same table.
# Assign back instead of inplace=True on a column selection, which is chained
# assignment and does not modify df1 when Copy-on-Write is active.
df1["Tip"] = df1["Tip"].replace({"Pozitif": "Positive", " Negatif": "Negative", "Negatif": "Negative" })
df1['Tip'].unique()

In [None]:
df1.isnull().sum()
df1.head(5)

In [None]:
df1 = df1.reindex(columns=['Paylaşım','Tip'])
df1.head()

In [None]:
df1.columns = ['Tweets', 'Sentiment']
df1.head(5)

### Concatenate datasets

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames. ignore_index=True renumbers the rows 0..n-1.
df = pd.concat([df1, df2], ignore_index=True)
df.head(5)

In [None]:
df.shape

# Text Preprocessing

In [None]:
import re

### Convert to lower case

In [None]:
# str.lower() uses language-neutral case mapping: it turns 'I' into 'i'
# (Turkish expects dotless 'ı') and 'İ' into 'i' plus a combining dot.
# Map the two Turkish capitals explicitly before lower-casing the rest.
df['Tweets'] = [
    tweet.replace('İ', 'i').replace('I', 'ı').lower()
    for tweet in df['Tweets']
]
df.head(5)

### Remove @ mentions and hyperlinks

In [None]:
found = df[df['Tweets'].str.contains('@')]
found.count()

In [None]:
df.info()

In [None]:
# Remove @mentions. The character class includes '_' so handles such as
# @user_name are removed whole instead of leaving "_name" behind. A single
# pass is enough; the original applied the identical pattern twice.
df['Tweets'] = df['Tweets'].replace(r'@[A-Za-z0-9_]+', '', regex=True)
# Rows still containing '@' after the clean-up (e.g. a bare '@').
found = df[df['Tweets'].str.contains('@')]
found.count()

In [None]:
found = df[df['Tweets'].str.contains('http')]
found.count()

In [None]:
# Strip hyperlinks: first anything starting with "http", then anything
# starting with "www", each up to the next whitespace.
tweets_without_links = df['Tweets']
for link_pattern in (r'http\S+', r'www\S+'):
    tweets_without_links = tweets_without_links.replace(link_pattern, '', regex=True)
df['Tweets'] = tweets_without_links

# Count the rows that still contain "http" after the clean-up.
found = df[df['Tweets'].str.contains('http')]
found.count()

In [None]:
df.shape

### Remove Punctuation & Emojis & Numbers

In [None]:
# Delete every run of ASCII digits from each tweet.
# (The original kept a counter `i` that was never read; it is dropped.)
sentences = df['Tweets'].copy()
new_sent = [re.sub('[0-9]+', '', sentence) for sentence in sentences]

df['Tweets'] = new_sent
df['Tweets'].head(5)

In [None]:
import string

# Translation table that deletes every character in string.punctuation.
table = str.maketrans(dict.fromkeys(string.punctuation))
sentences = df['Tweets'].copy()
# Split each tweet on whitespace and strip punctuation from every word.
# A word made only of punctuation becomes '' and is kept at this stage.
new_sent = [
    [word.translate(table) for word in sentence.split()]
    for sentence in sentences
]

In [None]:
df['Tweets'] = new_sent
df['Tweets'].head(5)

# Zemberek-NLP

## Tokenization

In [None]:
import time
import logging

from zemberek import (
    TurkishSpellChecker,
    TurkishSentenceNormalizer,
    TurkishSentenceExtractor,
    TurkishMorphology,
    TurkishTokenizer
)

logger = logging.getLogger(__name__)

morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)
extractor = TurkishSentenceExtractor()

### Sentence Normalization

In [None]:
def normalize_long_text(text):
    """Normalize every element of ``text`` and join the results with spaces.

    ``text`` is iterated element by element; each element is passed to the
    module-level ``normalizer.normalize`` and the outputs are joined into a
    single string separated by one space.
    """
    return " ".join(normalizer.normalize(piece) for piece in text)

In [None]:
sentences = df['Tweets'].copy()
start = time.time()

new_sent = []
for token in sentences:
    # Each entry is a list of words; drop the empty strings left behind by
    # punctuation stripping before handing the words to the normalizer.
    if '' in token:
        token = [word for word in token if word != '']
    new_sent.append(normalize_long_text(token))

logger.info(f"Sentences normalized in: {time.time() - start} s")

### Stopwords

In [None]:
from nltk.corpus import stopwords
import re

stops = set(stopwords.words('turkish'))
print(stops)

In [None]:
# Break every normalized sentence back into a list of whitespace-separated words.
splitted_words = [sent.split() for sent in new_sent]

In [None]:
# Keep only the words that are not in the stop-word set.
clean_sent = [
    [word for word in sentence if word not in stops]
    for sentence in splitted_words
]

### Lemmatization

In [None]:
# Remove straight and typographic quote characters from every word, in place.
for token in clean_sent:
    for j, word in enumerate(token):
        token[j] = word.replace('"', '').replace("’", '').replace("'", '').replace("”", '')

In [None]:
import zeyrek

analyzer = zeyrek.MorphAnalyzer()

lem_sent = []
for sent in clean_sent:
    # Empty strings are skipped. For every other word, element [0][1][0] of
    # the lemmatize() result is kept -- presumably the first lemma candidate
    # of the first analysis; confirm against the zeyrek documentation.
    lem_sent.append(
        [analyzer.lemmatize(word)[0][1][0] for word in sent if word != '']
    )

In [None]:
# Lower-case every lemma. The copy is shallow, so the inner token lists are
# the same objects as in lem_sent and are updated in place.
x = lem_sent.copy()
for sent in x:
    sent[:] = [token.lower() for token in sent]
lem_sent = x

In [None]:
# Every entry of lem_sent is a list of tokens, so ''.__ne__(entry) returns
# NotImplemented for each of them: the original filter() never removed a row
# and only worked because NotImplemented was truthy, which newer Python
# versions deprecate. A plain copy gives the same result without that
# dependency. Rows must not be dropped here anyway, because lem_sent is
# assigned back to df['Tweets'] row by row afterwards.
lem_sent = list(lem_sent)

In [None]:
df['Tweets'] = lem_sent
df['Tweets'].head(5)

In [None]:
df['Tweets'] = df.Tweets.apply(' '.join)

### Remove Rare Words

In [None]:
# Count how often each word occurs across all tweets.
freq = pd.Series(' '.join(df['Tweets']).split()).value_counts()
# Words that occur exactly once. Stored as a set so the per-word membership
# test used when filtering tweets is O(1) instead of a scan over a long list.
less_freq = set(freq[freq == 1].index)

In [None]:
# Rebuild each tweet from the words that are not in less_freq.
df['Tweets'] = df['Tweets'].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in less_freq)
)
df['Tweets'].head(5)

# Data Visualization

### Positive Negative Balance

In [None]:
from textblob import TextBlob
import tweepy

In [None]:
# Split the frame by sentiment label and report absolute and relative sizes.
tweet_list = df['Tweets']
is_negative = df['Sentiment'] == "Negative"
is_positive = df['Sentiment'] == "Positive"
negative_list = df[is_negative]
positive_list = df[is_positive]

for caption, count in (("total number: ", len(tweet_list)),
                       ("positive number: ", len(positive_list)),
                       ("negative number: ", len(negative_list))):
    print(caption, count)

print()

for caption, subset in (("% of positive: ", positive_list),
                        ("% of negative: ", negative_list)):
    print(caption, 100*len(subset)/len(tweet_list))

In [None]:
import matplotlib.pyplot as plt

# Whole-number percentages (truncated by int(), so they may sum to 99).
total_tweets = len(tweet_list)
positive = int(100*len(positive_list)/total_tweets)
negative = int(100*len(negative_list)/total_tweets)

labels = ['Positive [{}%]'.format(positive), 'Negative [{}%]'.format(negative)]
sizes = [positive, negative]
colors = ['yellowgreen', 'blue']

# Pie of the two shares; the legend carries the labels instead of the wedges.
patches, texts = plt.pie(sizes, colors=colors, startangle=90, radius=50)
plt.style.use('default')
plt.legend(labels)
plt.title("Sentiment Analysis Result")
plt.axis("equal")
plt.show()

### Visualizing Top Words

The distribution of top unigrams after removing stop words.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent unigrams of ``corpus``.

    A ``CountVectorizer`` with default settings is fitted on ``corpus``; the
    per-document counts are summed per term and the (term, total) pairs are
    returned sorted by total, highest first. ``n=None`` returns all pairs.
    """
    vectorizer = CountVectorizer().fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
common_words = get_top_n_words(df['Tweets'], 20)
common_df = pd.DataFrame(common_words, columns = ['Tweets', 'count'])
common_df.head()

In [None]:
common_df.groupby('Tweets').sum()['count'].sort_values(ascending=False).plot(
    kind='bar',
    figsize=(8, 6),
    xlabel = "Top Words",
    ylabel = "Count",
    title = "Bar Chart of Top Words Frequency")

### Visualizing N-Grams

### Top Bigrams

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams of ``corpus``.

    Same procedure as the unigram counter, but the ``CountVectorizer`` is
    restricted to ``ngram_range=(2, 2)``. The result is a list of
    (bigram, total count) pairs sorted by count, highest first; ``n=None``
    returns all pairs.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    pairs = [(bigram, totals[0, column])
             for bigram, column in vectorizer.vocabulary_.items()]
    pairs = sorted(pairs, key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

common_words2 = get_top_n_bigram(df['Tweets'], 30)

In [None]:
top_bigram = pd.DataFrame(common_words2, columns=['Tweets', "Count"])
top_bigram.head()

In [None]:
top_bigram.groupby('Tweets').sum()['Count'].sort_values(ascending=False).plot(
    kind='bar',
    figsize=(8,6),
    xlabel = "Bigram Words",
    ylabel = "Count",
    title = "Bar chart of Bigrams Frequency")

### Top Trigrams

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams of ``corpus``.

    A ``CountVectorizer`` limited to ``ngram_range=(3, 3)`` is fitted on
    ``corpus``; counts are summed over all documents and returned as
    (trigram, total count) pairs sorted by count, highest first. ``n=None``
    returns all pairs.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    totals = vectorizer.transform(corpus).sum(axis=0)
    pairs = [(trigram, totals[0, column])
             for trigram, column in vectorizer.vocabulary_.items()]
    pairs = sorted(pairs, key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

common_words3 = get_top_n_trigram(df['Tweets'], 30)
top_trigram = pd.DataFrame(common_words3, columns = ['Tweets' , 'Count'])
top_trigram.head(5)

In [None]:
top_trigram.groupby('Tweets').sum()['Count'].sort_values(ascending=False).plot(
    kind='bar',
    figsize=(8,6),
    xlabel = "Trigram Words",
    ylabel = "Count",
    title = "Bar chart of Trigrams Frequency")

## WordCloud

In [None]:
from wordcloud import WordCloud, STOPWORDS

def creat_wordcloud(tweets):
    """Draw a word cloud built from every entry of ``tweets``.

    Each entry is converted with ``str()``, split on whitespace and
    lower-cased; all tokens are concatenated into one space-separated text
    that is passed to ``WordCloud.generate``. The stop-word set is the
    ``STOPWORDS`` collection shipped with the wordcloud package. The figure
    is shown with ``plt.show()``; nothing is returned.
    """
    # One lower-cased, single-space-joined chunk per entry, each followed by
    # a space, built with join instead of repeated string concatenation.
    comment_words = "".join(
        " ".join(token.lower() for token in str(val).split()) + " "
        for val in tweets
    )
    stopwords = set(STOPWORDS)

    wordcloud = WordCloud(width = 1200, height = 800,
                    background_color ='white',
                    max_words=3000,
                    stopwords = stopwords,
                    min_font_size = 10,
                    repeat = True).generate(comment_words)

    # Plot the WordCloud image without axes or padding.
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    plt.show()

In [None]:
# All Tweets
creat_wordcloud(df['Tweets'].values)

In [None]:
# Positive Tweets
# Pass only the 'Tweets' column. Passing the filtered frame's .values hands
# creat_wordcloud whole [tweet, sentiment] rows, which it stringifies, so the
# array brackets, quotes and the label itself end up in the cloud.
creat_wordcloud(df.loc[df['Sentiment'] == 'Positive', 'Tweets'].values)

In [None]:
# Negative Tweets
# Pass only the 'Tweets' column. Passing the filtered frame's .values hands
# creat_wordcloud whole [tweet, sentiment] rows, which it stringifies, so the
# array brackets, quotes and the label itself end up in the cloud.
creat_wordcloud(df.loc[df['Sentiment'] == 'Negative', 'Tweets'].values)

# Deep Learning Model

## Preparing Data

In [None]:
import collections

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

In [None]:
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.Tweets, df.Sentiment, test_size=0.1, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

In [None]:
# Cap the vocabulary at NB_WORDS. Without num_words the tokenizer emits an
# index for every word it saw, and any index >= NB_WORDS overflows the
# NB_WORDS-wide one-hot matrix built from these sequences.
tk = Tokenizer(num_words=NB_WORDS)
tk.fit_on_texts(X_train)

print('Fitted tokenizer on {} documents'.format(tk.document_count))
print('{} words in dictionary'.format(tk.num_words))
print('{} distinct words seen in the training texts'.format(len(tk.word_index)))
print('Top 5 most common words are:', collections.Counter(tk.word_counts).most_common(5))

In [None]:
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

# X_train keeps the original row labels after the shuffled split, so
# X_train[0] looks up *label* 0 (which may not be in the training set).
# .iloc[0] is the first training row, the one X_train_seq[0] was built from.
print('"{}" is converted into {}'.format(X_train.iloc[0], X_train_seq[0]))

In [None]:
def one_hot_seq(seqs, nb_features = NB_WORDS):
    """Turn integer sequences into a multi-hot matrix.

    Parameters
    ----------
    seqs : sequence of sequences of int
        One list of word indices per document.
    nb_features : int
        Number of columns of the result.

    Returns
    -------
    numpy.ndarray of shape (len(seqs), nb_features)
        Row ``i`` holds 1.0 in every column listed in ``seqs[i]`` and 0.0
        elsewhere. Indices ``>= nb_features`` are ignored instead of raising
        ``IndexError``, so an uncapped vocabulary is truncated rather than
        crashing.
    """
    ohs = np.zeros((len(seqs), nb_features))
    for i, s in enumerate(seqs):
        # dtype=int keeps an empty sequence usable as an index array.
        columns = np.asarray(s, dtype=int)
        ohs[i, columns[columns < nb_features]] = 1.
    return ohs

X_train_oh = one_hot_seq(X_train_seq)
X_test_oh = one_hot_seq(X_test_seq)

print('"{}" is converted into {}'.format(X_train_seq[0], X_train_oh[0]))
print('For this example we have {} features with a value of 1.'.format(X_train_oh[0].sum()))

In [None]:
# Encode the string labels as integers, then as one-hot vectors.
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

# y_train keeps the original row labels after the shuffled split; .iloc[0]
# is the first training row, matching y_train_le[0]. y_train[0] would look
# up label 0 instead, which may be absent or a different row.
print('"{}" is converted into {}'.format(y_train.iloc[0], y_train_le[0]))
print('"{}" is converted into {}'.format(y_train_le[0], y_train_oh[0]))

In [None]:
X_train_rest, X_valid, y_train_rest, y_valid = train_test_split(X_train_oh, y_train_oh, test_size=0.1, random_state=37)

assert X_valid.shape[0] == y_valid.shape[0]
assert X_train_rest.shape[0] == y_train_rest.shape[0]

print('Shape of validation set:',X_valid.shape)

## Baseline Model

In [None]:
base_model = models.Sequential()
base_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
base_model.add(layers.Dense(64, activation='relu'))
base_model.add(layers.Dense(2, activation='softmax'))
base_model.summary()

In [None]:
NB_START_EPOCHS = 20  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent

def deep_model(model):
    """Compile ``model`` and train it on the notebook's train/validation split.

    The model is compiled with the rmsprop optimizer, categorical
    cross-entropy loss and accuracy as metric, then fitted silently on
    ``X_train_rest`` / ``y_train_rest`` for ``NB_START_EPOCHS`` epochs with
    ``X_valid`` / ``y_valid`` as validation data. Returns the object that
    ``model.fit`` returns.

    NOTE(review): a function with the same name but a different signature is
    defined further down in this notebook and replaces this one once run.
    """
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model.fit(
        X_train_rest,
        y_train_rest,
        epochs=NB_START_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_valid, y_valid),
        verbose=0,
    )

In [None]:
base_history = deep_model(base_model)

In [None]:
def eval_metric(history, metric_name):
    """Plot a training metric and its validation counterpart per epoch.

    Parameters
    ----------
    history : object with a ``history`` dict (as returned by ``model.fit``)
        Must contain ``metric_name`` and ``'val_' + metric_name``.
    metric_name : str
        Key of the metric to plot, e.g. ``'loss'`` or ``'accuracy'``.
    """
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    # Take the epoch axis from the recorded values, not from the global
    # NB_START_EPOCHS, so the plot still works if that constant changed after
    # training or training ran for a different number of epochs.
    e = range(1, len(metric) + 1)

    plt.plot(e, metric, 'bo', label='Train ' + metric_name)
    plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
    plt.xlabel('Epoch')
    plt.ylabel(metric_name)
    plt.legend()
    plt.show()

In [None]:
eval_metric(base_history, 'loss')

In [None]:
eval_metric(base_history, 'accuracy')

## Handling overfitting

* Option 1: reduce the network's size by removing layers or reducing the number of hidden elements in the layers
* Option 2: add regularization, which comes down to adding a cost to the loss function for large weights
* Option 3: add dropout layers, which will randomly remove certain features by setting them to zero

### Reducing network's size

In [None]:
reduced_model = models.Sequential()
reduced_model.add(layers.Dense(32, activation='relu', input_shape=(NB_WORDS,)))
reduced_model.add(layers.Dense(2, activation='softmax'))
reduced_model.summary()

In [None]:
reduced_history = deep_model(reduced_model)

In [None]:
def compare_loss_with_baseline(h, model_name):
    """Plot the validation loss of ``h`` against the baseline model's.

    Parameters
    ----------
    h : object with a ``history`` dict containing ``'val_loss'``
        Training history of the model to compare.
    model_name : str
        Label used for that model in the legend.

    The baseline curve is read from the notebook-level ``base_history``.
    """
    loss_base_model = base_history.history['val_loss']
    loss_model = h.history['val_loss']

    # Each curve gets an epoch axis of its own recorded length, so the plot
    # does not depend on the global NB_START_EPOCHS matching either run.
    plt.plot(range(1, len(loss_base_model) + 1), loss_base_model, 'bo',
             label='Validation Loss Baseline Model')
    plt.plot(range(1, len(loss_model) + 1), loss_model, 'b',
             label='Validation Loss ' + model_name)
    plt.xlabel('Epoch')
    plt.ylabel('val_loss')
    plt.legend()
    plt.show()

In [None]:
compare_loss_with_baseline(reduced_history, 'Reduced Model')

### Adding regularization

In [None]:
reg_model = models.Sequential()
reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu', input_shape=(NB_WORDS,)))
reg_model.add(layers.Dense(64, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
reg_model.add(layers.Dense(2, activation='softmax'))
reg_model.summary()

In [None]:
reg_history = deep_model(reg_model)

In [None]:
compare_loss_with_baseline(reg_history, 'Regularized Model')

### Adding dropout layers

In [None]:
drop_model = models.Sequential()
drop_model.add(layers.Dense(64, activation='relu', input_shape=(NB_WORDS,)))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(64, activation='relu'))
drop_model.add(layers.Dropout(0.5))
drop_model.add(layers.Dense(2, activation='softmax'))
drop_model.summary()

In [None]:
drop_history = deep_model(drop_model)

In [None]:
compare_loss_with_baseline(drop_history, 'Dropout Model')

### Training on the full train data and evaluation on test data

In [None]:
def test_model(model, epoch_stop):
    """Fit ``model`` on the full training data and evaluate it on the test set.

    Trains silently on ``X_train_oh`` / ``y_train_oh`` for ``epoch_stop``
    epochs, then returns whatever ``model.evaluate`` yields for
    ``X_test_oh`` / ``y_test_oh``.

    NOTE(review): the model is not re-created here. The models passed in by
    this notebook were already fitted by ``deep_model``, so these epochs are
    added on top of that earlier training -- confirm this is intended.
    """
    model.fit(
        X_train_oh,
        y_train_oh,
        epochs=epoch_stop,
        batch_size=BATCH_SIZE,
        verbose=0,
    )
    return model.evaluate(X_test_oh, y_test_oh)

In [None]:
base_results = test_model(base_model, 4)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of baseline model: {0:.2f}%'.format(base_results[1]*100))

In [None]:
reduced_results = test_model(reduced_model, 10)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of reduced model: {0:.2f}%'.format(reduced_results[1]*100))

In [None]:
reg_results = test_model(reg_model, 5)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of regularized model: {0:.2f}%'.format(reg_results[1]*100))

In [None]:
drop_results = test_model(drop_model, 6)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of dropout model: {0:.2f}%'.format(drop_results[1]*100))

## Word Embeddings

In [None]:
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 20  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 24  # Maximum number of words in a sequence
GLOVE_DIM = 50  # Number of dimensions of the GloVe word embeddings

In [None]:
def deep_model(model, X_train, y_train, X_valid, y_valid):
    """Compile ``model`` and train it on the given train/validation data.

    The model is compiled with the rmsprop optimizer, categorical
    cross-entropy loss and accuracy as metric, then fitted on
    ``X_train`` / ``y_train`` for ``NB_START_EPOCHS`` epochs with
    ``X_valid`` / ``y_valid`` as validation data and progress output
    enabled. Returns the object that ``model.fit`` returns.

    NOTE(review): this replaces the one-argument ``deep_model`` defined
    earlier in the notebook; earlier cells calling ``deep_model(model)``
    fail if re-run after this cell.
    """
    model.compile(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    fit_history = model.fit(
        X_train,
        y_train,
        epochs=NB_START_EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(X_valid, y_valid),
        verbose=1,
    )
    return fit_history

def test_model(model, X_train, y_train, X_test, y_test, epoch_stop):
    """Fit ``model`` on the given training data and evaluate it on the test data.

    Trains silently on ``X_train`` / ``y_train`` for ``epoch_stop`` epochs
    with batch size ``BATCH_SIZE`` and returns the result of
    ``model.evaluate(X_test, y_test)``.

    NOTE(review): this replaces the two-argument ``test_model`` defined
    earlier in the notebook, and the model is not re-created before fitting.
    """
    model.fit(
        X_train,
        y_train,
        epochs=epoch_stop,
        batch_size=BATCH_SIZE,
        verbose=0,
    )
    return model.evaluate(X_test, y_test)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.Tweets, df.Sentiment, test_size=0.1, random_state=37)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

In [None]:
tk = Tokenizer(num_words=NB_WORDS)
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

In [None]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [None]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [None]:
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

assert X_valid_emb.shape[0] == y_valid_emb.shape[0]
assert X_train_emb.shape[0] == y_train_emb.shape[0]

print('Shape of validation set:',X_valid_emb.shape)

### Embedding Layer

In [None]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(NB_WORDS, 8, input_length=MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

In [None]:
emb_history = deep_model(emb_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)

In [None]:
eval_metric(emb_history, 'accuracy')

In [None]:
eval_metric(emb_history, 'loss')

In [None]:
emb_results = test_model(emb_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh, 6)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of word embeddings model: {0:.2f}%'.format(emb_results[1]*100))

### Pre-trained Word Embedding: GloVe

In [None]:
## !wget http://nlp.stanford.edu/data/glove.twitter.27B.zip

In [None]:
## !unzip glove.twitter.27B.zip

In [None]:
# Read the GloVe vectors into a {word: float32 vector} dictionary.
glove_file = 'glove.twitter.27B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
# The published GloVe text files are UTF-8; state it explicitly so the read
# does not depend on the platform's default encoding. The context manager
# closes the file even if a line fails to parse.
with open(glove_file, encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        word = values[0]  # first field is the token, the rest its coordinates
        vector = np.asarray(values[1:], dtype='float32')
        emb_dict[word] = vector

In [None]:
# Spot-check whether a few Turkish words have a GloVe vector.
some_words = ['ben', 'koltuk', 'kraliçe', 'masa']
for known_word in (w for w in some_words if w in emb_dict):
    print('Found the word {} in the dictionary'.format(known_word))

In [None]:
# Row i of the matrix holds the GloVe vector of the word with tokenizer
# index i; words without a GloVe vector keep an all-zero row.
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # Stop at the first index outside the matrix.
    if i >= NB_WORDS:
        break
    vect = emb_dict.get(w)
    if vect is not None:
        emb_matrix[i] = vect

In [None]:
glove_model = models.Sequential()
glove_model.add(layers.Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN))
glove_model.add(layers.Flatten())
glove_model.add(layers.Dense(2, activation='softmax'))
glove_model.summary()

In [None]:
# Load the prepared GloVe matrix into the first layer (the Embedding layer
# added above) and mark that layer as non-trainable.
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

In [None]:
glove_history = deep_model(glove_model, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)

In [None]:
eval_metric(glove_history, 'accuracy')

In [None]:
eval_metric(glove_history, 'loss')

In [None]:
glove_results = test_model(glove_model, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh, 3)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of word glove model: {0:.2f}%'.format(glove_results[1]*100))

### Train w/ more dimensions

In [None]:
emb_model2 = models.Sequential()
emb_model2.add(layers.Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN))
emb_model2.add(layers.Flatten())
emb_model2.add(layers.Dense(2, activation='softmax'))
emb_model2.summary()

In [None]:
emb_history2 = deep_model(emb_model2, X_train_emb, y_train_emb, X_valid_emb, y_valid_emb)

In [None]:
eval_metric(emb_history2, 'accuracy')

In [None]:
eval_metric(emb_history2, 'loss')

In [None]:
emb_results2 = test_model(emb_model2, X_train_seq_trunc, y_train_oh, X_test_seq_trunc, y_test_oh, 3)
# '\n' (newline) was misspelled as '/n', which printed the two characters literally.
print('\n')
print('Test accuracy of word embedding model 2: {0:.2f}%'.format(emb_results2[1]*100))