In [1]:
import sqlite3
import pandas as pd
import utils
import re
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
pd.set_option('display.max_rows', None)


In [2]:
def get_most_common_ngrams(df, n_words=None, ngram_range=(1, 1)):
    ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=ngram_range, min_df=1)
    count_vectors = ngram_vectorizer.fit_transform(df['preprocessed_text'])
    vocab = list(ngram_vectorizer.get_feature_names())
    if n_words == None:
        n_words = len(vocab)
    counts = count_vectors.sum(axis=0).A1
    freq_distribution = Counter(dict(zip(vocab, counts)))
    return freq_distribution.most_common(n_words)


In [3]:
con = sqlite3.connect("tweets.db")
sql = "select id_str, text from tweets where text not like '%RT @%' order by random() limit 10000"
df = pd.read_sql(sql, con=con, index_col="id_str")
df = utils.preprocess(df)
df.drop_duplicates(subset=['preprocessed_text'], inplace=True)
df = df[df.preprocessed_text.apply(len) > 0]
most_common_ngrams = get_most_common_ngrams(df, n_words=None, ngram_range=(1,1))
most_common_ngrams_df = pd.DataFrame(most_common_ngrams, columns=['term', 'frequency'])
#most_common_ngrams_df.sort_values(by='frequency').plot(kind='barh', x='term')

In [4]:
df.preprocessed_text.to_list()[:10]

['nao quer dizer bexiga nenhuma mas o cara ve a escalacao e nao ha como pensar diferente o e sim superior ao',
 'gol em cima do rodinei obrigado elusmar por ter pagado milhao pro flamengo',
 'quinta junto com o do flamengo e inter',
 'flamengo eu te amo',
 'e que vc nao falou de quem tava falando neh enzin pro flamengo ser campeao gtprecisalt ganhar do sp ou o',
 'prefiro mil vezes o inter do que o flamengo',
 'vamosssessss flamengo',
 'pra cima deles flamengo',
 'flamengo quer me matar nao e possivel',
 'eu to ok pq eu sei que eu nao vou conseguir dormir tao cedo pq meu pai nao sabe assistir o jogo do flamengo com a b']

In [5]:
VOCAB_SIZE = len(most_common_ngrams_df.term)
print(VOCAB_SIZE)
encoded_text = [one_hot(text, VOCAB_SIZE) for text in df.preprocessed_text.to_list()]

8831


In [6]:
encoded_text[:10]

[[8280,
  3315,
  2774,
  4547,
  9,
  1398,
  856,
  2781,
  2353,
  1921,
  2580,
  2391,
  8280,
  5231,
  1462,
  2252,
  243,
  856,
  2391,
  1107,
  3149,
  871],
 [3385, 8061, 8068, 7079, 5208, 4993, 5404, 8098, 581, 4110, 6661, 6527, 3056],
 [3325, 269, 7806, 856, 7079, 3056, 2391, 3428],
 [3056, 1647, 890, 8404],
 [2391,
  3338,
  6147,
  8280,
  8400,
  8822,
  3445,
  2289,
  2067,
  5969,
  1138,
  6527,
  3056,
  2457,
  2310,
  3473,
  3311,
  7079,
  8462,
  4309,
  856],
 [982, 8551, 7064, 856, 3428, 7079, 3338, 856, 3056],
 [6934, 3056],
 [2071, 8068, 7484, 3056],
 [3056, 3315, 7067, 5860, 8280, 2391, 1607],
 [1647,
  1125,
  3736,
  6711,
  1647,
  2014,
  3338,
  1647,
  8280,
  2846,
  5441,
  6701,
  8657,
  8000,
  6711,
  1986,
  4913,
  8280,
  7692,
  6768,
  856,
  6833,
  7079,
  3056,
  7806,
  1921,
  4835]]

In [7]:
MAX_LENGHT = max([len(text) for text in df.preprocessed_text.to_list()])
print(MAX_LENGHT)
padded_texts = pad_sequences(encoded_text, maxlen=MAX_LENGHT, padding='post')

140


In [12]:
padded_texts[0].shape

(140,)

In [9]:
DIM = 10
model = Sequential()
model.add(Embedding(VOCAB_SIZE, DIM, input_length=MAX_LENGHT))
model.compile()

In [10]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 10)           88310     
Total params: 88,310
Trainable params: 88,310
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.predict(padded_texts)

array([[[-8.3264001e-03, -1.0250248e-02, -4.1753866e-02, ...,
         -4.6131779e-02, -1.2025215e-02,  2.0456102e-02],
        [-3.9486717e-02,  1.7496396e-02,  2.1687079e-02, ...,
          1.0509975e-03,  3.0942742e-02,  4.7593858e-02],
        [ 3.0824963e-02, -1.2705039e-02, -2.1551406e-02, ...,
         -3.1166626e-02, -3.7197806e-02,  1.7631650e-03],
        ...,
        [-1.1120796e-02,  4.6502355e-02,  2.3416627e-02, ...,
         -2.0029986e-02,  4.1103516e-02, -3.8796999e-02],
        [-1.1120796e-02,  4.6502355e-02,  2.3416627e-02, ...,
         -2.0029986e-02,  4.1103516e-02, -3.8796999e-02],
        [-1.1120796e-02,  4.6502355e-02,  2.3416627e-02, ...,
         -2.0029986e-02,  4.1103516e-02, -3.8796999e-02]],

       [[-1.9937551e-02, -2.4822926e-02,  4.8705649e-02, ...,
         -1.9789755e-02,  2.4919007e-02,  3.8827989e-02],
        [-9.0816841e-03,  1.1801530e-02,  8.6209774e-03, ...,
          4.6967123e-02, -3.7279535e-02, -7.5995103e-03],
        [-3.7533499e-02, 