In [1]:
import re
import pandas as pd
import numpy as np
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopw=set(stopwords.words('english'))
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the pre-trained GloVe Twitter vectors into a {token: vector} lookup.
# Each line of the file is split on whitespace: the first field is the token,
# the remaining fields are parsed as its float32 vector components.
embedding_dict={}
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (the GloVe files are distributed as UTF-8 text
# and contain non-ASCII tokens).
with open('data_process/glove.twitter.27B.100d.txt','r',encoding='utf-8') as f:
    for line in f:
        values=line.split()
        word=values[0]
        vectors=np.asarray(values[1:],'float32')
        embedding_dict[word]=vectors
# No explicit f.close() is needed: the with-block closes the file on exit.

In [3]:
# Read the CSV of tweets; the 'text' column is tokenized in a later cell.
text = pd.read_csv(
    'data_process/textcleaned.csv',
    low_memory=False,
)

In [4]:
def text_process(s):
    """Normalize a raw string and split it into word tokens.

    Every character that is not an ASCII letter is replaced by a space,
    the result is lowercased, and the string is tokenized with NLTK's
    ``word_tokenize``.

    Parameters
    ----------
    s : str
        Raw text to process.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize``.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', s)
    return word_tokenize(letters_only.lower())

In [5]:
# Tokenize every document in the 'text' column, then drop English stopwords.
# The tokenized Series gets its own name instead of overwriting `text`:
# rebinding `text` to a Series made this cell fail on a second run, because
# `text['text']` only works while `text` is still the loaded DataFrame.
tokenized = text['text'].apply(text_process)
corpora = tokenized.apply(lambda tokens: [w for w in tokens if w not in stopw])

In [7]:
# Fit a Keras Tokenizer on the stopword-filtered token lists and convert each
# document into a sequence of integer word ids.
MAX_LEN = 100
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpora)
sequences = tokenizer_obj.texts_to_sequences(corpora)

# Force every sequence to exactly MAX_LEN ids: longer ones are truncated at
# the end, shorter ones are padded at the end.
tweet_pad = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding='post',
    truncating='post',
)

In [8]:
# Vocabulary size: number of distinct tokens indexed by the fitted tokenizer.
len(tokenizer_obj.word_index)

19918

In [9]:
%%time
word_index=tokenizer_obj.word_index
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,100))

for word,i in tqdm(word_index.items()):
    if i > num_words:
        continue
    
    emb_vec=embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i]=emb_vec

100%|██████████| 19918/19918 [00:00<00:00, 332626.53it/s]

CPU times: user 59.2 ms, sys: 9.96 ms, total: 69.2 ms
Wall time: 76.1 ms





In [10]:
# Sanity check: expect (vocabulary size + 1, embedding dimension).
embedding_matrix.shape

(19919, 100)

In [11]:
# Save the embedding matrix as CSV without the index column. Note that
# `embedding_matrix` is rebound from a NumPy array to a DataFrame here, and
# that the output path has no file extension.
embedding_matrix = pd.DataFrame(data=embedding_matrix)
embedding_matrix.to_csv(path_or_buf='data_process/embedding_matrix', index=False)

In [12]:
import pickle
# Persist the padded id sequences for later use. The with-block guarantees
# the file handle is closed even if pickle.dump raises, which the manual
# open()/close() pair did not.
with open("data_process/tweet_pad.pickle","wb") as pickle_out:
    pickle.dump(tweet_pad, pickle_out)