In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

import pandas as pd
import re

import emoji
from geotext import GeoText

In [2]:
# The input is split across 12 part files that share one prefix and differ
# only in a zero-padded, five-digit part number (00000 .. 00011).
paths = [
    "../data/csv/run-1606575584664-part-r-{:05d}".format(part)
    for part in range(12)
]

# Read every part with the C parser.
dfs = [pd.read_csv(path, engine='c') for path in paths]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each part keeps its own 0-based labels, so the concatenated frame carries
# duplicate index values (which would also be written out by to_csv).
df = pd.concat(dfs, ignore_index=True)

In [3]:
# Display the (rows, columns) dimensions of the concatenated frame.
df.shape

(17597, 9)

In [4]:
# Unicode ranges treated as emoji; each entry is a "first-last" span that is
# placed inside a single regex character class below.
_EMOJI_BLOCKS = (
    u"\U0001F600-\U0001F64F",  # emoticons
    u"\U0001F300-\U0001F5FF",  # symbols & pictographs
    u"\U0001F680-\U0001F6FF",  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
)

# Matches one or more consecutive characters from any of the ranges above.
regrex_pattern = re.compile("[" + "".join(_EMOJI_BLOCKS) + "]+", flags=re.UNICODE)

# Snowball stemmer configured for English.
# NOTE(review): the previous comment here read "Spanish stemmer", but the
# language passed is 'english' — confirm which language is intended.
stemmer = SnowballStemmer('english')

In [5]:
def clean(texto):
    """Normalize a piece of text.

    Steps, in order:
      1. convert to ``str`` and lowercase (non-string values are stringified,
         so e.g. ``None`` becomes ``"none"``);
      2. drop every whitespace-separated token that contains a digit and
         re-join the remaining tokens with single spaces;
      3. remove URLs starting with ``http://``, ``https://`` or ``www.``;
      4. remove the characters ``.``, ``*`` and ``?``;
      5. remove ``<...>`` spans;
      6. remove doubled backticks and doubled single quotes;
      7. remove emoji matched by the module-level ``regrex_pattern``;
      8. remove the characters ``!``, ``@``, ``#``, ``$`` and ``%``.

    Args:
        texto: Value to clean; anything accepted by ``str()``.

    Returns:
        The cleaned, lowercased string.
    """
    texto = str(texto).lower()
    texto = ' '.join(s for s in texto.split() if not any(c.isdigit() for c in s))

    # URLs are removed before the punctuation pass: stripping '.' first would
    # make the 'www.' alternative unmatchable. The text is already lowercase,
    # so the prefix has to be spelled 'www' (an uppercase 'WWW' never matches).
    texto = re.sub(r'https?://\S+|www\.\S+', '', texto)

    # Character class, not a group: deletes literal '.', '*' and '?'.
    texto = re.sub(r"[.*?]", "", texto)
    texto = re.sub(r"<.*?>+", "", texto)
    texto = texto.replace("``", "").replace("''", "")
    texto = regrex_pattern.sub(r'', texto)
    texto = re.sub(r'[!@#$%]', '', texto)

    return texto

In [6]:
def stem_tokens(tokens, stemmer):
    """Apply ``stemmer.stem`` to every item of ``tokens``.

    Args:
        tokens: Iterable of items to stem.
        stemmer: Object exposing a ``stem(item)`` method.

    Returns:
        A new list holding the stemmed items in their original order.
    """
    return [stemmer.stem(token) for token in tokens]

In [7]:
# Keep the normalized keyword text in its own column.
df['clean_keyword'] = df['keyword'].apply(clean)

# Overwrite 'keyword' with the stemmed form of each cleaned keyword; every
# cell value is passed to the stemmer as one item.
cleaned_keywords = df['clean_keyword'].astype(str)
df['keyword'] = stem_tokens(cleaned_keywords, stemmer)

In [8]:
# One-hot encode the stemmed 'keyword' column; the generated indicator
# columns are named with the 'keyword' prefix.
df = pd.get_dummies(df, prefix='keyword', columns=['keyword'])

In [9]:
# Make sure every lemma entry is a plain string.
df['finished_lemma'] = df['finished_lemma'].astype(str)

# Labels whose string form is exactly '0' or '1' become ints; any other value
# is replaced by the sentinel 2.
df['target'] = df['target'].apply(
    lambda label: int(label) if str(label) in ('0', '1') else 2
)

# Discard the rows that received the sentinel.
valid_target = df['target'] <= 1
df = df[valid_target]

In [10]:
# Fit a Keras tokenizer on the lemma strings. num_words is set to 100000 and
# "<OOV>" is the token used for words outside the vocabulary.
tokenizer = Tokenizer(num_words=100000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["finished_lemma"])
# Word -> integer id mapping learned by the fit above.
word_index = tokenizer.word_index

In [11]:
# Convert each lemma string into its list of integer ids using the fitted tokenizer.
df["sequences"] = tokenizer.texts_to_sequences(df["finished_lemma"])

In [12]:
# Length, in tokens, of the longest sequence (0 when there are no rows).
max_ = max((len(seq) for seq in df["sequences"].values), default=0)
print("tweet con mas tokens", max_)

tweet con mas tokens 23


In [13]:
# Pad every sequence at the end (padding="post") to the longest length found
# above, then store the rows back as plain Python lists.
padded = pad_sequences(df["sequences"], maxlen=max_, padding="post")
df["sequences"] = padded.tolist()

In [14]:
from sklearn.decomposition import TruncatedSVD

# Reduce each padded sequence to 15 components with truncated SVD.
# random_state is fixed so the randomized solver is repeatable across runs.
svd_model = TruncatedSVD(
    n_components=15,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

# One row per tweet, one column per (padded) token position.
sequence_matrix = df["sequences"].values.tolist()
sequences_reduced = svd_model.fit_transform(sequence_matrix)
df['sequences_reduced'] = sequences_reduced.tolist()

In [15]:
# Serialize both list-valued columns as space-separated strings.
for list_column in ('sequences_reduced', 'sequences'):
    df[list_column] = df[list_column].apply(
        lambda values: ' '.join(map(str, values))
    )

In [16]:
# Replace missing locations with a single space so that every value in the
# column is a string before the text-processing steps that follow.
df['location'] = df['location'].fillna(' ')

In [17]:
def replace_emoji(row):
    """Return ``row['location']`` with emoji replaced by their text names.

    ``emoji.demojize`` is called with an empty opening delimiter and a single
    space as the closing delimiter.
    """
    location = row['location']
    return emoji.demojize(location, delimiters=("", " "))

# Row-wise: build the emoji-free version of the location text.
df['location_emoji'] = df.apply(replace_emoji, axis=1)

In [18]:
def replace_chars(row):
    """Reduce ``row['location_emoji']`` to ASCII letters and single spaces.

    Every character other than ``a-z``, ``A-Z`` and the space is removed, then
    each run of two or more spaces is collapsed to one space.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a string under
            the ``'location_emoji'`` key.

    Returns:
        The filtered string.
    """
    letters_only = re.sub(r'[^a-zA-Z ]', '', row['location_emoji'])
    # Collapse runs of any length. A single str.replace("  ", " ") pass only
    # halves them, e.g. it turns three spaces into two instead of one.
    return re.sub(r' {2,}', ' ', letters_only)

# Row-wise: keep only letters and spaces from the emoji-free location text.
df['location_chars'] = df.apply(replace_chars, axis=1)

In [19]:
def get_countries(row):
    """Return the first country key GeoText reports for ``row['location_chars']``.

    Returns an empty string when GeoText reports no country mentions.
    """
    mentions = GeoText(row['location_chars']).country_mentions
    return next(iter(mentions.keys()), '')

# Row-wise: first country detected in the cleaned location text ('' if none).
df['country'] = df.apply(get_countries, axis=1)

In [20]:
def get_cities(row):
    """Return the first city GeoText finds in ``row['location_chars']``.

    Returns an empty string when GeoText finds no city.
    """
    found = GeoText(row['location_chars']).cities
    if len(found) == 0:
        return ''
    return found[0]

# Row-wise: first city detected in the cleaned location text ('' if none).
df['city'] = df.apply(get_cities, axis=1)

In [23]:
# Remove the raw and intermediate location columns; 'country' and 'city' were
# derived from them in the preceding cells. The result is reassigned rather
# than dropped with inplace=True, so the statement produces a new frame
# instead of mutating the existing object.
df = df.drop(columns=['location', 'location_emoji', 'location_chars'])

In [24]:
# Write the processed frame to 'clean_data.csv' in the current working
# directory. No index argument is passed, so pandas' default applies and the
# index is written as the first column.
df.to_csv('clean_data.csv')