In [17]:
# https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np
from num2words import num2words
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import re
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

import spacy
import en_core_web_sm


# Tfidf transformer 
from sklearn.feature_extraction.text import TfidfVectorizer

#https://www.nltk.org/howto/stem.html
from nltk.stem.porter import *


In [18]:
# Load src/train.csv, discard the 'id' and 'location' columns, then drop
# every row that still contains a missing value.
dataset = (
    pd.read_csv('src/train.csv')
    .drop(columns=['id', 'location'])
    .dropna()
)
# Replace the literal '%20' sequences (URL-encoded spaces) found in keywords.
dataset['keyword'] = dataset['keyword'].map(lambda kw: kw.replace('%20', ' '))
# Count of distinct keywords. This is not the last expression of the cell,
# so the value is computed but never displayed.
len(dataset['keyword'].unique())

# spaCy English pipeline; cleanStopWorld uses it to flag stop words.
# https://medium.com/@yashj302/stopwords-nlp-python-4aa57dc492af
nlp = en_core_web_sm.load()

def cleanStopWorld(text):
    """Return `text` with stop words removed.

    The text is run through the module-level `nlp` pipeline; every token
    whose `is_stop` flag is false is kept, and the kept token texts are
    joined with single spaces.
    """
    kept_tokens = [token.text for token in nlp(text) if not token.is_stop]
    return ' '.join(kept_tokens)

def eraseSingleChar(words):
    """Drop every space-separated token of length 0 or 1 from `words`.

    Each surviving token is emitted with one leading space, so a non-empty
    result always starts with a space (e.g. "a bb c dd" -> " bb dd").
    Returns "" when no token survives.
    """
    return "".join(" " + token for token in words.split(" ") if len(token) > 1)

# Porter stemmer shared by stemConverter; it reduces inflected words to a
# common stem (for example "playing" -> "play").
stemmer = PorterStemmer()

def stemConverter(sentence):
    """Stem every space-separated token of `sentence`.

    Tokens are obtained with `sentence.split(" ")` and stemmed with the
    module-level `stemmer`. Each stemmed token is prefixed with a single
    space, so the returned string always begins with a space.
    """
    return "".join(" " + stemmer.stem(token) for token in sentence.split(" "))

# Spell out numeric tokens, e.g. "42" becomes its English wording.
def convert_num_to_words(utterance):
    """Replace each all-digit token of `utterance` with its spelled-out form.

    The utterance is split on whitespace; tokens made only of decimal digits
    are converted with `num2words`, every other token is kept unchanged, and
    the tokens are re-joined with single spaces.

    `num2words` is the function imported at the top of the notebook
    (`from num2words import num2words`), so it is called directly; the
    previous `num2words.num2words(...)` lookup raised AttributeError as soon
    as a numeric token was encountered.

    Parameters
    ----------
    utterance : str
        Text to process.

    Returns
    -------
    str
        The utterance with numeric tokens spelled out and whitespace
        normalized to single spaces.
    """
    return ' '.join(
        # isdecimal() guarantees int() succeeds; digit-like characters such
        # as superscripts are left untouched instead of raising.
        num2words(int(token)) if token.isdecimal() else token
        for token in utterance.split()
    )

# Ordered clean-up steps. Each one maps a single cell value to its cleaned
# form, and the steps are applied to a whole column one after another.
_cleaning_steps = [
    np.char.lower,                                             # lowercase (yields a NumPy value)
    str,                                                       # back to a plain Python str
    lambda s: s.replace("\\/", "/").encode().decode('utf-8'),  # unescape "\/" and round-trip through UTF-8
    lambda s: re.sub(r'http\S+', '', s),                       # strip URLs
    lambda s: re.sub('[^A-Za-z0-9 ]+', '', s),                 # keep only ASCII letters, digits and spaces
    cleanStopWorld,                                            # remove stop words
    eraseSingleChar,                                           # remove tokens shorter than 2 characters
    stemConverter,                                             # reduce each word to its stem
]

for column in ['keyword', 'text']:
    for step in _cleaning_steps:
        dataset[column] = [step(value) for value in dataset[column]]

# NOTE(review): the cleaned cells are strings (possibly empty), so this
# presumably removes nothing - confirm whether empty texts should be dropped.
dataset = dataset.dropna()

# Prefix each text with its keyword. No explicit separator is inserted: the
# join relies on the leading space stemConverter puts in front of every token.
dataset['text'] = dataset['keyword'].astype(str) + dataset['text'].astype(str)

In [19]:
# Build the Keras tokenizer with num_words=1000 and a dedicated
# "out_of_vocab" token, fit it on the cleaned texts, then store each text's
# integer sequence in a new column.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=1000,
    oov_token="out_of_vocab",
)
tokenizer.fit_on_texts(dataset["text"])
dataset["text_encoded"] = tokenizer.texts_to_sequences(dataset["text"])

In [20]:
# Preview the first rows, including the new text_encoded column.
dataset.head()

Unnamed: 0,keyword,text,target,text_encoded
31,ablaz,ablaz bbcmtd wholesal market ablaz,1,"[213, 1, 1, 310, 213]"
32,ablaz,ablaz tri bring heavi metal rt,0,"[213, 220, 453, 697, 962, 90]"
33,ablaz,ablaz africanbaz break newsnigeria flag set...,1,"[213, 1, 270, 1, 527, 221, 213, 910]"
34,ablaz,ablaz cri set ablaz,0,"[213, 624, 221, 213]"
35,ablaz,ablaz plu look sky night ablaz,0,"[213, 1, 48, 724, 285, 213]"


In [21]:
# Persist the processed frame to disk without writing the index.
# NOTE(review): text_encoded holds Python lists, which CSV presumably stores
# as their string form - they would need parsing when the file is reloaded.
dataset.to_csv(r'src/dataset_encoded.csv', index=False)