In [55]:
# https://towardsdatascience.com/tf-idf-for-document-ranking-from-scratch-in-python-on-real-world-dataset-796d339a4089
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
import numpy as np
from num2words import num2words
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import re
import seaborn as sns

import spacy
import en_core_web_sm

#https://www.nltk.org/howto/stem.html
from nltk.stem.porter import *


In [56]:
# Read the labelled training tweets and preview the first five rows.
TRAIN_CSV = 'src/train.csv'
dataset = pd.read_csv(filepath_or_buffer=TRAIN_CSV)
dataset.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [57]:
# Percentage of missing values per column, relative to the number of rows.
row_count = dataset.shape[0]
missing_counts = dataset.isnull().sum()
pourcentage_valeur_manquante = 100 * missing_counts / row_count
print(pourcentage_valeur_manquante)

id           0.000000
keyword      0.801261
location    33.272035
text         0.000000
target       0.000000
dtype: float64


In [58]:
# Discard the `id` and `location` columns (the report above shows `location`
# missing for about a third of the rows).
dataset = dataset.drop(['id', 'location'], axis='columns')

In [59]:
# Remove rows that still hold a missing value, turn the URL-encoded spaces
# ('%20') of the keywords back into real spaces, then count distinct keywords.
dataset = dataset.dropna(axis=0, how='any')
dataset['keyword'] = dataset['keyword'].map(lambda kw: kw.replace('%20', ' '))
dataset['keyword'].nunique(dropna=False)

221

In [60]:
# Number of rows left after dropping incomplete records.
dataset.shape[0]

7552

In [61]:
# Stop-word handling with spaCy, following:
# https://medium.com/@yashj302/stopwords-nlp-python-4aa57dc492af
# Load the English pipeline once; cleanStopWorld below reuses this `nlp` object.
nlp = en_core_web_sm.load()

# Default stop-word set of the loaded language, printed here for inspection.
stopwords = nlp.Defaults.stop_words
print(stopwords)

def cleanStopWorld(text):
    """Return ``text`` with spaCy stop words removed.

    The text is tokenised with the module-level ``nlp`` object, tokens whose
    ``is_stop`` flag is set are discarded, and the texts of the remaining
    tokens are joined with single spaces.

    Args:
        text: String to filter.

    Returns:
        The surviving token texts joined by ``' '``; an empty string when no
        token survives.
    """
    # Only tokenisation is required: `is_stop` is a lexical attribute, so
    # `make_doc` yields the same tokens as `nlp(text)` without running the
    # remaining pipeline components on every value.
    return ' '.join(token.text for token in nlp.make_doc(text) if not token.is_stop)
    

{'full', 'nevertheless', 'are', 'someone', 'nobody', 'be', 'go', 'then', 'same', 'or', 'ten', 'each', 'your', 'something', 'everything', "'re", 'yet', 'anywhere', 'than', 'off', 'amongst', 'again', 'mostly', 'really', 'less', '’d', "'d", 'call', 'becoming', 'due', 'former', 'could', 'he', 'thereby', 'seemed', 'fifty', 'formerly', 'might', 'whole', 'six', '‘m', 'neither', 'part', 'various', 'from', 'moreover', 'via', 'still', 'already', 'there', 'serious', 'because', 'is', 'what', 'many', 'several', 'so', 'will', "'ll", 'to', 'few', 'mine', 'therein', 'n’t', 'somewhere', 'that', 'everyone', 'say', 'hers', 'herein', 'whose', 'please', 'never', 'made', 'toward', 'though', '’ve', 'herself', 'about', 'not', 'however', 'at', '’m', 'towards', 'when', 'perhaps', 'can', 'below', 'get', "'s", 'fifteen', 'how', 'hence', 'whereas', 'own', 'along', 'on', 'her', 'against', 'eleven', 'our', 'among', 'over', 'done', 'put', 'empty', 'but', 'upon', 'give', 'somehow', 'whether', 'beyond', 'ca', 'alone', 

In [62]:
def eraseSingleChar(words, min_length=2):
    """Drop the tokens of ``words`` that are shorter than ``min_length``.

    Tokens are separated by runs of whitespace. With the default
    ``min_length`` of 2 this removes isolated single characters.

    Args:
        words: Whitespace-separated text.
        min_length: Smallest token length that is kept (default 2).

    Returns:
        The kept tokens joined by single spaces, with no leading or trailing
        whitespace; an empty string when nothing is kept.
    """
    # `join` avoids both the stray leading space and the repeated string
    # concatenation of a manual accumulation loop.
    return ' '.join(token for token in words.split() if len(token) >= min_length)

In [63]:
# Shared Porter stemmer used by stemConverter to reduce inflected words to a
# common stem (for example "Playing" -> "play").
stemmer = PorterStemmer()

def stemConverter(sentence):
    """Stem every word of ``sentence`` with the module-level ``stemmer``.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The stemmed words joined by single spaces, with no leading or
        trailing whitespace; an empty string for blank input.
    """
    # Splitting on any whitespace skips the empty tokens that `split(" ")`
    # produces for leading or repeated spaces, so no padding accumulates.
    return ' '.join(stemmer.stem(word) for word in sentence.split())

In [64]:
# Spell out the purely numeric tokens of a sentence.
def convert_num_to_words(utterance):
    """Replace every decimal-digit token of ``utterance`` by its spelled-out form.

    Tokens are separated by runs of whitespace. A token made only of decimal
    digits is converted with ``num2words`` (using that library's default
    language); every other token is kept unchanged.

    Args:
        utterance: Text to convert.

    Returns:
        The converted tokens joined by single spaces.
    """
    # `num2words` is imported as a function (`from num2words import num2words`),
    # so it is called directly. `isdecimal` (rather than `isdigit`) guarantees
    # that `int(token)` succeeds: characters such as '²' are left untouched.
    return ' '.join(
        num2words(int(token)) if token.isdecimal() else token
        for token in utterance.split()
    )

In [65]:
# Text normalisation pipeline: every step maps a string to a string, and the
# steps run in the listed order on each value of the column.
normalisation_steps = (
    lambda s: str(np.char.lower(s)),  # lower-case; str() unwraps the NumPy result
    lambda s: s.replace("\\/", "/").encode().decode('utf-8'),  # unescape "\/" and round-trip through UTF-8
    lambda s: re.sub(r'http\S+', '', s),  # strip URLs
    lambda s: re.sub('[^A-Za-z0-9 ]+', '', s),  # keep only ASCII letters, digits and spaces
    cleanStopWorld,   # remove stop words
    eraseSingleChar,  # remove one-character tokens
    stemConverter,    # stem the remaining words
)

for column in ['keyword', 'text']:
    normalised = []
    for value in dataset[column]:
        for step in normalisation_steps:
            value = step(value)
        normalised.append(value)
    dataset[column] = normalised

In [66]:
# Drop any row that contains a missing value after the cleaning pass.
dataset = dataset.dropna(axis=0, how='any')

In [67]:
# Display the cleaned frame (the rendered output is truncated, with an
# ellipsis row between the first and last rows).
dataset

Unnamed: 0,keyword,text,target
31,ablaz,bbcmtd wholesal market ablaz,1
32,ablaz,tri bring heavi metal rt,0
33,ablaz,africanbaz break newsnigeria flag set ablaz aba,1
34,ablaz,cri set ablaz,0
35,ablaz,plu look sky night ablaz,0
...,...,...,...
7578,wreck,jtruff23 cameronhack wreck,0
7579,wreck,day work ve pretti wreck hahaha shoutout famili,0
7580,wreck,fx forex trade cramer iger word wreck disney...,0
7581,wreck,enginesh great atmospher british lion gig to...,0
