In [71]:
# import tensorflow_datasets as tfds
import re
import string
from functools import lru_cache

import contractions
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pandas import DataFrame
from spellchecker import SpellChecker

About Sentiment140

This is the sentiment140 dataset.

It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and can be used to detect sentiment.

It contains the following 6 fields:

1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
2. ids: the id of the tweet (2087)
3. date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
4. flag: The query (lyx). If there is no query, then this value is NO_QUERY.
5. user: the user that tweeted (robotickilldozr)
6. text: the text of the tweet (Lyx is cool)

Cargar los datos (la carga desde TensorFlow Datasets está deshabilitada; se lee el archivo CSV local)

In [6]:
# df = tfds.load("sentiment140", data_dir="./input")

In [72]:
# Column names are supplied explicitly through `names=`.
# NOTE(review): this assumes the CSV has no header row — confirm against the file.
columns = ["target", "id", "date", "flag", "user", "text"]
# The file is decoded as ISO-8859-1; presumably it is not valid UTF-8 — TODO confirm.
df = pd.read_csv("./input/sentiment140.csv", encoding="ISO-8859-1", names=columns)

In [3]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


Eliminación de las columnas que no son necesarias

In [73]:
# Keep only the label ("target") and the tweet body ("text").
# Rebinding instead of mutating in place, together with errors="ignore",
# makes this cell safe to re-run: a second run no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=["id", "date", "flag", "user"], errors="ignore")
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Como vemos, no hay valores faltantes, pero sí hay algunas filas duplicadas

In [74]:
def summary(df: DataFrame):
    """Print the frame's shape and duplicate-row count, then describe each column.

    Args:
        df: The frame to inspect.

    Returns:
        A DataFrame indexed by ``df``'s column names with three columns:
        "Unique" (distinct non-null values), "Missing" (null count) and
        "Type" (dtype).
    """
    duplicate_count = df.duplicated().sum()
    print("Shape: ", df.shape)
    print("Duplicate rows: ", duplicate_count)

    per_column = {
        "Unique": df.nunique().values,
        "Missing": df.isnull().sum().values,
        "Type": df.dtypes,
    }
    return pd.DataFrame(per_column, index=df.columns)

In [39]:
summary(df)

Shape:  (1600000, 2)
Duplicate rows:  16309


Unnamed: 0,Unique,Missing,Type
target,2,0,int64
text,1581466,0,object


Eliminación de filas duplicadas

In [75]:
# Remove rows that are exact duplicates of an earlier row, mutating df in place
# (pandas keeps the first occurrence by default). The index is not reset, so
# row labels are no longer guaranteed to be contiguous after this cell.
df.drop_duplicates(inplace=True)

In [16]:
# Show the first 100 tweets by position. `df["text"][i]` is a label lookup,
# and labels may be missing once duplicates have been dropped without
# resetting the index, so iterate over `.head(100)` instead.
for tweet in df["text"].head(100):
    print(tweet)

@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D
is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!
@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds
my whole body feels itchy and like its on fire 
@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. 
@Kwesidei not the whole crew 
Need a hug 
@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?
@Tatiana_K nope they didn't have it 
@twittera que me muera ? 
spring break in plain city... it's snowing 
I just re-pierced my ears 
@caregiving I couldn't bear to watch it.  And I thought the UA loss was embarrassing . . . . .
@octolinz16 It it counts, idk why I did either. you never talk to me anymore 
@smarrison i would've been the first, but i didn't have a gun.    not rea

Inicialización de variables a usar en la limpieza de datos

In [76]:
# Shared objects used by the cleaning helpers defined in later cells.
spell = SpellChecker()  # used by correct_spellings
wln = WordNetLemmatizer()  # used by lemmatizer
# Fetch the NLTK corpora that stop-word removal and lemmatisation rely on.
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/jared/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jared/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [77]:
def correct_spellings(text: str) -> str:
    """Replace words the spell checker does not know with its best suggestion.

    The text is split on whitespace. Each token reported as unknown by the
    shared ``spell`` checker is replaced with ``spell.correction(token)``;
    the token is kept unchanged when no suggestion is returned.

    Args:
        text: Whitespace-separated text.

    Returns:
        The tokens joined by single spaces, with corrections applied.
    """
    tokens = text.split()
    unknown_tokens = spell.unknown(tokens)

    corrected = []
    for token in tokens:
        suggestion = spell.correction(token) if token in unknown_tokens else None
        # Fall back to the original token when there is no usable suggestion.
        corrected.append(suggestion or token)
    return " ".join(corrected)

In [69]:
correct_spellings("corect is you runing")

'correct is you running'

In [78]:
@lru_cache(maxsize=None)
def _english_stop_words() -> frozenset:
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words("english"))


def delete_stopwords(text: str) -> str:
    """Remove English stop words from whitespace-separated text.

    The stop-word list is loaded on first use and cached, so applying this
    function row by row over a large frame does not rebuild the list on every
    call, and membership tests are set lookups rather than list scans.

    Args:
        text: Whitespace-separated text. Matching is exact, so the text
            should already be lower-cased.

    Returns:
        The remaining words joined by single spaces.
    """
    stop_words = _english_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)

In [7]:
delete_stopwords("the book on the table")

'book table'

In [79]:
def lemmatizer(text: str):
    """Lemmatise every whitespace-separated token as a verb.

    Args:
        text: Whitespace-separated text.

    Returns:
        The lemmatised tokens joined by single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(wln.lemmatize(token, pos="v"))
    return " ".join(lemmas)

In [9]:
lemmatizer("i'm running, walking speaking buying sleeping")

"i'm running, walk speak buy sleep"

In [55]:
def clean_text(text: str):
    """Normalise a raw tweet into cleaned, space-separated tokens.

    Steps, in order: lower-case; strip @mentions; strip URLs; collapse extra
    whitespace; expand contractions; strip punctuation; drop tokens that
    contain digits; spell-correct; remove English stop words; lemmatise
    tokens as verbs.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. An empty string is returned unchanged.
    """
    if text == "":
        return text

    text = text.lower()
    # Remove @mentions of other users.
    text = re.sub(r"@\S+", "", text)
    # Remove URLs. The scheme is matched as the literal "http" or "https".
    # The previous pattern used the character class "[https]+", which matches
    # any run of the letters h/t/p/s and could therefore delete only part of
    # a non-HTTP URL (e.g. the "tp://..." tail of "ftp://...").
    text = re.sub(r"(www\.\S+)|(https?://\S+)", "", text)
    # Trim leading/trailing whitespace and collapse runs of whitespace.
    text = re.sub(r"^\s+|\s+$|\s+(?=\s)", "", text)
    # Expand contractions before punctuation (including apostrophes) is removed.
    text = contractions.fix(text)  # type: ignore
    # Remove punctuation characters.
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    # Remove numbers and any word that contains a digit.
    text = re.sub(r"\w*\d\w*", "", text)
    text = correct_spellings(text)  # fix misspelled words
    text = delete_stopwords(text)  # drop common words
    text = lemmatizer(text)  # reduce words to their base verb form
    return text

In [40]:
# Run the cleaning pipeline on every tweet. The result is kept in a separate
# Series, so df["text"] still holds the raw text.
text = df["text"].apply(clean_text)

In [70]:
# NOTE(review): this is the same expression as the `text = ...` cell above;
# presumably a re-run after clean_text was edited — confirm and keep only one,
# since the pipeline is applied to every row again here.
text_full = df["text"].apply(clean_text)

In [32]:
spell.correction("the")

'the'

In [None]:
[
    "i would have been the first but i did not have a gun not really though zap spiders just a doucheclown"
]

In [67]:
correct_spellings(
    "i would have been the first but i did not have a gun not really though zap spders jus a doucheclown"
)

'i would have been the first but i did not have a gun not really though zap spiders just a doucheclown'

In [80]:
spell.correction("doucheclown")