El siguiente notebook tiene como objetivo generar un nuevo set de datos con nuevos features, particularmente aquellos que fueron generados en el TP1 (y otros adicionales), para ser usado y modificado posteriormente en uno o más modelos de ML a probar.

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from textblob import TextBlob
from geopy.geocoders import Nominatim



Se carga el set de datos original y se elimina la columna id.

In [2]:
# Read the training set and discard its 'id' column in a single expression.
tweets = pd.read_csv('data/train.csv').drop(columns=['id'])


keyword_grouped (categórica): puede corresponder al mismo keyword del tweet o a una parecida (la parecida se obtiene a partir de un agrupamiento de palabras que tienen un ratio superior al 75% según fuzz)

In [3]:
# Replace every "%20" sequence in the keywords with a plain space.
# The .str accessor works on the whole column and keeps missing keywords missing;
# regex=False treats the pattern as literal text whatever the pandas default is.
tweets['keyword'] = tweets['keyword'].str.replace("%20", " ", regex=False)

def get_keyword_dic(key_list):
    """Group similar keywords under a shared representative.

    Keywords are visited in order. Each one is compared (``fuzz.ratio``) with
    the representatives collected so far; if the best score is above 75 the
    keyword is mapped to that representative, otherwise it is mapped to itself
    and becomes a new representative.

    Parameters
    ----------
    key_list : iterable of str
        Keywords to group (repeated entries are allowed).

    Returns
    -------
    dict
        Mapping ``keyword -> representative keyword``.
    """
    representatives = []
    mapping = {}
    for keyword in key_list:
        best_score, best_match = 0, ""
        for candidate in representatives:
            score = fuzz.ratio(keyword, candidate)
            # Strict comparison: on ties the earliest representative wins.
            if score > best_score:
                best_score, best_match = score, candidate
        if best_score <= 75:
            # Nothing similar enough: the keyword starts its own group.
            mapping[keyword] = keyword
            representatives.append(keyword)
            continue
        mapping[keyword] = best_match
    return mapping

key_grouped = get_keyword_dic(tweets.keyword.dropna().tolist())

# Hand-written overrides applied on top of the fuzzy grouping.
# 'rainstorm' and 'snowstorm' are mapped to themselves, replacing whatever
# representative the fuzzy step assigned to them.
key_grouped.update({
    'blazing': 'ablaze',
    'bleeding': 'blood',
    'buildings on fire': 'buildings burning',
    'burning buildings': 'buildings burning',
    'burning': 'burned',
    'dead': 'death',
    'demolition': 'demolish',
    'destruction': 'destroy',
    'explosion': 'explode',
    'flood': 'flooding',
    'floods': 'flooding',
    'inundated': 'inundation',
    'panic': 'panicking',
    'rainstorm': 'rainstorm',
    'riot': 'rioting',
    'screaming': 'screamed',
    'snowstorm': 'snowstorm',
    'survivors': 'survive',
    'traumatised': 'trauma',
    'violent storm': 'storm',
    'windstorm': 'storm',
})

# Look up each keyword's group representative; missing keywords become pd.NA.
tweets['keyword_grouped'] = tweets['keyword'].map(
    lambda kw: key_grouped[kw] if not pd.isna(kw) else pd.NA
)

text_contain_keyword (bool): indica si el texto del tweet contiene la keyword agrupada (keyword_grouped); en los casos en que no hay keyword el valor es nulo. Las comparaciones de los strings se hacen convirtiéndolos previamente a minúsculas

In [4]:
def serie_contain_other_serie(x, y):
    """Case-insensitive check of whether string ``y`` occurs inside string ``x``.

    Returns ``pd.NA`` when ``y`` is missing, otherwise a bool.
    """
    if not pd.isna(y):
        haystack, needle = x.lower(), y.lower()
        return needle in haystack
    return pd.NA

# Row by row: does the tweet text contain its grouped keyword?
tweets['text_contain_keyword'] = tweets.apply(
    lambda row: serie_contain_other_serie(row['text'], row['keyword_grouped']),
    axis=1,
)


total_words (numérica discreta): total de palabras del texto (se considera palabra todo aquello que está separado por un espacio)

In [5]:
# Number of space-separated pieces in each text.
tweets['total_words'] = tweets['text'].map(lambda text: len(text.split(" ")))

len_text (numérica discreta): total de caracteres del texto

In [6]:
# Character count of each text.
tweets['len_text'] = tweets['text'].map(len)


total_upper_chars (numérica discreta): total de caracteres en mayúsculas del texto

In [7]:
def get_upper_total(s):
    """Count the characters of ``s`` that are ASCII uppercase letters (A-Z).

    Uppercase letters outside that range (e.g. accented ones) are not counted.
    """
    return sum(1 for ch in s if 'A' <= ch <= 'Z')

# Uppercase-letter count per text.
tweets['total_upper_chars'] = tweets['text'].map(get_upper_total)

total_numbers_chars (numérica discreta): total de caracteres numéricos del texto

In [8]:
def get_total_numbers_chars(s):
    """Count the characters of ``s`` that are ASCII digits (0-9)."""
    return sum(1 for ch in s if '0' <= ch <= '9')

# Digit count per text.
tweets['total_numbers_chars'] = tweets['text'].map(get_total_numbers_chars)

total_special_chars (numérica discreta): total de caracteres especiales del texto

In [9]:
def get_special_chars_total(s):
    """Count the "special" characters of ``s``.

    A character is special when it is not an ASCII letter, not an ASCII digit
    and not one of the common characters: space, ``.``, ``?``, ``,``, ``!``.
    """
    common = frozenset(" .?,!")

    def is_ordinary(ch):
        return (
            'a' <= ch <= 'z'
            or 'A' <= ch <= 'Z'
            or '0' <= ch <= '9'
            or ch in common
        )

    return sum(1 for ch in s if not is_ordinary(ch))

# Special-character count per text.
tweets['total_special_chars'] = tweets['text'].map(get_special_chars_total)

total_common_chars (numérica discreta): total de caracteres comunes del texto

In [10]:
def get_common_chars_total(s):
    """Count the common characters of ``s``: space, ``.``, ``?``, ``,`` and ``!``."""
    common = frozenset(" .?,!")
    return sum(1 for ch in s if ch in common)

# Common-character count per text.
tweets['total_common_chars'] = tweets['text'].map(get_common_chars_total)

contain_question (bool): contiene preguntas el texto

In [11]:
# True when the text has at least one question mark.
tweets['contain_question'] = tweets['text'].map(lambda text: "?" in text)

contain_link (bool): contiene enlaces el texto

In [12]:
# True when the substring "http" appears anywhere in the text.
tweets['contain_link'] = tweets['text'].map(lambda text: "http" in text)

contain_hashtag (bool): contiene hashtag el texto

In [13]:
# True when the text has at least one '#' character.
tweets['contain_hashtag'] = tweets['text'].map(lambda text: "#" in text)

contain_upper_words (bool): indica si el texto contiene palabras escritas totalmente en mayúsculas (de al menos 3 caracteres)

In [14]:
def contain_upper_words(s):
    """Tell whether ``s`` has a space-separated word longer than 2 characters
    for which ``str.isupper()`` is true."""
    return any(len(word) > 2 and word.isupper() for word in s.split(" "))

# Flag texts that have at least one fully uppercase word.
tweets['contain_upper_words'] = tweets['text'].map(contain_upper_words)

total_n_words (numérica discreta): total de palabras de n caracteres del texto

In [15]:
def get_nlenght_words_total(n, s):
    """Count the space-separated words of ``s`` that are exactly ``n`` characters long."""
    return sum(1 for word in s.split(" ") if len(word) == n)

# One column per exact word length, from 3 to 8 characters.
for word_len in range(3, 9):
    tweets['total_{}_words'.format(word_len)] = tweets['text'].map(
        lambda text, n=word_len: get_nlenght_words_total(n, text)
    )


total_n_ormore_words (numérica discreta): total de palabras de al menos n caracteres del texto

In [16]:
def get_n_ormore_lenght_words_total(n, s):
    """Count the space-separated words of ``s`` that have at least ``n`` characters."""
    return sum(1 for word in s.split(" ") if len(word) >= n)

# One column per minimum word length, from 3 to 8 characters.
for word_len in range(3, 9):
    tweets['total_{}_ormore_words'.format(word_len)] = tweets['text'].map(
        lambda text, n=word_len: get_n_ormore_lenght_words_total(n, text)
    )

total_n_orless_words (numérica discreta): total de palabras no superiores a n caracteres en el texto

In [17]:
def get_n_orless_lenght_words_total(n, s):
    """Count the space-separated words of ``s`` that have at most ``n`` characters.

    Empty pieces produced by consecutive spaces have length 0 and are counted.
    """
    return sum(1 for word in s.split(" ") if len(word) <= n)

# One column per maximum word length, from 3 to 8 characters.
for word_len in range(3, 9):
    tweets['total_{}_orless_words'.format(word_len)] = tweets['text'].map(
        lambda text, n=word_len: get_n_orless_lenght_words_total(n, text)
    )

subjectivity_text (numérica continua): subjetividad del texto según TextBlob

In [18]:
def subjectivityText(x):
    """Return the subjectivity value TextBlob reports for the text ``x``."""
    sentiment = TextBlob(x).sentiment
    return sentiment.subjectivity

# TextBlob subjectivity of each text.
tweets['subjectivity_text'] = tweets['text'].map(subjectivityText)

polarity_text (numérica continua): polaridad del texto según TextBlob

In [19]:
def polarityText(x):
    """Return the polarity value TextBlob reports for the text ``x``."""
    sentiment = TextBlob(x).sentiment
    return sentiment.polarity
# TextBlob polarity of each text.
tweets['polarity_text'] = tweets['text'].map(polarityText)

No tomar en cuenta

In [20]:
# Nominatim geocoder client used by getGeoData, created with the "orga_datos" user agent.
geolocator = Nominatim(user_agent="orga_datos")
def getGeoData(x):
    """Geocode a free-text location with the module-level ``geolocator``.

    Parameters
    ----------
    x : str or missing value
        Location text to look up.

    Returns
    -------
    tuple or pd.NA
        ``(address, latitude, longitude)`` of the match, or ``pd.NA`` when
        ``x`` is missing or the geocoder returns no result.
    """
    if pd.isna(x):
        return pd.NA

    # NOTE(review): "UC" does not look like a standard two-letter country code
    # (perhaps "UK"/"GB" was intended) -- confirm before enabling this feature.
    location = geolocator.geocode(
        x, timeout=1, country_codes=["US", "UC", "CA", "IN", "AU", "FR"]
    )

    # None is a singleton, so compare by identity rather than equality.
    if location is None:
        return pd.NA
    return (location.address, location.latitude, location.longitude)

#tweets['address'] = tweets.location.transform(lambda x: getGeoData(x))

In [21]:
# Move 'target' to the last column: remove it from the frame, then append it again.
target = tweets.pop('target')
tweets['target'] = target
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   keyword               7552 non-null   object 
 1   location              5080 non-null   object 
 2   text                  7613 non-null   object 
 3   keyword_grouped       7552 non-null   object 
 4   text_contain_keyword  7552 non-null   object 
 5   total_words           7613 non-null   int64  
 6   len_text              7613 non-null   int64  
 7   total_upper_chars     7613 non-null   int64  
 8   total_numbers_chars   7613 non-null   int64  
 9   total_special_chars   7613 non-null   int64  
 10  total_common_chars    7613 non-null   int64  
 11  contain_question      7613 non-null   bool   
 12  contain_link          7613 non-null   bool   
 13  contain_hashtag       7613 non-null   bool   
 14  contain_upper_words   7613 non-null   bool   
 15  total_3_words        

In [22]:
#tweets.to_csv('data/train_pre_processing.csv', index=False)