In [1]:
import pandas as pd
pd.set_option('max_colwidth', -1)
import string
import nltk
nltk.download('stopwords')
from nltk import word_tokenize
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package stopwords to /home/ndv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ndv/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df = pd.read_csv('../dataset/train.csv')
df_test = pd.read_csv('../dataset/test.csv')
df.drop(columns=['location', 'keyword', 'target'], inplace=True)
df_test.drop(columns=['location', 'keyword'], inplace=True)
df = df.merge(df_test, how='outer')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10876 entries, 0 to 10875
Data columns (total 2 columns):
id      10876 non-null int64
text    10876 non-null object
dtypes: int64(1), object(1)
memory usage: 254.9+ KB


# Features Estadísticos sobre el texto.

Estos features están basados en el trabajo realizado en el TP1.

Antes de extraer los features, vamos a realizar un preprocesamiento para limpiar un poco el texto de los tweets de la misma forma que lo hicimos en el TP1.

Quitaremos stopwords, links, hashtags y menciones a usuarios.


In [4]:
# Quitamos las urls
df['text'] = df['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['text'] = df['text'].apply(clean_text)

df['word_count'] = df['text'].apply(lambda x: len(x.split(' ')))

In [5]:
df.head()

Unnamed: 0,id,text,word_count
0,1,deeds reason earthquake may allah forgive us,7
1,4,forest fire near la ronge sask canada,7
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,11
3,6,13000 people receive wildfires evacuation orders california,8
4,7,got sent photo ruby alaska smoke wildfires pours school,10


In [6]:
df['unique_words_count'] = df['text'].apply(lambda x: len(set(x.split(' '))))

In [7]:
word_count_mean = df['word_count'].mean()
unique_words_count_mean = df['unique_words_count'].mean()
df['word_count_above_mean'] = (df['word_count'] > word_count_mean).astype(int)
df['unique_words_count_above_mean'] = (df['unique_words_count'] > unique_words_count_mean).astype(int)
df.head()

Unnamed: 0,id,text,word_count,unique_words_count,word_count_above_mean,unique_words_count_above_mean
0,1,deeds reason earthquake may allah forgive us,7,7,0,0
1,4,forest fire near la ronge sask canada,7,7,0,0
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,11,9,1,0
3,6,13000 people receive wildfires evacuation orders california,8,8,0,0
4,7,got sent photo ruby alaska smoke wildfires pours school,10,10,1,1


In [8]:
df['char_count'] = df['text'].apply(lambda x: len(x)-x.count(" "))
char_count_mean = df['char_count'].mean()
df['char_count_above_mean'] = (df['char_count'] > char_count_mean).astype(int)

In [9]:
df.head()

Unnamed: 0,id,text,word_count,unique_words_count,word_count_above_mean,unique_words_count_above_mean,char_count,char_count_above_mean
0,1,deeds reason earthquake may allah forgive us,7,7,0,0,38,0
1,4,forest fire near la ronge sask canada,7,7,0,0,31,0
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,11,9,1,0,78,1
3,6,13000 people receive wildfires evacuation orders california,8,8,0,0,53,1
4,7,got sent photo ruby alaska smoke wildfires pours school,10,10,1,1,47,0


In [10]:
def avg(data):
    return sum(data) / len(data)

df['avg_word_length'] = df['text'].apply(lambda x: avg([len(word) for word in x.split(' ')]))

In [11]:
df.head()

Unnamed: 0,id,text,word_count,unique_words_count,word_count_above_mean,unique_words_count_above_mean,char_count,char_count_above_mean,avg_word_length
0,1,deeds reason earthquake may allah forgive us,7,7,0,0,38,0,5.428571
1,4,forest fire near la ronge sask canada,7,7,0,0,31,0,4.428571
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,11,9,1,0,78,1,7.090909
3,6,13000 people receive wildfires evacuation orders california,8,8,0,0,53,1,6.625
4,7,got sent photo ruby alaska smoke wildfires pours school,10,10,1,1,47,0,4.7


In [12]:
# Este valor fue tomado del repositorio del módulo VADER.
POLARITY_TRESHOLD = 0.05

def obtain_sentiment(text):
    scores = analyzer.polarity_scores(text)
    
    sentiment = 'neu'
    if scores['compound'] >= POLARITY_TRESHOLD:
        sentiment = 'pos'
    if scores['compound'] <= -POLARITY_TRESHOLD:
        sentiment = 'neg'
    
    text_blob = TextBlob(text)
    subjectivity = text_blob.sentiment.subjectivity
    
    return sentiment, subjectivity, scores['compound']

df['sentiment'], df['subjectivity'], df['polarity'] = zip(*df['text'].apply(obtain_sentiment))
df.head()

Unnamed: 0,id,text,word_count,unique_words_count,word_count_above_mean,unique_words_count_above_mean,char_count,char_count_above_mean,avg_word_length,sentiment,subjectivity,polarity
0,1,deeds reason earthquake may allah forgive us,7,7,0,0,38,0,5.428571,pos,0.0,0.2732
1,4,forest fire near la ronge sask canada,7,7,0,0,31,0,4.428571,neg,0.4,-0.34
2,5,residents asked shelter place notified officers evacuation shelter place orders expected,11,9,1,0,78,1,7.090909,neu,0.4,0.0
3,6,13000 people receive wildfires evacuation orders california,8,8,0,0,53,1,6.625,neu,0.0,0.0
4,7,got sent photo ruby alaska smoke wildfires pours school,10,10,1,1,47,0,4.7,neu,0.0,0.0


In [13]:
df = pd.get_dummies(df, columns=['sentiment'], prefix='', prefix_sep='')

In [14]:
df.drop(columns=['text']).to_csv('../features/features_basicas_texto.csv', index=False)