In [1]:
import sys
import os
from os.path import join
from pprint import pprint
sys.path.append(os.path.abspath("../../../"))
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display, HTML
from IPython.display import Markdown
import ipywidgets as iw
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xml.etree.ElementTree as et
from sklearn.model_selection import train_test_split

In [2]:
import usure.common.logging as usurelogging
from usure.config import config
from usure.classification.infrastructure import BasicSentenceCleaner

Using TensorFlow backend.


In [4]:
%%javascript
// Replace the output area's _should_scroll hook with a function that always
// returns false. Presumably this keeps long outputs from being collapsed into
// a scroll box in the classic notebook UI — TODO confirm for the front-end in use.
IPython.OutputArea.prototype._should_scroll = lines => false

<IPython.core.display.Javascript object>

In [5]:
# Notebook-wide pandas display settings; the two options are independent.
# Allow up to 200 characters per cell so tweet texts are not cut short.
pd.set_option('display.max_colwidth', 200)
# Left-justify column headers in rendered tables.
pd.set_option('colheader_justify', 'left')

In [6]:
def display_classes_and_total(df, description):
    """Show a heading, the row counts per polarity class and the total row count.

    Args:
        df: DataFrame that has a ``polarity`` column.
        description: Text rendered as a level-3 Markdown heading above the table.
    """
    heading = Markdown(f'### {description}')
    display(heading)
    # count() reports, for each remaining column, the non-null rows per class.
    counts_by_polarity = df.groupby(['polarity']).count()
    display(counts_by_polarity)
    total_rows = len(df)
    print(f"Total: {total_rows}")
    
def save_data_frame(df, name):
    """Write ``df`` to ``<name>.csv`` inside the directory given by ``config.sets``.

    Args:
        df: DataFrame to persist (its index is written too, as ``to_csv`` does by default).
        name: File name without the ``.csv`` extension.
    """
    target_path = join(config.sets, f"{name}.csv")
    df.to_csv(target_path)

In [7]:
# Common column labels given to every InterTASS frame built in this notebook.
column_names = ["id", "text", "polarity"]

In [8]:
#https://stackoverflow.com/questions/17071871/select-rows-from-a-dataframe-based-on-values-in-a-column-in-pandas
def _load_tweets_from_xml(xml_path):
    """Parse an InterTASS XML file into a DataFrame indexed by tweet id.

    Each <tweet> element becomes exactly one row, so the id, text and polarity
    of a row always come from the same tweet (three independent ``findall``
    lists could silently misalign if a tweet lacked or repeated an element).
    A child element that is missing, or present without text, yields None.

    Args:
        xml_path: Path of the XML file to parse.

    Returns:
        DataFrame with ``text`` and ``polarity`` columns, indexed by the
        int64 ``id`` read from <tweetid>.

    Raises:
        TypeError / ValueError: if a tweet has no usable <tweetid>, because the
            id column cannot be converted to int64.
    """
    def text_of(tweet, path):
        node = tweet.find(path)
        return None if node is None else node.text

    root = et.parse(xml_path).getroot()
    rows = [
        (text_of(tweet, "tweetid"),
         text_of(tweet, "content"),
         text_of(tweet, "sentiment/polarity/value"))
        for tweet in root.findall("./tweet")
    ]
    tweets = pd.DataFrame(rows, columns=column_names)
    tweets["id"] = tweets["id"].astype('int64')
    return tweets.set_index("id")


#All
excel_columns = ["id_str", "text", "polarity"]
intertass_cr = (
    pd.read_excel(join(config.classification, "country_CR_InterTASS.xls"),
                  usecols=excel_columns,
                  sheet_name="tweets")
    # Rename by label rather than by position: the columns come back in sheet
    # order, which is not guaranteed to match the order given in usecols.
    .rename(columns=dict(zip(excel_columns, column_names)))
    .loc[:, column_names]
    .set_index("id")
)
# NOTE(review): train/test ids are cast to int64 below; confirm the Excel ids
# also load as int64, otherwise index comparisons between the sets never match.
#train
intertass_cr_train = _load_tweets_from_xml(
    join(config.classification, "intertass-CR-train-tagged.xml"))
#test
intertass_cr_test = _load_tweets_from_xml(
    join(config.classification, "intertass-CR-test.xml"))

#display
display_classes_and_total(intertass_cr, "Whole CR dataset")
display_classes_and_total(intertass_cr_train, "CR train dataset")
display_classes_and_total(intertass_cr_test, "CR test dataset")

### Whole CR dataset

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
N,912
NEU,297
NONE,447
P,677


Total: 2333


### CR train dataset

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
N,311
NEU,94
NONE,165
P,230


Total: 800


### CR test dataset

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1


Total: 1233


In [8]:
# Rows of the official train / test files whose id is absent from the whole CR set.
train_not_in_all_set = intertass_cr_train.loc[
    lambda frame: ~frame.index.isin(intertass_cr.index)
]

test_not_in_all_set = intertass_cr_test.loc[
    lambda frame: ~frame.index.isin(intertass_cr.index)
]

In [9]:
# Shuffle every row of the whole CR set. A fixed seed makes this shuffle yield
# the same row order on every run instead of a new one each time.
# NOTE(review): despite the name, this is the full data set, not a test split.
shuffled_cr_test = intertass_cr.sample(frac=1, random_state=42)

In [10]:
# Stratified 50/50 split of texts and labels, keeping the class proportions of
# `polarity` in both halves. random_state is fixed (it was None) so that, for a
# given input order, the same rows land in each half on every run.
X_train, X_test, y_train, y_test = train_test_split(shuffled_cr_test["text"], 
                                                    shuffled_cr_test["polarity"], 
                                                    test_size=.5,
                                                    random_state=42,
                                                    shuffle=True,
                                                    stratify = shuffled_cr_test["polarity"])

In [11]:
# Put each half's texts and labels back side by side (aligned on the tweet id index).
# Note the naming: `X` is the complete train frame and `y` the complete test
# frame — both hold text AND polarity.
X = pd.concat((X_train, y_train), axis='columns', join='inner')
y = pd.concat((X_test, y_test), axis='columns', join='inner')

In [12]:
# Show the per-polarity counts and total size of each half of the new split.
display_classes_and_total(X, "New CR train set")
display_classes_and_total(y, "New CR test set")

### New CR train set

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
N,456
NEU,149
NONE,223
P,338


Total: 1166


### New CR test set

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
N,456
NEU,148
NONE,224
P,339


Total: 1167


In [18]:
# Persist both halves as train.csv / test.csv under config.sets.
save_data_frame(X, "train")
save_data_frame(y, "test")

In [16]:
# Bare last expression: renders the new train frame (text + polarity) as the cell output.
X

Unnamed: 0_level_0,text,polarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1
792469210567888896,1 minuto de silencio por los que trabajamos en horario de Mall y no podemos ver el clásico,N
815626648443830273,"Lo bueno es que desayuné al fin comida descente, y todos los ejercicios me han dado.",P
814711564150444032,"@aniu96 @Chuz_CM Yo no puedo ingerir bebidas alcoholicas, ni marihuana, ni tabaco. Pero yo los acompaño.",NONE
803994481938853889,"@bichoclarke leí el post equivocado aparentemente, me salieron los diseñadores snobs en mi feed",N
766318704288202753,"La verdad, es que estoy en mis mejores tiempos",P
807261162358394882,A mí en serio esas cosas de ganar no se me dan,N
766097943954857984,Extraño esas conversaciones largas con Vanessa,N
787984763713826816,Entró a trabajar a la 1:30 p.m y yo en San Carlos Mi tío cree que voy a llegar puntual.,NONE
782808577471713281,@laufer4 le hiciste la noche a Ángel de Brito y la previa de la siguiente,P
808122577818439688,Que triste cuando uno anda en una actividad y no tiene con quien bailar me veré muy rara bachateando sola en media pista ?,N


In [25]:
# Token count per tweet: split each text on whitespace and count the pieces.
# The vectorised .str accessor replaces the per-row Python lambda.
# Note: this adds a "length" column to intertass_cr in place.
intertass_cr["length"] = intertass_cr["text"].str.split().str.len()
display(intertass_cr.describe())

Unnamed: 0,length
count,2333.0
mean,14.035148
std,5.68955
min,4.0
25%,9.0
50%,13.0
75%,18.0
max,33.0


In [54]:
cleaner = BasicSentenceCleaner(config.assets)
# The target container was never created anywhere in this notebook, so running
# it on a fresh kernel raised NameError on the assignment below. Create an
# empty frame that shares the tweet-id index before filling it.
intertass_cr_preprocecessed = pd.DataFrame(index=intertass_cr.index)
# Run every raw tweet text through the cleaner.
intertass_cr_preprocecessed["text"] = intertass_cr["text"].apply(cleaner.clean)

In [55]:
# Token count of each cleaned text (whitespace split), computed with the
# vectorised .str accessor instead of a per-row Python lambda.
intertass_cr_preprocecessed["length"] = intertass_cr_preprocecessed["text"].str.split().str.len()

In [56]:
# Summary statistics of the token counts after cleaning.
intertass_cr_preprocecessed["length"].describe()

count    2333.000000
mean        5.096014
std         2.401266
min         0.000000
25%         3.000000
50%         5.000000
75%         7.000000
max        15.000000
Name: text, dtype: float64