In [276]:

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import nltk
import emoji

from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go


from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.tree import plot_tree
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import eli5



pd.set_option('display.max_colwidth', None)



plt.style.use('default') # haciendo los graficos un poco mas bonitos en matplotlib
plt.rcParams['figure.figsize'] = (20, 10)
sns.set(style="whitegrid") # seteando tipo de grid en seaborn
pd.options.display.float_format = '{:20,.10f}'.format # suprimimos la notacion cientifica en los outputs

SMALL_SIZE = 8
MEDIUM_SIZE = 11
BIGGER_SIZE = 14

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


from IPython.core.pylabtools import figsize
plt.rcParams['figure.figsize'] = (9, 6)


# Funciones Generales

In [277]:

NOTICIEROS = ['cnn', 'bbc', 'reuters', 'blomberg', 'herald', 'today', 'herald', 'abc', 'fox', 'cbs', \
            'yorker', 'daily', 'financial', 'guardian', 'sun', 'observer', 'five', 'jazeera', 'news', 'live']

PAIS_DESASTRES = ['usa', 'united states', 'canada', 'island', 'japan', 'nigeria', 'turkey', 'saudi', \
                  'malaysia']

CIUDAD_DESASTRES = ['california', 'new york', 'nyc', 'hawaii', 'hiroshima', 'fukushima', 'calgary']
 
DESASTRES_CONOCIDOS = ['mh370', 'air', 'huracan', 'fire', 'water', 'kill', 'disaster', 'suicide', 'police', 'people', \
                      'flood', 'police', 'home', 'family', 'train', 'storm', 'building', 'crash', 'hiroshima', \
                      'bomb', 'isis', 'bombing', 'wreckage', 'earthquake', 'wild', 'wildfire', 'weather', \
                      'violent', 'attack', 'tornado', 'traffic', 'thunder', 'terror', 'suspect', 'swallow',\
                      'sin', 'severe', 'security', 'refugee', 'refugio', 'release', 'report', 'rescue', 'plane',\
                      'passenger', 'offic', 'nuclear', 'north', 'natural', 'national', 'murder', 'migrant', \
                      'mass', 'land', 'issue', 'hurricane', 'injured', 'hostage', 'flood', 'evacua']
ZONAS_CONFLICTO = ['israel', 'iran', 'turkey']




CORTE_LENGHT_TEXT = 100

#Imprimir metricas de las predicciones
def mostrar_metricas(y_test, y_pred):
    print(classification_report(y_test, y_pred))
    print(confusion_matrix (y_test, y_pred))

    # Obtener y reformar la matriz de datos de 
    matrix = confusion_matrix (y_test, y_pred) 
    matrix = matrix.astype ('float') / matrix.sum (axis = 1) [:, np.newaxis] 

    # Build the plot
    plt.figure()
    sns.set(font_scale=1.4)
    sns.heatmap(matrix, annot=True, annot_kws={'size':10},cmap=plt.cm.Greens, linewidths=0.2)

    plt.xlabel('Predicción')
    plt.ylabel('Target')
    plt.title('Matriz de confusión')
    plt.show()
    
def grafico_distr(df, columna, titulo, xtitulo, ytitulo):
    x1 = df.loc[df['target'] == 1][columna]
    x2 = df.loc[df['target'] == 0][columna]
    group_labels = ['Verdadero', 'Falso']
    colors = ['rgb(0, 0, 100)', 'rgb(0, 200, 200)']
    fig = ff.create_distplot([x1, x2], group_labels,colors=colors)
    fig.update_layout(title_text=titulo,
                      xaxis_title=xtitulo,
                      yaxis_title=ytitulo,
                      template="plotly_white")
    fig.show()

# Referencia a un diario o noticiero
def news_ref(text):
    for word in NOTICIEROS:
        if word in text:
            return True
    return False

#Categoría ancho mensajes
def text_lenght_category(text):
    if len(text) > CORTE_LENGHT_TEXT:
        return True
    else:
        return False
def desastre_conocido(text):
    for dis in DESASTRES_CONOCIDOS:
        if dis in text:
            return True
    return False

#Pais de desastre
def es_pais_de_desastres(text):
    for word in text.split():
        if word in PAIS_DESASTRES:
            return True
    return False

#Ciudad de desastre
def es_ciudad_de_desastres(text):
    for word in text.split():
        if word in CIUDAD_DESASTRES:
            return True
    return False

def resultados(pred, test_df):
    res_df=pd.DataFrame(test_df['id'])
    res_df['target']=pred
    res_df.to_csv('data/submission.csv', index=False)

In [278]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.corpus import brown
from nltk.tag import pos_tag
from textblob import TextBlob

import re, string
from string import punctuation


stops = set(stopwords.words('english'))
punctuation = list(string.punctuation)
#punctuation = '!"$%&\'()*+,-./:;<=>?@[\\]“”^_`{|}~’'


word_list = brown.words()
word_set = set(word_list)

def eliminar_char_espec(text):
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    return text
def idioma(text):
    tb = TextBlob(text)
    return tb.detect_language()

def sentimiento(text):
    tb = TextBlob(text)
    return tb.sentiment[0]

def unir_texto(text):
    return ( ' '.join(text))

def eliminar_palabras_con(text, con):
    palabras = []
    for word in text.split():
        if con not in word:
            palabras.append(word)
    return unir_texto(palabras)


# Tipo de palabra
#https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

#Lematizar palabras
englishStemmer=SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

#Elimino stops words 
def eliminar_sw(text):
    final_text = []
    for i in text.split():
        if i.strip() not in stops: 
            pos = pos_tag([i.strip()])
            #word = lemmatizer.lemmatize(i.strip(),get_simple_pos(pos[0][1]))
            final_text.append(i.strip())
    return unir_texto(final_text)
def cant_stopwords(text):
    cant = 0
    for i in text.split():
        if i.strip() in stops: 
            cant = cant + 1
    return cant
def minusculas(text):
    return text.lower()

def eliminar_nums(text):
    cadena = []
    for x in text:
         if x not in string.digits:
                cadena.append(x)
    return  ''.join(cadena)

def eliminar_punct(text):
    cadena = []
    for x in text:
         if x not in string.punctuation:
                cadena.append(x)
    return  ''.join(cadena)
        
#Generar CSV para el envio
def formatear_texto(text):
    text = minusculas(text)
    text = eliminar_palabras_con(text, 'http')
    text = eliminar_palabras_con(text, '@')
    #text = eliminar_palabras_con(text, '#')
    #text = eliminar_sw(text)
    text = eliminar_punct(text)
    text = eliminar_nums(text)
    text = eliminar_char_espec(text)
    return text.strip()
def eliminar_palabras_especiales(text):
    text = eliminar_palabras_con(text, 'http')
    text = eliminar_palabras_con(text, '@')
    #text = eliminar_palabras_con(text, '#')
    return text

def cant_err_ortograficos(text):
    cant = 0
    for w in text.split():
        if w not in word_set:
            cant = cant + 1
    return cant
def eliminar_cortas(text):
    cadena = []
    for word in text.split():
        if len(word) > 1:
            cadena.append(word)
    return  ' '.join(cadena)

def convertir_relevantes(text):
    #text = text.replace('\'','')
    text = text.replace('&amp', 'and')
    text = text.replace('?',' ')
    text = text.replace('-',' ')
    text = text.replace('iam','i am')
    text = text.replace('i\'m','i am')
    text = text.replace(' im ','i am')
    text = text.replace('cant','can not')
    text = text.replace(' u ',' you ')
    text = text.replace(' youre ',' you are ')
    text = text.replace(' lol ','')
    text = text.replace('rt ','')
    text = text.replace(' ive ',' i have ')
    text = text.replace(' hes ',' hi is ')
    text = text.replace(' thats ',' that is ')
    text = text.replace(' ill ',' i will ')
    text = text.replace(' yd ',' years ')
    text = text.replace('it s','its')
    text = text.replace('fvck','fuck')
    text = text.replace('pm','')
    text = text.replace(' rn ',' right now ')
    text = text.replace(' 3p ',' ')
    text = text.replace(' n ',' ')
    text = text.replace(' da ',' the ')
    text = text.replace('i ve','i have')
    return text



In [279]:
def formato_inicial(df):
    df['location'].fillna(value='sin location', inplace=True)
    df['keyword'].fillna(value='sin keyword', inplace=True)
    
    #Minúsculas
    df['keyword'] = df.keyword.str.lower()
    df['location'] = df.keyword.str.lower()
    df['keyword'] = df.keyword.str.replace('%20', ' ')
    df['location'] = df.location.str.replace('%20', ' ')
    
    #Sobre text_clean
    df['text_clean'] = df.text.str.replace('%20', ' ')
    df['text_clean'] = df.text_clean.apply(eliminar_palabras_especiales)


    
def nuevas_columnas(df):
    #Sobre el texto original
    df['cant_caracteres'] = df.text.str.len()
    
    #Sobre el texto formateado
    df['cant_stopwords'] = df.text_clean.apply(cant_stopwords)
    df['cant_palabras'] = df.text_clean.str.split().str.len()
    df['cant_mayus'] = df.text_clean.str.count(r'[A-Z]')
    df['text_clean'] = df.text_clean.str.lower()

    
    #Sobre lo calculado
    df['cant_stopwords_pct'] = df.cant_stopwords/df.cant_palabras
    df['cant_mayus_pct'] = df.cant_mayus/df.cant_caracteres

    
    #Sobre el léxico
    train_df['sentimiento'] = train_df.text_clean.apply(sentimiento)


In [280]:

'''
train_df['text_sin_sw'] = train_df.text.apply(convertir_relevantes)
train_df['text_sin_sw'] = train_df.text_sin_sw.apply(formatear_texto)
train_df['text_sin_sw'] = train_df.text_sin_sw.apply(eliminar_cortas)


train_df['keyword'] = train_df.keyword.str.lower()
train_df['keyword'] = train_df.keyword.str.replace('%20', ' ')


train_df['location'] = train_df.location.str.lower()
train_df['location'] = train_df.location.str.replace('%20', ' ')


train_df['word_count'] = train_df.text_sin_sw.str.split().str.len()
train_df['word_lenght'] = train_df.text.apply(len)

train_df['cant_stop_words'] = train_df.text_sin_sw.apply(cant_stopwords)
#train_df['cant_err_orto'] =train_df.text_sin_sw.apply(cant_err_ortograficos)

train_df['http'] = train_df.text.str.contains('http')
train_df['ref_noticias'] = train_df.text.apply(news_ref)



#train_df[train_df.target == 0]['location'].str.split(expand=True).stack().value_counts().to_csv('data/sample0.csv')
#train_df[train_df.target == 1]['location'].value_counts().to_csv('data/sample1.csv')
#cols= ['word_lenght']
#df = pd.get_dummies(train_df, columns=cols, drop_first=True)
'''

train_df = pd.read_csv('data/train.csv', encoding='utf-8')
test_df = pd.read_csv('data/test.csv', encoding='utf-8')
formato_inicial(train_df)
nuevas_columnas(train_df)

"\ntrain_df['text_sin_sw'] = train_df.text.apply(convertir_relevantes)\ntrain_df['text_sin_sw'] = train_df.text_sin_sw.apply(formatear_texto)\ntrain_df['text_sin_sw'] = train_df.text_sin_sw.apply(eliminar_cortas)\n\n\ntrain_df['keyword'] = train_df.keyword.str.lower()\ntrain_df['keyword'] = train_df.keyword.str.replace('%20', ' ')\n\n\ntrain_df['location'] = train_df.location.str.lower()\ntrain_df['location'] = train_df.location.str.replace('%20', ' ')\n\n\ntrain_df['word_count'] = train_df.text_sin_sw.str.split().str.len()\ntrain_df['word_lenght'] = train_df.text.apply(len)\n\ntrain_df['cant_stop_words'] = train_df.text_sin_sw.apply(cant_stopwords)\n#train_df['cant_err_orto'] =train_df.text_sin_sw.apply(cant_err_ortograficos)\n\ntrain_df['http'] = train_df.text.str.contains('http')\ntrain_df['ref_noticias'] = train_df.text.apply(news_ref)\n\n\n\n#train_df[train_df.target == 0]['location'].str.split(expand=True).stack().value_counts().to_csv('data/sample0.csv')\n#train_df[train_df.targ

In [281]:
train_df.sample(2)

Unnamed: 0,id,keyword,location,text,target,text_clean,cant_caracteres,cant_stopwords,cant_palabras,cant_mayus,cant_stopwords_pct,cant_mayus_pct,sentimiento
5809,8291,rioting,rioting,@fa07af174a71408 I have lived &amp; my family have lived in countries where looters were shot on sight where rioting wasn't tolerated. Why here,1,i have lived &amp; my family have lived in countries where looters were shot on sight where rioting wasn't tolerated. why here,143,10,22,2,0.4545454545,0.013986014,0.0
2713,3897,detonation,detonation,Ignition Knock (Detonation) Sensor-Senso Standard KS111 http://t.co/NXLEiIJFgS http://t.co/xsGwm5zXPd,0,ignition knock (detonation) sensor-senso standard ks111,101,0,6,8,0.0,0.0792079208,0.0


In [282]:
grafico_distr(train_df, 'cant_palabras', 'Gráfico de distr. de cantidad de palabras en text',  'Cantidad', '')
grafico_distr(train_df, 'cant_caracteres','Gráfico de dist. de cantidad de caracteres en text', 'Cantidad', '')
grafico_distr(train_df, 'cant_stopwords_pct','Gráfico de distr. porcentual de stopwords en text', 'Porcentaje', '')
grafico_distr(train_df, 'cant_mayus_pct', 'Gráfico de distr. porcentual de mayúsculas en text','Porcentaje', '')



In [283]:
#train_df['text_sin_sw'].str.split(expand=True).stack().value_counts().to_csv('data/sample.csv')
#train_df[train_df.text.str.contains('û_')]['text'].count()
#train_df[train_df.text_sin_sw.str.contains('i ve')].count()
#train_df[train_df.target == 0]['text_sin_sw'].str.split(expand=True).stack().value_counts().to_csv('data/sample.csv')


In [284]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objs as go
import chart_studio.plotly as py
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
#InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# Cantidad de palabras a usar
MAX_NB_WORDS = 50000
# Número maximo de palabras en el campo text.
MAX_SEQUENCE_LENGTH = 30
# Fijo.
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df['text_clean'].values)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = tokenizer.texts_to_sequences(train_df['text_clean'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(train_df['target']).values
print('Shape of label tensor:', Y.shape)



X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.25, random_state = 0)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


Found 15846 unique tokens.
Shape of data tensor: (7613, 30)
Shape of label tensor: (7613, 2)
(5709, 30) (5709, 2)
(1904, 30) (1904, 2)
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 30, 100)           5000000   
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 30, 100)           0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 202       
Total params: 5,080,602
Trainable params: 5,080,602
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test set
  Loss: 0.688
  Accuracy: 0.794


In [285]:
plt.figure()
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();

<IPython.core.display.Javascript object>

In [286]:
plt.figure()
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

<IPython.core.display.Javascript object>

# 0. XGBoost

In [287]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from xgboost import XGBClassifier

X = train_df['text_clean']
y = train_df['target']

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Doc vs Term
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# transform  to a normalized tf-idf representation 
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

alg = XGBClassifier(learning_rate=0.1, n_estimators=300, max_depth=17,
                    min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

alg.fit(X_train, y_train, eval_metric='auc')

y_pred = alg.predict(X_test)

#Métricas
mostrar_metricas(y_test, y_pred)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=17,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=27, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=27, subsample=0.6,
              tree_method='exact', validate_parameters=1, verbosity=None)

              precision    recall  f1-score   support

           0       0.77      0.86      0.81      1107
           1       0.77      0.64      0.70       797

    accuracy                           0.77      1904
   macro avg       0.77      0.75      0.75      1904
weighted avg       0.77      0.77      0.76      1904

[[951 156]
 [289 508]]


<IPython.core.display.Javascript object>

# 1. Random Forest Classifier

In [288]:
y = train_df['target']
columnas = ['id', 'keyword', 'location', 'text', 'target', 'text_clean']
X = train_df.drop(columnas, axis=1)



#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Train
RFC = RandomForestClassifier(n_estimators = 200,  max_depth=8, max_features='auto',
                             criterion='entropy', random_state = 0)

RFC.fit(X_train, y_train)

#Predict
y_pred = RFC.predict(X_test)

#Métricas
mostrar_metricas(y_test, y_pred)


#Features
features = np.array(X.columns)
importances = RFC.feature_importances_

#Plot
plt.figure()
plt.bar(X_train.columns, importances)
plt.title('Feature Importances')
plt.xlabel('Features')
plt.xticks(rotation=17)
plt.ylabel('Relative Importance')
plt.show()



#Tree
tree0 = RFC.estimators_[0]
fig, axes = plt.subplots(nrows = 1, ncols = 1,  figsize=(9,6), dpi=200)
plot_tree(tree0, feature_names = X_train.columns, filled = True)
plt.show()
'''

#Buscando parámetros 'buenos'
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

#Grid Search train
GSCV = GridSearchCV(estimator=RFC, param_grid=param_grid, cv= 5)
GSCV.fit(X_train, y_train)

# Best paramns
print(GSCV.best_params_)


'''


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=8, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

              precision    recall  f1-score   support

           0       0.70      0.72      0.71      1107
           1       0.59      0.57      0.58       797

    accuracy                           0.66      1904
   macro avg       0.65      0.64      0.64      1904
weighted avg       0.65      0.66      0.66      1904

[[797 310]
 [344 453]]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<BarContainer object of 7 artists>

Text(0.5, 1.0, 'Feature Importances')

Text(0.5, 0, 'Features')

([0, 1, 2, 3, 4, 5, 6], <a list of 7 Text major ticklabel objects>)

Text(0, 0.5, 'Relative Importance')

<IPython.core.display.Javascript object>

[Text(580.7310267857143, 872.6666666666666, 'cant_caracteres <= 64.5\nentropy = 0.986\nsamples = 3588\nvalue = [3253, 2456]'),
 Text(205.51339285714286, 770.0, 'cant_mayus <= 1.5\nentropy = 0.801\nsamples = 620\nvalue = [748, 241]'),
 Text(109.60714285714286, 667.3333333333333, 'sentimiento <= 0.113\nentropy = 0.721\nsamples = 340\nvalue = [437, 109]'),
 Text(59.78571428571429, 564.6666666666666, 'cant_stopwords_pct <= 0.437\nentropy = 0.777\nsamples = 277\nvalue = [343, 102]'),
 Text(26.571428571428573, 462.0, 'cant_caracteres <= 17.5\nentropy = 0.841\nsamples = 220\nvalue = [252, 93]'),
 Text(19.92857142857143, 359.33333333333326, 'entropy = 0.0\nsamples = 14\nvalue = [24, 0]'),
 Text(33.214285714285715, 359.33333333333326, 'cant_mayus_pct <= 0.054\nentropy = 0.868\nsamples = 206\nvalue = [228, 93]'),
 Text(26.571428571428573, 256.66666666666663, 'cant_caracteres <= 40.5\nentropy = 0.86\nsamples = 205\nvalue = [228, 90]'),
 Text(13.285714285714286, 154.0, 'cant_caracteres <= 36.5\nen

"\n\n#Buscando parámetros 'buenos'\nparam_grid = { \n    'n_estimators': [200, 500],\n    'max_features': ['auto', 'sqrt', 'log2'],\n    'max_depth' : [4,5,6,7,8],\n    'criterion' :['gini', 'entropy']\n}\n\n#Grid Search train\nGSCV = GridSearchCV(estimator=RFC, param_grid=param_grid, cv= 5)\nGSCV.fit(X_train, y_train)\n\n# Best paramns\nprint(GSCV.best_params_)\n\n\n"

### Random Forest Classifier(TFIDF)

In [289]:
X = train_df['text_clean']
y = train_df['target']

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Doc vs Term
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# transform  to a normalized tf-idf representation 
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)


# Train
RFC = RandomForestClassifier(n_estimators = 500,  max_depth=25, max_features='auto',
                             criterion='entropy', n_jobs=-1)


RFC.fit(X_train, y_train)

#Predict
y_pred = RFC.predict(X_test)

#Métricas
mostrar_metricas(y_test, y_pred)

'''

#Buscando parámetros 'buenos'
param_grid = { 
    'n_estimators': [100, 600],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [3,5,7,9,11, 13],
    'criterion' :['gini', 'entropy']
}

#Grid Search train
GSCV = GridSearchCV(estimator=RFC, param_grid=param_grid, cv= 5)
GSCV.fit(X_train, y_train)

# Best paramns
print(GSCV.best_params_)
'''


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=25, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

              precision    recall  f1-score   support

           0       0.69      0.99      0.81      1107
           1       0.97      0.38      0.55       797

    accuracy                           0.74      1904
   macro avg       0.83      0.69      0.68      1904
weighted avg       0.81      0.74      0.70      1904

[[1096   11]
 [ 491  306]]


<IPython.core.display.Javascript object>

"\n\n#Buscando parámetros 'buenos'\nparam_grid = { \n    'n_estimators': [100, 600],\n    'max_features': ['auto', 'sqrt', 'log2'],\n    'max_depth' : [3,5,7,9,11, 13],\n    'criterion' :['gini', 'entropy']\n}\n\n#Grid Search train\nGSCV = GridSearchCV(estimator=RFC, param_grid=param_grid, cv= 5)\nGSCV.fit(X_train, y_train)\n\n# Best paramns\nprint(GSCV.best_params_)\n"

# 2. Regresión Logística

In [290]:
X = train_df['text_clean']
y = train_df['target']

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Doc vs Term
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# transform  to a normalized tf-idf representation 
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

#Entrenamiento
LR = LogisticRegression(C=0.2, penalty= 'l2', multi_class='multinomial', n_jobs=4)
LR.fit(X_train, y_train)


#Métricas
y_pred = LR.predict(X_test)
mostrar_metricas(y_test, y_pred)

#Feature estimator
eli5.show_weights(estimator=LR,feature_names= list(count_vect.get_feature_names()),top=(20, 20))



LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=4, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

              precision    recall  f1-score   support

           0       0.78      0.92      0.85      1107
           1       0.85      0.64      0.73       797

    accuracy                           0.80      1904
   macro avg       0.82      0.78      0.79      1904
weighted avg       0.81      0.80      0.80      1904

[[1017   90]
 [ 283  514]]


<IPython.core.display.Javascript object>

Weight?,Feature
+1.477,in
+1.010,california
+0.968,fires
+0.968,hiroshima
+0.805,of
+0.771,fire
+0.766,suicide
+0.713,after
+0.701,train
+0.682,killed


# 3. KNN

In [291]:


X = train_df['text_clean']
y = train_df['target']

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Doc vs Term
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# transform  to a normalized tf-idf representation 
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)

#Train
KNN = KNeighborsClassifier(n_neighbors = 80, metric='minkowski')
KNN.fit(X_train, y_train)

#Métricas
y_pred = KNN.predict(X_test)
mostrar_metricas(y_test, y_pred)

'''

#Buscando un K 'bueno'
k_range = range(1,100, 3)
scores = []


for k in k_range:
    KNN = KNeighborsClassifier(n_neighbors = k, metric='minkowski')
    KNN.fit(X_train, y_train)
    scores.append(KNN.score(X_test, y_test))
   # accuracy = metrics.accuracy_score(y_test, y_pred) #Accuracy

plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores)
plt.show()

'''

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=80, p=2,
                     weights='uniform')

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1107
           1       0.85      0.62      0.72       797

    accuracy                           0.79      1904
   macro avg       0.81      0.77      0.78      1904
weighted avg       0.80      0.79      0.79      1904

[[1017   90]
 [ 303  494]]


<IPython.core.display.Javascript object>

"\n\n#Buscando un K 'bueno'\nk_range = range(1,100, 3)\nscores = []\n\n\nfor k in k_range:\n    KNN = KNeighborsClassifier(n_neighbors = k, metric='minkowski')\n    KNN.fit(X_train, y_train)\n    scores.append(KNN.score(X_test, y_test))\n   # accuracy = metrics.accuracy_score(y_test, y_pred) #Accuracy\n\nplt.figure()\nplt.xlabel('k')\nplt.ylabel('accuracy')\nplt.scatter(k_range, scores)\nplt.show()\n\n"

# 4. Multinomial NB

In [292]:


X = train_df['text_clean']
y = train_df['target']

#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Doc vs Term
count_vect = CountVectorizer()
X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

# transform  to a normalized tf-idf representation 
tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(X_train)
X_test = tfidf_transformer.transform(X_test)


# Train
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

#Predict
y_pred_gnb = MNB.predict(X_test)

#Métricas
mostrar_metricas(y_test, y_pred)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1107
           1       0.85      0.62      0.72       797

    accuracy                           0.79      1904
   macro avg       0.81      0.77      0.78      1904
weighted avg       0.80      0.79      0.79      1904

[[1017   90]
 [ 303  494]]


<IPython.core.display.Javascript object>