In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the train and test splits (comma-separated files, which is read_csv's default)
df_train = pd.read_csv("./new_classification_train.csv")
df_test = pd.read_csv("./new_classification_test.csv")

In [None]:
# Mostramos dataset de test
df_test

In [None]:
# Mostramos dataset de train
df_train

Como podemos ver, los datos de train tienen una columna más respecto a los datos de test. Esta columna es la de la variable a predecir (category)

In [None]:
df_train.describe()

In [None]:
from pandas_profiling import ProfileReport

profile = ProfileReport(df_train)

profile

Como podemos ver, el dataset está formado de 134.571 observaciones. 
En total, hay 37.743 celdas sin datos, por lo que vamos a tener que tener esto en cuenta.

La primera columna es un id, por lo que la podemos borrar, ya que no aporta nada de información, debido a que es un índice.

La segunda columna, es el headline. Hay 133.726 datos distintos y 3 faltantes, por lo que podemos concluir que hay algunos headlines repetidos en el dataset.

Algo similar ocurre con la columna authors. Solo el 20% de los datos son distintos, por lo que hay autores que se repiten. Esto puede ser útil porque tal vez un autor tenga preferencia a escribir noticias de una categoría en particular. Además, un dato no menor es que faltan 24.477 datos de autores. 

En cuanto a la columna short_description, podemos observar que en su gran mayoría los valores son distintos, pero hay algunas descripciones que se repiten. Además, también hay un gran número de valores faltantes (13.263).

De la columna date podemos decir que hay muchos datos repetidos, ya que solo el 1.7% son valores distintos. Además, no falta ningún valor.

Por último, la columna category es la de la variable a predecir. Hay 41 categorías distintas en todo el set de datos.



In [None]:
# Fill in missing values.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# selected column is chained assignment, which recent pandas versions warn about
# and which does not update the frame when copy-on-write is enabled.
df_train['headline'] = df_train['headline'].fillna('Unknown headline')
df_test['headline'] = df_test['headline'].fillna('Unknown headline')

In [None]:
# Assign the filled column back instead of using fillna(inplace=True) on a selected
# column, which does not update the frame when pandas copy-on-write is enabled.
df_train['short_description'] = df_train['short_description'].fillna('Unknown short description')
df_test['short_description'] = df_test['short_description'].fillna('Unknown short description')

In [None]:
# Assign the filled column back instead of using fillna(inplace=True) on a selected
# column, which does not update the frame when pandas copy-on-write is enabled.
df_train['authors'] = df_train['authors'].fillna('Unknown authors')
df_test['authors'] = df_test['authors'].fillna('Unknown authors')

In [None]:
from pandas_profiling import ProfileReport

profile = ProfileReport(df_train)

profile

In [None]:
df_train

In [None]:
# Drop the id column from both frames.
# It is an auto-incremental identifier equal to the row index, so it carries no information.
for frame in (df_train, df_test):
    frame.drop(columns=['id'], inplace=True)

In [None]:
df_train

In [None]:
# Cast the text columns to pandas' string dtype: they were loaded as object,
# but every one of them is expected to hold strings.
text_columns = ['headline', 'authors', 'short_description', 'date']
train_text_columns = text_columns + ['category']  # only the train split carries the target
df_train[train_text_columns] = df_train[train_text_columns].astype('string')
df_test[text_columns] = df_test[text_columns].astype('string')

In [None]:
df_train.info()

In [None]:
# Hay 2309 fechas distintas en el dataset
df_train['date'].unique()

In [None]:
# Hay 41 categorias distintas en el dataset
df_train['category'].unique()

In [None]:
plt.figure(figsize=(80,40))
df_train['category'].hist()

In [None]:
#Countplot por categoria
plt.figure(figsize=(50,5))
sns.countplot(data=df_train, x='category')

Como podemos observar, la categoría con más noticias es Politics (política), aunque no llega a ser la mayoría del total.

El histograma nos da una noción global de cómo es la distribución de "category". Los valores más frecuentes son:

1 - Politics (21935)

2 - Wellness (11944)

3 - Entertainment (10759)

## VECTORIZACIÓN

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

### Vectorización de headlines

In [None]:
headlines_corpus = list(df_train['headline'])
headlines_corpus_test = list(df_test['headline'])

# The vocabulary is learned from the training corpus only.
headline_vectorizer = CountVectorizer(
    stop_words='english', dtype=np.int32, lowercase=True, max_features=1000, max_df=0.80
).fit(headlines_corpus)
# The test split must be encoded with that same vocabulary: a vectorizer fitted on the
# test corpus would produce columns for different words (and in a different order), so a
# model trained on the train features could not be applied to them. The name is kept so
# the cells that reference headline_vectorizer_test keep working.
headline_vectorizer_test = headline_vectorizer

In [None]:
headline_vectorizer.get_feature_names()

In [None]:
len(headline_vectorizer.get_feature_names())

In [None]:
X_train_headlines = headline_vectorizer.transform(headlines_corpus)
X_test_headlines = headline_vectorizer_test.transform(headlines_corpus_test)
# Preview a few rows only: densifying the whole sparse matrix just to print it
# allocates rows x vocabulary integers at once.
print(X_train_headlines[:5].toarray())

### Vectorización de short_description

In [None]:
short_description_corpus = list(df_train['short_description'])
short_description_corpus_test = list(df_test['short_description'])

# The vocabulary is learned from the training corpus only.
shortDescription_vectorizer = CountVectorizer(
    stop_words='english', dtype=np.int32, lowercase=True, max_features=1000, max_df=0.80
).fit(short_description_corpus)
# The test split must be encoded with that same vocabulary: a vectorizer fitted on the
# test corpus would produce columns for different words (and in a different order), so a
# model trained on the train features could not be applied to them. The name is kept so
# the cells that reference shortDescription_vectorizer_test keep working.
shortDescription_vectorizer_test = shortDescription_vectorizer

In [None]:
shortDescription_vectorizer.get_feature_names()

In [None]:
len(shortDescription_vectorizer.get_feature_names())

In [None]:
X_train_short_description = shortDescription_vectorizer.transform(short_description_corpus)
X_test_short_description = shortDescription_vectorizer_test.transform(short_description_corpus_test)
# Preview a few rows only: densifying the whole sparse matrix just to print it
# allocates rows x vocabulary integers at once.
print(X_train_short_description[:5].toarray())

### Vectorización de authors

In [None]:
authors_corpus = list(df_train['authors'])
authors_corpus_test = list(df_test['authors'])

# The vocabulary is learned from the training corpus only.
authors_vectorizer = CountVectorizer(
    stop_words='english', dtype=np.int32, lowercase=True, max_features=1000, max_df=0.80
).fit(authors_corpus)
# The test split must be encoded with that same vocabulary: a vectorizer fitted on the
# test corpus would produce columns for different words (and in a different order), so a
# model trained on the train features could not be applied to them. The name is kept so
# the cells that reference authors_vectorizer_test keep working.
authors_vectorizer_test = authors_vectorizer

In [None]:
authors_vectorizer.get_feature_names()

In [None]:
len(authors_vectorizer.get_feature_names())

In [None]:
X_train_authors = authors_vectorizer.transform(authors_corpus)
X_test_authors = authors_vectorizer_test.transform(authors_corpus_test)
# Preview a few rows only: densifying the whole sparse matrix just to print it
# allocates rows x vocabulary integers at once.
print(X_train_authors[:5].toarray())

### Ahora concatenamos los vectores de headlines, short_description y authors

In [None]:
# TRAIN: one sparse DataFrame per text field. Each column is named after its token,
# prefixed with the field it comes from so the three vocabularies cannot collide.
headline_feature_names = [f'headline{token}' for token in headline_vectorizer.get_feature_names()]
dfHeadline = pd.DataFrame.sparse.from_spmatrix(X_train_headlines, columns=headline_feature_names)

shortDescription_feature_names = [f'shortDescription{token}' for token in shortDescription_vectorizer.get_feature_names()]
dfShortDescription = pd.DataFrame.sparse.from_spmatrix(X_train_short_description, columns=shortDescription_feature_names)

authors_feature_names = [f'authors{token}' for token in authors_vectorizer.get_feature_names()]
dfAuthors = pd.DataFrame.sparse.from_spmatrix(X_train_authors, columns=authors_feature_names)

In [None]:
# TEST: one sparse DataFrame per text field. Each column is named after its token,
# prefixed with the field it comes from.
headline_feature_names_test = [f'headline{token}' for token in headline_vectorizer_test.get_feature_names()]
dfHeadline_test = pd.DataFrame.sparse.from_spmatrix(X_test_headlines, columns=headline_feature_names_test)

shortDescription_feature_names_test = [f'shortDescription{token}' for token in shortDescription_vectorizer_test.get_feature_names()]
dfShortDescription_test = pd.DataFrame.sparse.from_spmatrix(X_test_short_description, columns=shortDescription_feature_names_test)

authors_feature_names_test = [f'authors{token}' for token in authors_vectorizer_test.get_feature_names()]
dfAuthors_test = pd.DataFrame.sparse.from_spmatrix(X_test_authors, columns=authors_feature_names_test)

In [None]:
dfHeadline

In [None]:
dfShortDescription

In [None]:
dfAuthors

In [None]:
df_train = pd.concat([df_train, dfHeadline, dfShortDescription, dfAuthors], axis=1)
df_test = pd.concat([df_test, dfHeadline_test, dfShortDescription_test, dfAuthors_test], axis=1)

In [None]:
df_train

In [None]:
# Drop the raw headline, short_description and authors columns from both frames
raw_text_columns = ['headline', 'short_description', 'authors']
for frame in (df_train, df_test):
    frame.drop(columns=raw_text_columns, inplace=True)

In [None]:
df_train

## Label Encoders

In [None]:
from sklearn.preprocessing import LabelEncoder

### Label encoder para "date"


In [None]:
dateLE = LabelEncoder()

In [None]:
# Fit the encoder once on the dates of both splits and only transform afterwards.
# Calling fit_transform separately on each split gives the same date a different
# integer in train and in test, so the encoded feature would not be comparable.
# Fitting on the union also covers dates that appear in only one of the splits,
# which a train-only fit would reject at transform time.
dateLE.fit(pd.concat([df_train['date'], df_test['date']], ignore_index=True))
df_train['date'] = dateLE.transform(df_train['date'])
df_test['date'] = dateLE.transform(df_test['date'])

In [None]:
df_train

### One-hot encoding para "category"

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
categoryOHE = OneHotEncoder(handle_unknown='ignore')

In [None]:
df_train['category'].unique()

In [None]:
categories = df_train['category'].to_numpy()
categories = categories.reshape(-1,1)

In [None]:
y = categoryOHE.fit_transform(categories).toarray()
print(y)

In [None]:
categoryOHE.categories_

In [None]:
X = df_train.drop(['category'], axis=1)
X

## ARMADO DE ÁRBOL DE DECISIÓN

### De profundidad máxima 3, criterio Gini

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
arbol = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)

In [None]:
# Dividimos train en train y dev
from sklearn.model_selection import train_test_split

X_train_tree, X_dev_tree, Y_train_tree, Y_devtree = train_test_split(
    df_train.drop(['category'], axis=1),
    y,
    test_size=(0.1), random_state=42)

In [None]:
arbol.fit(X_train_tree, Y_train_tree)

In [None]:
Y_pred = arbol.predict(X_dev_tree)
Y_pred

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# accuracy_score takes (y_true, y_pred); the score is the same in either order
print("Accuracy Arbol:", accuracy_score(Y_devtree, Y_pred))

In [None]:
def set_to_one(array):
    """Return a new float array shaped like `array` with 1 at the position of its
    highest element and 0 everywhere else. The input is left untouched."""
    winner = array.argmax()  # on ties, argmax reports the first maximum
    one_hot = np.zeros(array.shape, dtype=np.float64)
    one_hot[winner] = 1.0
    return one_hot

In [None]:
arr = np.array([0.1,0.2,0.3,0.8,0.5])
set_to_one(arr)

In [None]:
df_train['category'].unique()

In [None]:
def set_to_one_row(matrix):
    """Turn every row of `matrix` into a one-hot row marking its highest element.

    The matrix is modified in place and also returned, so both
    `m = set_to_one_row(m)` and a bare `set_to_one_row(m)` work.
    On ties the first maximum of the row wins.

    Args:
        matrix: a 2-D numpy array (handled without a Python loop), or any
            mutable sequence whose items are 1-D array-likes.

    Returns:
        The same `matrix` object, with each row replaced by its one-hot version.
    """
    if len(matrix) == 0:
        return matrix

    if isinstance(matrix, np.ndarray) and matrix.ndim == 2:
        # Record the winning column of every row before the values are overwritten.
        winners = matrix.argmax(axis=1)
        matrix[...] = 0
        matrix[np.arange(matrix.shape[0]), winners] = 1
        return matrix

    # Fallback for other containers (e.g. a list of 1-D arrays): one row at a time.
    for i in range(len(matrix)):
        row = np.asarray(matrix[i])
        one_hot = np.zeros(row.shape)
        one_hot[row.argmax()] = 1
        matrix[i] = one_hot
    return matrix

In [None]:
def predict_then_save(model, X_test, filename):
    """Predict the category of every row of `X_test` and write a submission CSV.

    The raw output of `model.predict` is reduced to one-hot rows with
    `set_to_one_row`, decoded back to category names with the notebook-level
    `categoryOHE` encoder, and written to `filename` under an `id,category`
    header. The first raw prediction is printed as a quick sanity check.

    NOTE(review): the id written is the 0-based position of the row in `X_test`;
    confirm that this matches the ids expected by the consumer of the file.
    """
    raw_predictions = model.predict(X_test)
    print(raw_predictions[0])
    one_hot_predictions = set_to_one_row(raw_predictions)
    labels = categoryOHE.inverse_transform(one_hot_predictions).astype(str).reshape(-1)
    lines = [f'{row_id},{label}\n' for row_id, label in enumerate(labels)]
    with open(filename, 'w') as out_file:
        out_file.write('id,category\n')
        out_file.writelines(lines)

In [None]:
predict_then_save(arbol, df_test, 'submission_arbol.csv')

## ARMADO DE RED NEURONAL

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout, LeakyReLU
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
def make_NN_model(output_shapes, input_shape, use_bias, activations, dropouts, optimizer, loss, metric):
    """Build and compile a fully connected Sequential network.

    Layer i consists of a Dense layer with output_shapes[i] units, followed by
    the activation activations[i] and, when it is not None, by dropouts[i].

    Args:
        output_shapes: number of units of each Dense layer, in order.
        input_shape: number of input features, given to the first Dense layer.
        use_bias: forwarded to every Dense layer.
        activations: one activation per layer, each wrapped in Activation(...).
        dropouts: one entry per layer: a layer to add after the activation, or None.
        optimizer: forwarded to model.compile.
        loss: forwarded to model.compile.
        metric: a single metric, passed to model.compile as a one-element list.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    for i, units in enumerate(output_shapes):
        if i == 0:
            # Only the first layer declares the input size.
            model.add(Dense(units, input_shape=(input_shape,), use_bias=use_bias))
        else:
            model.add(Dense(units, use_bias=use_bias))
        # Every layer gets its activation, the first one included. Previously
        # activations[0] was ignored, which left the first Dense layer linear.
        model.add(Activation(activations[i]))
        if dropouts[i] is not None:
            model.add(dropouts[i])
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])
    return model

### Primera red neuronal

In [None]:
model = make_NN_model(
  output_shapes=[25, 35, Y_train_tree.shape[1]],
  input_shape=len(df_train.drop(['category'], axis=1).columns),
  use_bias=True,
  activations=[LeakyReLU(alpha=0.3), LeakyReLU(alpha=0.3), 'softmax'],
  dropouts=[Dropout(0.3), None, None],
  optimizer='Adam',
  loss='categorical_crossentropy',
  metric='accuracy' 
)

In [None]:
# Stop on the validation accuracy (available because of validation_split), not on the
# training accuracy: the training metric keeps improving while the model overfits,
# so it is not a useful stopping signal. The best weights seen are restored at the end.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
model.fit(X_train_tree, Y_train_tree, epochs=2000, validation_split=0.2, callbacks=[early_stopping])

In [None]:
Y_pred_model = model.predict(X_dev_tree)
Y_pred_model

In [None]:
Y_pred_model = set_to_one_row(Y_pred_model)

In [None]:
Y_pred_model

In [None]:
# This score belongs to the neural network, so the label must not say "Arbol" (tree).
# accuracy_score takes (y_true, y_pred).
print("Accuracy Red Neuronal:", accuracy_score(Y_devtree, Y_pred_model))

### Segunda red neuronal