# Naive Bayers Classifier

In [1]:
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix

#### Preprocesamiento de datos

In [2]:
# importar los datos
# cuando lo importamos por primera vez nos dice que está utf8 - encoding
spam = pd.read_csv('spam.csv', encoding = 'latin-1')
spam.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# eliminamos las columnas que no nos sirven

spam.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'], axis = 1, inplace = True)

# renombramos las columnas
spam = spam.rename(columns = {'v1':'class','v2':'text'})

In [4]:
spam.head(5)

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
spam.shape

(5572, 2)

In [6]:
# observamos los datos
spam.groupby('class').describe()

# nos muestra las mas frecuentes de cada clase

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Información que podemos sacar de esta tabla:
* Hay 5572 observaciones y 747 son spam --> el 15% d las observaciones son spam
* Hay mensajes duplicados 



Veamos la longitud de cada mensaje para si está correlado con su clasificación:

In [7]:
# Para hacer un histograma necesitamos las frecuencias 

# creamos columna de longitud
spam['length'] = spam['text'].apply(len)

# histograma 
spam.hist(column = 'length',by='class', bins=50, figsize=(15,6))


array([<matplotlib.axes._subplots.AxesSubplot object at 0x000002183A783128>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x000002183A854C88>],
      dtype=object)

Conclusiones:
* Vemos que la mayoría de los mensajes 'ham' tienen poca longitud (debajo de 200)
* Mientras que la mayoria de los spam tienen de longitud entre 130 y 160.

#### Tokenizar

In [22]:
# 1. Eliminar signos de puntuación
# 2. Eliminar stopwords
# 3. Output: lista de cadenas de textos limpias


def process_text(text):
    #1
    non_puc = []
    for character in text:
        if character not in string.punctuation:
            non_puc.append(character)
    non_puc = ''.join(non_puc)
    
    # 2
    no_stopwords = []
    for word in non_puc.split():
        if word.lower not in stopwords.words('english'):
            no_stopwords.append(word)
    
    # 3
    return no_stopwords
    
    

In [23]:
# vemos si funciona:
spam['text'].apply(process_text).head()

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, dont, think, he, goes, to, usf, he, l...
Name: text, dtype: object

#### Creación del modelo

In [24]:
spam_train, spam_test, class_train, class_test = train_test_split(spam['text'],spam['class'], test_size = 0.2)

In [25]:
# pipeline = tubería
# ¿Qué es lo que hace?. lo que hace es pasar 'el texto a traves de una tubería'
# y hacer los pasos 


pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=process_text)), # converts strings to integer counts
    ('tfidf',TfidfTransformer()), # converts integer counts to weighted TF-IDF scores
    ('classifier',MultinomialNB()) # train on TF-IDF vectors with Naive Bayes classifier
])

In [26]:
# ya hecho el clasificador (Multinomial NB)

# entrenamos el modelo
pipeline.fit(spam_train, class_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function process_text at 0x000001B560292BF8>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocesso...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
# predicciones

predicciones = pipeline.predict(spam_test)

In [32]:

# veamos qué precisión tiene nuestro modelo: con el comando
# classification_report

print(classification_report(class_test, predicciones))

             precision    recall  f1-score   support

        ham       0.94      1.00      0.97       956
       spam       1.00      0.60      0.75       159

avg / total       0.95      0.94      0.94      1115

