<a href="https://colab.research.google.com/github/eruiz1996/Diplomado/blob/main/spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [None]:
def plot_confusion_matrix(cm, labels):
    fig_cm = px.imshow(cm, labels=dict(x="Predicted", y="Actual", color="Count"),
                       x=labels, y=labels, color_continuous_scale='Viridis', text_auto = True,
                       title="Confusion Matrix")
    fig_cm.update_layout(coloraxis_showscale=False)
    fig_cm.show()

In [None]:
breast_cancer = load_breast_cancer()
df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
df['target'] = breast_cancer.target

In [None]:
X = breast_cancer.data  # Features
y = breast_cancer.target  # Labels

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df['target'], test_size=0.2, random_state=7)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
pca = PCA(n_components=3)
X_train_pca_3d = pca.fit_transform(X_train_scaled)
X_test_pca_3d = pca.transform(X_test_scaled)

In [None]:
lr_pca_3d = LogisticRegression()
lr_pca_3d.fit(X_train_pca_3d, y_train)
lr_pca_3d_pred = lr_pca_3d.predict(X_test_pca_3d)

In [None]:
lr_pca_3d_accuracy = accuracy_score(y_test, lr_pca_3d_pred)
lr_pca_3d_precision = precision_score(y_test, lr_pca_3d_pred)
lr_pca_3d_recall = recall_score(y_test, lr_pca_3d_pred)
lr_pca_3d_f1 = f1_score(y_test, lr_pca_3d_pred)
lr_pca_3d_report = classification_report(y_test, lr_pca_3d_pred)
print("Logistic Regression PCA 3D Classification Report:")
print(lr_pca_3d_report)

Logistic Regression PCA 3D Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.80      0.88        40
           1       0.90      0.99      0.94        74

    accuracy                           0.92       114
   macro avg       0.94      0.89      0.91       114
weighted avg       0.93      0.92      0.92       114



In [None]:
lr_pca_3d_accuracy

0.9210526315789473

# Ejercicio spam

In [None]:
df = pd.read_csv('/content/spam.csv', sep = ',', encoding = 'latin-1')
df.head()

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# copia del dataframe
new_df = df.copy()

Creamos función para limpiar todo lo que no nos aporte.

In [None]:
import re

def clean_text(texto):
    # Utilizamos una expresión regular para filtrar las letras del abecedario y el símbolo "'"
    patron = re.compile(r"[a-zA-Z' ]")
    letras_filtradas = re.findall(patron, texto)
    return ''.join(letras_filtradas)

In [None]:
# ejemplos
print(clean_text("it's"))
print(clean_text("Hello"))
print(clean_text("\\Hello67"))

it's
Hello
Hello


La plicamos a la columna de `text`.

In [None]:
new_df['text'] = new_df['text'].apply(clean_text)
new_df['text'].head()

0    Go until jurong point crazy Available only in ...
1                              Ok lar Joking wif u oni
2    Free entry in  a wkly comp to win FA Cup final...
3          U dun say so early hor U c already then say
4    Nah I don't think he goes to usf he lives arou...
Name: text, dtype: object

Pasamos todo a minúsculas.

In [None]:
new_df['text'] = new_df['text'].str.lower()
new_df['text'].head()

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in  a wkly comp to win fa cup final...
3          u dun say so early hor u c already then say
4    nah i don't think he goes to usf he lives arou...
Name: text, dtype: object

Cambiamos columna `class`:

* Si es spam toma el valor de `1`.
* Si es ham toma el valor de `0`.

In [None]:
new_df['class'] = [1 if x == 'spam' else 0 for x in new_df['class'].values]
new_df.head()

Unnamed: 0,class,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i don't think he goes to usf he lives arou...


## Creación del conjunto

In [None]:
words = ''
for texto in new_df['text'].values:
  words += f'{texto} '
words



In [None]:
words = words.replace('  ', ' ')
words



Tenemos un buen de palabras...

In [None]:
len(set((words.split())))

8649

Usamos `stopwords`

In [None]:
!pip install nltk

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Función para filtrar las palabras comunes.

In [None]:
def filtrar_palabras_comunes(texto):
    # Obtenemos el conjunto de palabras comunes en inglés
    palabras_comunes = set(stopwords.words('english'))
    # Convertimos el texto en una lista de palabras
    palabras = texto.split()
    # Filtramos las palabras comunes
    palabras_filtradas = [palabra for palabra in palabras if palabra.lower() in palabras_comunes]
    # Convertimos las palabras filtradas en un solo string
    resultado = ' '.join(palabras_filtradas)
    return resultado

In [None]:
set(filtrar_palabras_comunes(words).split())

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'between',
 'both',
 'but',
 'by',
 'can',
 "couldn't",
 'd',
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'more',
 'most',
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'she',
 "she's",
 'should',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'then',
 'there',
 't

# Hash tables

In [None]:
import hashlib

In [None]:
new_df = df.copy()
new_df['class'] = [1 if x == 'spam' else 0 for x in new_df['class'].values]
new_df['text'] = new_df['text'].apply(hash)
new_df.head()

Unnamed: 0,class,text
0,0,-4152373577043376294
1,0,1865295248699944972
2,1,-7044564904077276306
3,0,-4490796201381809338
4,0,-1850247482691708100


In [None]:
X = np.array(new_df['text']).reshape(-1,1)
y = new_df['class']

## Regresión

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.4,
                                                    random_state = 7)

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
regression = LogisticRegression()
regression.fit(X_train, y_train)
y_pred = regression.predict(X_test)

In [None]:
logistic_accuracy = accuracy_score(y_test, y_pred)
print(f"Regression Accuracy: {logistic_accuracy}")

Regression Accuracy: 0.4930462090623598


## KNN

In [None]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)

In [None]:
knn_accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Accuracy: {knn_accuracy}")

KNN Accuracy: 0.8591296545536115


## Naive Bayes

In [None]:
naive_model = GaussianNB()
naive_model.fit(X_train, y_train)
y_pred = naive_model.predict(X_test)

In [None]:
naive_accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Accuracy: {naive_accuracy}")

Naive Bayes Accuracy: 0.8681022880215343


# Con base de Abraham

In [None]:
df = pd.read_csv("/content/sparse_matrix_spam.csv")

In [None]:
df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V1016,V1017,V1018,V1019,V1020,V1021,V1022,V1023,V1024,V1025
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df['V1025'], test_size=0.2, random_state=7)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

In [None]:
lr_pred = lr.predict(X_test_scaled)

In [None]:
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

Logistic Regression Accuracy: 0.98


In [None]:
df.shape

(5572, 1025)