In [None]:
import itertools
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from wordcloud import WordCloud

In [None]:
# Load only the first two columns (label, message). The file's own header row
# is skipped and replaced with explicit column names.
spam_dataset = pd.read_csv(
    '../Files/spam.csv',
    encoding="ISO-8859-1",
    usecols=[0, 1],
    names=['Spam', 'Text'],
    skiprows=1,
)

# Encode the label as an integer: ham -> 0, spam -> 1.
# Series.map is used instead of Series.replace because replace() relies on an
# implicit object -> int downcast that recent pandas versions deprecate.
label_codes = {'ham': 0, 'spam': 1}
spam_dataset['Spam'] = spam_dataset['Spam'].map(label_codes)

# map() turns any label missing from label_codes into NaN; fail loudly in that case.
assert spam_dataset['Spam'].notna().all(), "unexpected label in the 'Spam' column"
spam_dataset

##### Sprawdzamy, czy nasze klasy są zbalansowane

In [None]:
# Share of each class in the 'Spam' column (normalized counts) to check class balance.
class_shares = spam_dataset['Spam'].value_counts(normalize=True)
print(class_shares)

#### Usuwanie znaków interpunkcyjnych

In [None]:
def remove_puncation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Args:
        text: the string to clean; it is iterated character by character.

    Returns:
        A new string made of the characters that are not punctuation.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)
# Strip punctuation from every message and keep the result in a new column.
spam_dataset['Cleaned_Text'] = spam_dataset['Text'].apply(remove_puncation)
spam_dataset

##### Zamieniamy wielkie litery na małe i dzielimy tekst na tokeny

In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it into tokens with ``nltk.word_tokenize``.

    Args:
        text: the message text to tokenize.

    Returns:
        The tokens returned by ``nltk.word_tokenize`` for the lower-cased text.
    """
    return nltk.word_tokenize(text.lower())


# Tokenize the punctuation-free messages and keep the tokens in a new column.
spam_dataset['Tokenized_Text'] = spam_dataset['Cleaned_Text'].apply(tokenize)
spam_dataset

##### Usuwanie stopwords

In [None]:
# English stop words from NLTK, stored as a frozenset so that the per-token
# membership test in the stop-word filter is a hash lookup rather than a list scan.
stopwords = frozenset(nltk.corpus.stopwords.words('english'))

In [None]:
def remove_stopwords(text):
    """Drop every token that is contained in the module-level ``stopwords`` collection.

    Args:
        text: iterable of tokens.

    Returns:
        A list with the tokens that are not stop words, in their original order.
    """
    return list(filter(lambda token: token not in stopwords, text))


# Filter stop words out of the tokenized messages.
spam_dataset['WithoutStop_Text'] = spam_dataset['Tokenized_Text'].apply(remove_stopwords)
spam_dataset

##### Stemming - usuwanie części wyrazu lub sprowadzenie wyrazu do jego rdzenia

In [None]:
# One Porter stemmer instance shared by all calls.
stemmer = nltk.PorterStemmer()


def stemming(text):
    """Stem every token in ``text`` with the shared Porter stemmer.

    Args:
        text: iterable of tokens.

    Returns:
        A list with the stemmed form of each token, in the original order.
    """
    stemmed = []
    for token in text:
        stemmed.append(stemmer.stem(token))
    return stemmed


# Stem the stop-word-free tokens.
spam_dataset['Stemmed_Text'] = spam_dataset['WithoutStop_Text'].apply(stemming)
spam_dataset

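##### Lematyzacja - sprowadzanie wyrazu do formy podstawowej (lematu), np. 'better' zostanie przekształcone na 'good' (w NLTK dopiero po wskazaniu części mowy, `pos='a'`; domyślnie wyraz jest traktowany jako rzeczownik i pozostaje bez zmian)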

In [None]:
# One WordNet lemmatizer instance shared by all calls.
lemmater = nltk.WordNetLemmatizer()


def lemmatizing(text):
    """Lemmatize every token in ``text`` with the shared WordNet lemmatizer.

    The lemmatizer is called with its default arguments (no part-of-speech hint).

    Args:
        text: iterable of tokens.

    Returns:
        A list with the lemma of each token, in the original order.
    """
    return list(map(lambda token: lemmater.lemmatize(token), text))


# Lemmatize the stop-word-free tokens (not the stemmed ones).
spam_dataset['Lemmatized_Text'] = spam_dataset['WithoutStop_Text'].apply(lemmatizing)
spam_dataset

##### Na podstawie zlematyzowanych maili stwórzmy stringa, który jest połączeniem wszystkich słów dla maili, które są spamem oraz dla maili, które nie są spamem.

In [None]:
# Build one space-separated string of lemmas per class (input for the word clouds).
is_spam = spam_dataset['Spam'] == 1
is_not_spam = spam_dataset['Spam'] == 0

# Flatten the per-message token lists of each class and join the tokens with spaces.
words_spam = ' '.join(
    itertools.chain.from_iterable(spam_dataset.loc[is_spam, 'Lemmatized_Text'])
)
words_notspam = ' '.join(
    itertools.chain.from_iterable(spam_dataset.loc[is_not_spam, 'Lemmatized_Text'])
)

##### Tworzymy chmurę słów dla spamów

In [None]:
# Word cloud built from the lemmas of the spam messages.
wordcloud = WordCloud().generate(words_spam)
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud)
ax.axis('off')
ax.set_title('Spam')
plt.show()

##### Tworzymy chmurę dla nie-spamów

In [None]:
# Word cloud built from the lemmas of the non-spam messages.
wordcloud = WordCloud().generate(words_notspam)
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud)
ax.axis('off')
ax.set_title('Nie Spam')
plt.show()

### Wektoryzacja TF (częstość terminów) - IDF (odwrotna częstość w dokumentach)

In [None]:
# TF-IDF features: each message's lemmas are re-joined into one string and vectorized.
tfidf = TfidfVectorizer()
lemmatized_documents = spam_dataset['Lemmatized_Text'].map(' '.join)
X = tfidf.fit_transform(lemmatized_documents)
y = spam_dataset['Spam']

print(X.shape)
print(y.shape)

In [None]:
# Baseline: shallow random forest fitted on the TF-IDF features.
# RandomForestClassifier is already imported in the notebook's import cell.
# NOTE(review): the score below is computed on the same data the model was
# fitted on, so it is a training score, not an estimate of generalization.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y)
clf.score(X, y)

##### Inne metody wektoryzacji tekstu - wyliczanie liczby wystąpień w poszczególnych mailach

In [None]:
# Bag-of-words counts. Terms that occur in fewer than 1% of the messages or in
# more than 50% of the messages are ignored (min_df / max_df).
# CountVectorizer is imported in the notebook's top import cell.
count = CountVectorizer(min_df=0.01, max_df=0.5)
X_count = count.fit_transform(spam_dataset['Lemmatized_Text'].apply(lambda x: ' '.join(x)))
X_count

In [None]:
# Same shallow forest as before, this time on the raw term counts.
# NOTE(review): scored on the data it was fitted on (training score).
clf_v2 = RandomForestClassifier(max_depth=2, random_state=0).fit(X_count, y)
clf_v2.score(X_count, y)

In [None]:
# Column sums of the count matrix give the total occurrences of every term.
vocabulary = count.vocabulary_
word_frequencies = X_count.sum(axis=0)

# Pair each vocabulary term with its total count, most frequent first.
selected_words_with_frequencies = sorted(
    ((word, word_frequencies[0, idx]) for word, idx in vocabulary.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print every selected term together with its number of occurrences.
for word, frequency in selected_words_with_frequencies:
    print(f"Word: {word}, Frequency: {frequency}")

##### Teraz sprawdźmy jak wygląda X, na podstawie którego model się uczy, jak i wykonuje prognozę. Przekształćmy go do Numpy array. Pokażmy 5 pierwszych obserwacji.

In [None]:
# Dense view of the first five rows of the count matrix (1-gram model).
# Slicing before densifying avoids materializing the whole matrix.
print(X_count[:5].toarray())

# Zadanie 17_2

##### Podział danych i nauka modelu

In [None]:
# Stratified hold-out split, 20% of the rows go to the test set.
# NOTE(review): X comes from a TF-IDF vectorizer fitted on the whole dataset,
# so the vocabulary and IDF weights were derived with the test rows included.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Refit the shallow forest on the training part and score it on the held-out part.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Keep the importance values (not the feature names) that exceed 0.001.
f_imp = clf.feature_importances_
important = [importance for importance in f_imp if importance > 0.001]

In [None]:
# Module-level registry filled in by calculate_metrics(): one row of scores per
# evaluated model plus the models, their names and their positive-class probabilities.
metrics_dataframe = pd.DataFrame(columns=['Model', 'F1_score', 'AUC'])
models = []
models_names = []
predictions_proba_list = []


def calculate_metrics(model, name, X_checked, y_checked):
    """Evaluate a fitted classifier and append its scores to ``metrics_dataframe``.

    Prints the classification report, plots the confusion matrix and records the
    macro-averaged F1 score together with the ROC AUC. The model, its name and
    the predicted probabilities of column 1 are appended to the module-level
    ``models``, ``models_names`` and ``predictions_proba_list`` lists.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        name: label stored in the ``Model`` column.
        X_checked: features to evaluate on.
        y_checked: true labels for ``X_checked``.

    Returns:
        The updated module-level ``metrics_dataframe``.
    """
    global metrics_dataframe
    models.append(model)
    models_names.append(name)

    predictions = model.predict(X_checked)
    predictions_proba = model.predict_proba(X_checked)
    predictions_proba_list.append(predictions_proba[:, 1])

    # Precision, recall, F1 and accuracy.
    print(classification_report(y_checked, predictions))

    # Confusion matrix.
    plt.figure()
    cm = confusion_matrix(y_checked, predictions)
    ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='.0f')
    ax.set_title('Confusion Matrix\n\n')
    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    plt.show()

    f1_metric = f1_score(y_checked, predictions, average='macro')

    # For a binary problem roc_auc_score expects the score of the positive class
    # only; the full probability matrix is used for the multiclass (one-vs-one) case.
    if predictions_proba.shape[1] == 2:
        auc_metric = roc_auc_score(y_checked, predictions_proba[:, 1])
    else:
        auc_metric = roc_auc_score(y_checked, predictions_proba, average='macro', multi_class='ovo')

    new_row = pd.DataFrame({'Model': [name], 'F1_score': [f1_metric], 'AUC': [auc_metric]})
    # Concatenating with an empty frame is deprecated in pandas, so the first
    # row simply becomes the frame.
    if metrics_dataframe.empty:
        metrics_dataframe = new_row
    else:
        metrics_dataframe = pd.concat([metrics_dataframe, new_row], ignore_index=True)

    return metrics_dataframe

In [None]:
# Rebind X to the lemmatized text (one space-joined string per message) and y to the labels.
# NOTE(review): this reuses the name X, which earlier held the TF-IDF matrix.
X = spam_dataset['Lemmatized_Text'].map(' '.join)
y = spam_dataset['Spam']

In [None]:
# Stratified hold-out split, 20% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Pipeline: TF-IDF vectorization followed by a random forest.
# lowercase=False skips the vectorizer's own lower-casing step.
tfidf_step = ('TF-IDF', TfidfVectorizer(analyzer='word', lowercase=False))
forest_step = ('MODEL', RandomForestClassifier())
forest_v1 = Pipeline(steps=[tfidf_step, forest_step])

In [None]:
# Names of all parameters that can be set on the pipeline.
list(forest_v1.get_params())

In [None]:
# Hyper-parameter grid; keys follow the '<pipeline step>__<parameter>' convention.
params = {
    'TF-IDF__max_df': [0.5, 0.7, 0.9],
    'TF-IDF__min_df': [0.0001, 0.001, 0.002],
    'TF-IDF__ngram_range': [(1, 1), (2, 2)],
    'MODEL__max_depth': [3, 5, 7],
    # Number of trees in the forest. n_jobs only sets the number of parallel
    # workers and is not a model hyper-parameter; parallelism is already
    # configured on GridSearchCV below.
    'MODEL__n_estimators': [200, 500, 1000],
    # An int is an absolute number of samples per leaf; a float is a fraction of
    # the samples and must be below 1, so the value 1 has to be an int.
    'MODEL__min_samples_leaf': [0.1, 1, 10],
    # NOTE(review): searching over the seed selects on noise; consider fixing a single value.
    'MODEL__random_state': [0, 42],
}

grid_search = GridSearchCV(forest_v1, params, cv=5, n_jobs=-1, verbose=10, scoring='f1_macro')

# Warnings are no longer silenced and exceptions are no longer swallowed:
# a failed search must stop here instead of surfacing later as a missing
# best_params_ attribute.
grid_search.fit(X_train, y_train)

print('Wybrane hiperparametry: ', grid_search.best_params_)
model_v1 = grid_search.best_estimator_

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test set.
calculate_metrics(
    model=model_v1,
    name='Random Forest',
    X_checked=X_test,
    y_checked=y_test,
)