In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams

# for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier


# for word embedding
import gensim
from gensim.models import Word2Vec
import nltk
nltk.download('stopwords')
import nltk
nltk.download('punkt_tab')
import nltk
nltk.download('averaged_perceptron_tagger_eng')

          

# Load data
# import os
# os.chdir('/Users/ranivija/Desktop/')
# df_train = pd.read_csv('train.csv')
# print(df_train.shape)
# df_train.head()

# Class distribution
# x = df_train['target'].value_counts()
# print(x)
# sns.barplot(x.index, x)

# Missing values
# df_train.isna().sum()



# Load the CSV file
df_new = pd.read_csv('data/MMCoVaR_News_Dataset.csv')

# Keep only the article body and its reliability label, renamed to the
# generic 'text' / 'target' column names used by the rest of the script.
df_train = df_new[['body_text', 'reliability']].copy()
df_train.columns = ['text', 'target']

# Whitespace-delimited word count per article; str() lets the count run on
# non-string cells as well (e.g. a missing body read as NaN).
df_train['word_count'] = df_train['text'].apply(lambda x: len(str(x).split()))


# Plot the word-count distribution per article, one histogram per class.
# NOTE(review): the captions assume target == 1 marks fake articles and
# target == 0 original ones -- confirm against the dataset's 'reliability'
# encoding before relying on the plot.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
train_words = df_train[df_train['target'] == 1]['word_count']
ax1.hist(train_words, color='red')
ax1.set_title('Fake texts')
train_words = df_train[df_train['target'] == 0]['word_count']
ax2.hist(train_words, color='green')
ax2.set_title('Original texts')  # fixed typo ('texsts')
fig.suptitle('Words per article')  # the data are news articles, not tweets
plt.show()

# Text preprocessing
def preprocess(text):
    """Normalise raw text to lowercase words separated by single spaces.

    Steps, in order: lowercase and trim; drop HTML-like tags; drop bracketed
    numeric markers such as ``[12]``; turn ASCII punctuation into spaces;
    delete any remaining non-word symbol; turn digits into spaces; collapse
    whitespace and trim again.

    Args:
        text: The raw text. ``None`` and float NaN are treated as an empty
            string; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower().strip()
    text = re.sub(r'<.*?>', '', text)
    # Must run before punctuation is stripped, otherwise the brackets are
    # already gone and the pattern can never match.
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Raw string: a plain '\s' literal triggers an invalid-escape warning.
    text = re.sub(r'\s+', ' ', text)
    # Punctuation at either end was turned into a space; trim it.
    return text.strip()

# Smoke test: show what the cleaning step produces for one sample sentence.
sample_sentence = "This is a sample text for cleaning."
text = preprocess(sample_sentence)
print(text)

# Stopword removal
def stopword(string):
    """Remove English stopwords from whitespace-separated text.

    Args:
        string: Text to filter; it is split on whitespace. (The parameter
            name shadows the ``string`` module inside this function only.)

    Returns:
        The remaining words, in their original order, joined by single
        spaces.
    """
    # Build the stopword set once per call instead of calling
    # stopwords.words('english') for every token and scanning a list.
    english_stopwords = set(stopwords.words('english'))
    kept = [word for word in string.split() if word not in english_stopwords]
    return ' '.join(kept)

# Stemming
snow = SnowballStemmer('english')


def stemming(string):
    """Return *string* with every token replaced by its Snowball stem.

    The text is tokenised with ``word_tokenize``; the stems are joined back
    together with single spaces.
    """
    stems = map(snow.stem, word_tokenize(string))
    return " ".join(stems)

# Lemmatization
wl = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the first character of *tag* is inspected: 'J' maps to adjective,
    'V' to verb, 'R' to adverb. Every other tag, including 'N...' tags and
    the empty string, maps to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def lemmatizer(string):
    """Lemmatize every token of *string* according to its POS tag.

    The text is tokenised, tagged with ``nltk.pos_tag``, and each token is
    lemmatized with the WordNet POS derived from its tag. The lemmas are
    returned joined by single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))
    return " ".join(
        wl.lemmatize(token, get_wordnet_pos(pos))
        for token, pos in tagged_tokens
    )

# Smoke test: clean, remove stopwords from, and lemmatize one sample sentence.
sample_sentence = "Sample text for lemmatization and cleaning."
text = lemmatizer(stopword(preprocess(sample_sentence)))
print(text)

# Final preprocessing function
def finalpreprocess(string):
    """Run the whole pipeline: preprocess, then stopword, then lemmatizer."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

# Clean every article; the function is passed directly, no wrapper lambda.
df_train['clean_text'] = df_train['text'].apply(finalpreprocess)

# Instead of df_train = df_train.drop(columns=['word_count', 'char_count', 'unique_word_count']) pick only the required columns
# .copy() makes the selection an independent frame, so adding the
# 'clean_text_tok' column below does not trip pandas' SettingWithCopyWarning.
df_train = df_train[['clean_text', 'target']].copy()
df_train.head()

# Word2Vec model
# NOTE(review): the embeddings are trained on every row of df_train, i.e.
# also on rows that analysis_of_cleaned_texts later holds out for validation.
df_train['clean_text_tok'] = [nltk.word_tokenize(i) for i in df_train['clean_text']]
model = Word2Vec(df_train['clean_text_tok'], min_count=1)
# Plain dict: token -> embedding vector.
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors))

class MeanEmbeddingVectorizer:
    """Represent each tokenised document as the mean of its word vectors.

    Args:
        word2vec: Mapping from token to a 1-D embedding vector. It must be
            non-empty, because the embedding size is read from one of its
            values.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Embedding dimensionality, taken from the first stored vector.
        self.dim = len(next(iter(word2vec.values())))

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn.

        ``y`` is optional so the object can also be fitted as ``fit(X)``.
        """
        return self

    def _mean_vector(self, words):
        """Average the vectors of the known tokens in *words*.

        Tokens missing from the mapping are skipped; a document without any
        known token yields an all-zero vector of length ``self.dim``.
        """
        known = [self.word2vec[w] for w in words if w in self.word2vec]
        return np.mean(known or [np.zeros(self.dim)], axis=0)

    def transform(self, X):
        """Return one mean embedding per document.

        Args:
            X: Iterable of documents, each an iterable of tokens.

        Returns:
            Array of shape ``(n_documents, self.dim)``; for an empty *X*
            the shape is ``(0, self.dim)``.
        """
        rows = [self._mean_vector(words) for words in X]
        if not rows:
            # np.array([]) would be 1-D; keep the result two-dimensional.
            return np.empty((0, self.dim))
        return np.array(rows)
    
# Append joined word n-grams (bigrams and trigrams by default) to a token list
def add_ngrams(tokens, n=3):
    """Return the tokens followed by their '_'-joined n-grams up to size *n*.

    Args:
        tokens: Iterable of string tokens for one document.
        n: Largest n-gram size to add. Sizes 2..n are appended in increasing
            order, so the default of 3 adds bigrams and then trigrams. A
            value below 2 adds nothing.

    Returns:
        A new list: the original tokens, then every bigram, then every
        trigram, and so on. The input is not modified.
    """
    base = list(tokens)
    augmented = list(base)
    for size in range(2, n + 1):
        # zip over `size` progressively shifted views gives the sliding
        # windows; it stops at the shortest view, so short inputs simply
        # produce no n-grams of that size.
        windows = zip(*(base[start:] for start in range(size)))
        augmented.extend('_'.join(window) for window in windows)
    return augmented



def _fit_and_report(title, model, X_train, y_train, X_val, y_val,
                    matrix_label="Confusion Matrix:"):
    """Fit *model* and print its validation report, confusion matrix and AUC."""
    model.fit(X_train, y_train)
    y_predict = model.predict(X_val)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    y_prob = model.predict_proba(X_val)[:, 1]

    print(title)
    print(classification_report(y_val, y_predict))
    print(matrix_label, confusion_matrix(y_val, y_predict))
    fpr, tpr, _ = roc_curve(y_val, y_prob)
    print("AUC:", auc(fpr, tpr))


def analysis_of_cleaned_texts(df, filename, ngrams=False, random_state=42):
    """Train and evaluate several classifiers on TF-IDF and Word2Vec features.

    The data is split 80/20, vectorised twice (TF-IDF on the cleaned text,
    mean Word2Vec embedding on the tokens) and used to fit Logistic
    Regression, Multinomial Naive Bayes and Random Forest models. For each
    model the classification report, confusion matrix and ROC AUC on the
    validation split are printed.

    Args:
        df: DataFrame with a 'clean_text' column (strings) and a 'target'
            column (labels).
        filename: Unused; kept so existing calls keep working.
        ngrams: When true, TF-IDF also uses word bigrams and trigrams, and
            the token lists fed to the Word2Vec averaging are extended with
            ``add_ngrams``.
        random_state: Seed for the train/validation split, so runs with and
            without n-grams are scored on the same validation rows. Pass
            ``None`` for a fresh random split on every call.

    Returns:
        None. All results are printed.
    """
    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(
        df["clean_text"], df["target"],
        test_size=0.2, shuffle=True, random_state=random_state,
    )

    # Tokenization with optional n-grams
    def tokenize(document):
        tokens = nltk.word_tokenize(document)
        return add_ngrams(tokens) if ngrams else tokens

    X_train_tok = [tokenize(i) for i in X_train]
    X_val_tok = [tokenize(i) for i in X_val]

    # TF-IDF. The vectorizer works on the raw strings, so the n-gram switch
    # has to be passed to it explicitly; (1, 1) is its unigram default.
    tfidf_vectorizer = TfidfVectorizer(
        use_idf=True,
        ngram_range=(1, 3) if ngrams else (1, 1),
    )
    X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_val_vectors_tfidf = tfidf_vectorizer.transform(X_val)

    # Word2Vec transformation
    # NOTE(review): joined n-gram tokens only contribute if they are keys of
    # the module-level `w2v` mapping; MeanEmbeddingVectorizer skips unknown
    # tokens.
    modelw = MeanEmbeddingVectorizer(w2v)
    X_train_vectors_w2v = modelw.transform(X_train_tok)
    X_val_vectors_w2v = modelw.transform(X_val_tok)

    # (title, model, train features, validation features, matrix label).
    # Titles and labels are printed verbatim. Logistic Regression on Word2Vec
    # is run once; it used to be fitted and reported twice with identical
    # settings.
    experiments = [
        ("Logistic Regression with TF-IDF",
         LogisticRegression(solver='liblinear', C=10, penalty='l1'),
         X_train_vectors_tfidf, X_val_vectors_tfidf, "Confusion Matrix:"),
        ("\nNaive Bayes with TF-IDF",
         MultinomialNB(),
         X_train_vectors_tfidf, X_val_vectors_tfidf, "Confusion Matrix:"),
        ("\nLogistic Regression with Word2Vec",
         LogisticRegression(solver='liblinear', C=10, penalty='l1'),
         X_train_vectors_w2v, X_val_vectors_w2v, "Confusion Matrix:"),
        ("\nRandom Forest with Word2Vec",
         RandomForestClassifier(n_estimators=100, random_state=42),
         X_train_vectors_w2v, X_val_vectors_w2v, "Confusion Matrix:\n"),
        ("\nRandom Forest without Word2Vec",
         RandomForestClassifier(n_estimators=100, random_state=42),
         X_train_vectors_tfidf, X_val_vectors_tfidf, "Confusion Matrix:\n"),
    ]
    for title, estimator, train_features, val_features, matrix_label in experiments:
        _fit_and_report(title, estimator, train_features, y_train,
                        val_features, y_val, matrix_label)


# Evaluate twice: first on plain tokens, then with n-gram tokens enabled.
for use_ngrams in (False, True):
    if use_ngrams:
        print("\n\n\n Second part with ngrams\n\n\n")
    analysis_of_cleaned_texts(df_train, 'submission.csv', ngrams=use_ngrams)





  text = re.sub('\s+', ' ', text)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Lukag\AppData\Roaming\nltk_data...
[nltk_d

FileNotFoundError: [Errno 2] No such file or directory: 'data/MMCoVaR_News_Dataset.csv'

In [3]:
import pandas as pd
from machine_learning_classifiers import TextClassifier
import os


# Locate <cwd>/../../data/MMCoVaR_News_Dataset.csv, i.e. the 'data' folder
# two directory levels above the current working directory.
current_dir = os.getcwd()
two_steps_back = os.path.dirname(os.path.dirname(current_dir))
path = os.path.join(two_steps_back, 'data', 'MMCoVaR_News_Dataset.csv')

# ____________________________________________________________
# Load the MMCoVaR News Dataset
df_MMCoVaR = pd.read_csv(path)

# Keep the article body and its reliability label under the generic
# 'text' / 'target' names, and flag every row as non-synthetic.
df_MMCoVaR_train = df_MMCoVaR[['body_text', 'reliability']].copy()
df_MMCoVaR_train.columns = ['text', 'target']
df_MMCoVaR_train['synthetic'] = False

# --- Running the Machine Learning Pipeline ---

# Settings shared by both runs; only use_ngrams differs between them.
shared_settings = dict(
    text_column='text',
    target_column='target',
    test_size=0.2,
    random_state=42,
)

# First run: no n-grams.
classifier = TextClassifier(df_MMCoVaR_train, use_ngrams=False, **shared_settings)
classifier.run_analysis()

# Second run: same data and settings, n-grams enabled.
print("\n\n\n Running analysis with ngrams enabled \n\n\n")
classifier_ng = TextClassifier(df_MMCoVaR_train, use_ngrams=True, **shared_settings)
classifier_ng.run_analysis()


Logistic Regression with TF-IDF
              precision    recall  f1-score   support

           0       0.93      0.80      0.86       190
           1       0.89      0.96      0.93       329

    accuracy                           0.90       519
   macro avg       0.91      0.88      0.89       519
weighted avg       0.91      0.90      0.90       519

Confusion Matrix:
 [[152  38]
 [ 12 317]]
AUC: 0.9608382658774595

Naive Bayes with TF-IDF
              precision    recall  f1-score   support

           0       1.00      0.14      0.25       190
           1       0.67      1.00      0.80       329

    accuracy                           0.69       519
   macro avg       0.83      0.57      0.53       519
weighted avg       0.79      0.69      0.60       519

Confusion Matrix:
 [[ 27 163]
 [  0 329]]
AUC: 0.8852983522636377

Logistic Regression with Word2Vec
              precision    recall  f1-score   support

           0       0.79      0.70      0.74       190
           1 