PROBLEM 1

In [27]:
import numpy as np

# Function to read data from a file and return a list of lines
def read_file(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    return lines

# Read data from "clickbait.txt" and "not-clickbait.txt"
clickbait_data = read_file("clickbait.txt")
not_clickbait_data = read_file("not-clickbait.txt")

# Combine the two datasets into a single list
combined_data = clickbait_data + not_clickbait_data

# Shuffle the combined dataset using numpy.random.shuffle
np.random.shuffle(combined_data)

In [28]:
# Define the proportions for train, validation, and test datasets
train_ratio = 0.72
validation_ratio = 0.08
test_ratio = 0.20

# Calculate the number of samples for each dataset
total_samples = len(combined_data)
num_train = int(total_samples * train_ratio)
num_validation = int(total_samples * validation_ratio)
num_test = total_samples - num_train - num_validation

# Split the combined_data into train, validation, and test datasets
train_data = combined_data[:num_train]
validation_data = combined_data[num_train:num_train + num_validation]
test_data = combined_data[num_train + num_validation:]


In [29]:
# Convert the "clickbait_data" list to a set for faster membership checking
clickbait_data_set = set(clickbait_data)

# Define a function to calculate the percentage of samples from "clickbait_data" in a dataset
def calculate_clickbait_data_percentage(dataset):
    # Count the number of samples in the dataset that are also in "clickbait_data"
    dataset_set = set(dataset)
    common_samples = dataset_set.intersection(clickbait_data_set)
    # Calculate the percentage of samples from "clickbait_data" in the dataset
    clickbait_data_percentage = (len(common_samples) / len(dataset_set)) * 100
    return clickbait_data_percentage

# Calculate the percentage of samples from "clickbait_data" in each dataset
train_clickbait_data_percentage = calculate_clickbait_data_percentage(train_data)
validation_clickbait_data_percentage = calculate_clickbait_data_percentage(validation_data)
test_clickbait_data_percentage = calculate_clickbait_data_percentage(test_data)

# Print the percentages
print(f"Percentage of Samples from 'clickbait_data' in Train Dataset: {train_clickbait_data_percentage:.2f}%")
print(f"Percentage of Samples from 'clickbait_data' in Validation Dataset: {validation_clickbait_data_percentage:.2f}%")
print(f"Percentage of Samples from 'clickbait_data' in Test Dataset: {test_clickbait_data_percentage:.2f}%")


Percentage of Samples from 'clickbait_data' in Train Dataset: 34.58%
Percentage of Samples from 'clickbait_data' in Validation Dataset: 35.60%
Percentage of Samples from 'clickbait_data' in Test Dataset: 31.80%


In [30]:
import pandas as pd
df_train=pd.DataFrame(train_data)
df_train['clk']=0
df_train['clk'] = df_train[0].apply(lambda x: 1 if x in clickbait_data_set else 0)

df_test=pd.DataFrame(test_data)
df_test['clk']=0
df_test['clk'] = df_test[0].apply(lambda x: 1 if x in clickbait_data_set else 0)

df_validation=pd.DataFrame(validation_data)
df_validation['clk']=0
df_validation['clk'] = df_validation[0].apply(lambda x: 1 if x in clickbait_data_set else 0)

PROBLEM 3

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Sample data
X = df_train[0]
y = df_train['clk']

# Create a scikit-learn pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(1, 2))),  # Include unigrams and bigrams
    ('classifier', MultinomialNB())
])

# Fit the pipeline on the training set
pipeline.fit(X, y)


Pipeline(steps=[('vectorizer', CountVectorizer(ngram_range=(1, 2))),
                ('classifier', MultinomialNB())])

In [32]:
from sklearn.metrics import precision_score, recall_score, f1_score
# Make predictions on the training and validation sets
y_train_pred = pipeline.predict(df_train[0])
y_valid_pred = pipeline.predict(df_validation[0])

# Compute precision, recall, and F1-score for training set
precision_train = precision_score(df_train['clk'], y_train_pred, average='binary', pos_label=1)
recall_train = recall_score(df_train['clk'], y_train_pred, average='binary', pos_label=1)
f1_train = f1_score(df_train['clk'], y_train_pred, average='binary', pos_label=1)

# Compute precision, recall, and F1-score for validation set
precision_valid = precision_score(df_validation['clk'], y_valid_pred, average='binary', pos_label=1)
recall_valid = recall_score(df_validation['clk'], y_valid_pred, average='binary', pos_label=1)
f1_valid = f1_score(df_validation['clk'], y_valid_pred, average='binary', pos_label=1)

# Print the results
print(f"Training Precision: {precision_train:.2f}")
print(f"Training Recall: {recall_train:.2f}")
print(f"Training F1-Score: {f1_train:.2f}")

print(f"Validation Precision: {precision_valid:.2f}")
print(f"Validation Recall: {recall_valid:.2f}")
print(f"Validation F1-Score: {f1_valid:.2f}")

Training Precision: 1.00
Training Recall: 1.00
Training F1-Score: 1.00
Validation Precision: 0.88
Validation Recall: 0.87
Validation F1-Score: 0.87


PROBLEM 4

In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score



# Define a parameter grid with varying hyperparameters
param_grid = {
    'vectorizer__max_df': [0.7, 0.8, 0.9],  # Vary max_df for CountVectorizer
    'classifier__alpha': [0.1, 0.5, 1.0],  # Vary alpha for MultinomialNB
    'vectorizer__ngram_range': [(1, 1), (1, 2)]  # Vary ngram_range for CountVectorizer
}

# Initialize an empty dictionary to store results
results = {}

# Iterate over parameter combinations
for params in ParameterGrid(param_grid):
    # Create a scikit-learn pipeline with specified hyperparameters
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(max_df=params['vectorizer__max_df'], ngram_range=params['vectorizer__ngram_range'])),
        ('classifier', MultinomialNB(alpha=params['classifier__alpha']))
    ])
    
    X = df_train[0]
    y = df_train['clk']
    # Fit the entire pipeline on the training set
    pipeline.fit(X, y)
    
    # Make predictions on the validation set
    y_valid_pred = pipeline.predict(df_validation[0])
    
    # Calculate precision, recall, and F1-score for the validation set
    precision = precision_score(df_validation['clk'], y_valid_pred)
    recall = recall_score(df_validation['clk'], y_valid_pred)
    f1 = f1_score(df_validation['clk'], y_valid_pred)
    
    # Store the results in the dictionary
    results[str(params)] = {'Precision': precision, 'Recall': recall, 'F1-Score': f1}

# Print the results, sorting by F1-score
sorted_results = sorted(results.items(), key=lambda x: x[1]['F1-Score'], reverse=True)
for i, (params, metrics) in enumerate(sorted_results):
    if i < 5 or i >= len(sorted_results) - 5:
        print(f"Parameters: {params}")
        print(f"Precision: {metrics['Precision']:.2f}")
        print(f"Recall: {metrics['Recall']:.2f}")
        print(f"F1-Score: {metrics['F1-Score']:.2f}")
        print()


Parameters: {'classifier__alpha': 0.5, 'vectorizer__max_df': 0.7, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.87
Recall: 0.90
F1-Score: 0.88

Parameters: {'classifier__alpha': 0.5, 'vectorizer__max_df': 0.8, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.87
Recall: 0.90
F1-Score: 0.88

Parameters: {'classifier__alpha': 0.5, 'vectorizer__max_df': 0.9, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.87
Recall: 0.90
F1-Score: 0.88

Parameters: {'classifier__alpha': 1.0, 'vectorizer__max_df': 0.7, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.87
Recall: 0.90
F1-Score: 0.88

Parameters: {'classifier__alpha': 1.0, 'vectorizer__max_df': 0.8, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.87
Recall: 0.90
F1-Score: 0.88

Parameters: {'classifier__alpha': 0.1, 'vectorizer__max_df': 0.8, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.86
Recall: 0.84
F1-Score: 0.85

Parameters: {'classifier__alpha': 0.1, 'vectorizer__max_df': 0.9, 'vectorizer__ngram_range': (1, 1)}
Precision: 0.86
Recall:

PROBLEM 5

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score, f1_score


# Define a parameter grid with varying hyperparameters
param_grid = {
    'vectorizer__max_df': [0.7, 0.8, 0.9],  # Vary max_df for CountVectorizer
    'classifier__alpha': [0.1, 0.5, 1.0],  # Vary alpha for MultinomialNB
    'vectorizer__ngram_range': [(1, 1), (1, 2)]  # Vary ngram_range for CountVectorizer
}

# Initialize variables to keep track of the best model
best_model = None
best_f1_train = 0.0

# Iterate over parameter combinations
for params in ParameterGrid(param_grid):
    # Create a scikit-learn pipeline with specified hyperparameters
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(max_df=params['vectorizer__max_df'], ngram_range=params['vectorizer__ngram_range'])),
        ('classifier', MultinomialNB(alpha=params['classifier__alpha']))
    ])
    
    X = df_train[0]
    y = df_train['clk']
    # Fit the entire pipeline on the training set
    pipeline.fit(X, y)
    
    y_train_pred = pipeline.predict(df_train[0])
    
    
    # Calculate F1-score for the training set
    f1_train = f1_score(df_train['clk'], y_train_pred)
    
    # If the current model has a higher F1-score, update the best model
    if f1_train > best_f1_train:
        best_f1_train = f1_train
        best_model = pipeline

# Print the best model's hyperparameters and F1-score on the training set
print("Best Model Hyperparameters:")
print(best_model.named_steps['vectorizer'].get_params())
print(best_model.named_steps['classifier'].get_params())
print(f"Best F1-Score on Training Set: {best_f1_train:.2f}")

# Apply the best model to the test set
y_test_pred = best_model.predict(df_test[0])

# Compute precision, recall, and F1-score for the test set
precision_test = precision_score(df_test['clk'], y_test_pred)
recall_test = recall_score(df_test['clk'], y_test_pred)
f1_test = f1_score(df_test['clk'], y_test_pred)

# Print the results on the test set
print(f"\nPrecision on Test Set: {precision_test:.2f}")
print(f"Recall on Test Set: {recall_test:.2f}")
print(f"F1-Score on Test Set: {f1_test:.2f}")


Best Model Hyperparameters:
{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.int64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 0.7, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 2), 'preprocessor': None, 'stop_words': None, 'strip_accents': None, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'vocabulary': None}
{'alpha': 0.1, 'class_prior': None, 'fit_prior': True}
Best F1-Score on Training Set: 1.00

Precision on Test Set: 0.88
Recall on Test Set: 0.88
F1-Score on Test Set: 0.88


PROBLEM 6

In [38]:
# Assuming 'best_model' contains your selected Multinomial Naive Bayes model
classifier = best_model.named_steps['classifier']
vectorizer = best_model.named_steps['vectorizer']

# Get feature log probabilities
feature_log_probs = classifier.feature_log_prob_

# Get the feature names (words)
feature_names = vectorizer.get_feature_names()

# Filter out bigrams
unigrams = [word for word in feature_names if ' ' not in word]

# Create a dictionary mapping feature names to their log probabilities
word_log_probs = dict(zip(feature_names, feature_log_probs[1]))  # Use index 1 for clickbait class

# Sort the words based on log probabilities in descending order
sorted_words = sorted(unigrams, key=lambda word: word_log_probs.get(word, 0), reverse=True)

# Display the top 5 words
top_5_words = sorted_words[:5]
print("Top 5 Clickbait Indicators (Unigrams Only):")
for word in top_5_words:
    print(word)


Top 5 Clickbait Indicators (Unigrams Only):
the
you
to
this
is


PROBLEM 7

In [39]:
import re

# Assuming 'top_5_words' contains your top 5 keywords
top_5_keywords_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in top_5_words) + r')\b'

# Function to apply the regular expression to a text
def detect_clickbait(text):
    return bool(re.search(top_5_keywords_pattern, text, flags=re.IGNORECASE))

# Apply the function to the test set
y_test_pred_regex = df_test[0].apply(detect_clickbait)

# Compute precision, recall, and F1-score for the test set
precision_regex = precision_score(df_test['clk'], y_test_pred_regex)
recall_regex = recall_score(df_test['clk'], y_test_pred_regex)
f1_regex = f1_score(df_test['clk'], y_test_pred_regex)

# Print the results
print(f"Precision using Regex: {precision_regex:.2f}")
print(f"Recall using Regex: {recall_regex:.2f}")
print(f"F1-Score using Regex: {f1_regex:.2f}")


Precision using Regex: 0.44
Recall using Regex: 0.79
F1-Score using Regex: 0.56
