<a href="https://colab.research.google.com/github/em-chiu/intersection_project/blob/main/EC%20Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import urllib.request
from nltk.corpus import stopwords
import re
import numpy as np
from collections import Counter
from sklearn import preprocessing
import string
nltk.download('averaged_perceptron_tagger') # to solve pos feature extract issue
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Source URLs of the two headline corpora (plain-text files, later split on newlines
# into one headline per entry).
non_clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/non_clickbait_data.txt"
clickbait_url = "http://www.cs.columbia.edu/~sarahita/CL/clickbait_data.txt"

# read url .txt file into string "data"
def get_data(url):
  """Fetch the resource at `url` and return its body as text.

  Args:
    url: URL string accepted by `urllib.request.urlopen`.

  Returns:
    The full response body decoded as UTF-8 (a single `str`).

  Raises:
    urllib.error.URLError: if the resource cannot be opened.
    UnicodeDecodeError: if the body is not valid UTF-8.
  """
  # The context manager closes the response (and its socket) as soon as the
  # body has been read, instead of leaving that to garbage collection.
  with urllib.request.urlopen(url) as response:
    return response.read().decode('utf-8')

# Download both corpora; each variable holds the whole file as one string.
non_clickbait_data = get_data(non_clickbait_url)
clickbait_data = get_data(clickbait_url)

In [None]:
# Split each corpus into a list of headlines (one per line) and combine them in a single list.
# rstrip('\n') removes trailing newlines first so that split('\n') does not
# produce an empty headline at the end of either list.
non_clickbait_headlines = non_clickbait_data.rstrip('\n').split('\n')
clickbait_headlines = clickbait_data.rstrip('\n').split('\n')
# Non-clickbait headlines come first; the label list must use the same order.
all_headlines = non_clickbait_headlines + clickbait_headlines

In [None]:
# Create the label list aligned with all_headlines:
# 0 for every non-clickbait headline, followed by 1 for every clickbait headline.
non_cb_labels = [0] * len(non_clickbait_headlines)
cb_labels = [1] * len(clickbait_headlines)
all_labels = non_cb_labels + cb_labels

In [None]:
# extract features: bag of stop words
def stop_words(texts):
  """Count NLTK English stop words in each text.

  Args:
    texts: iterable of strings (headlines).

  Returns:
    A float NumPy array with one row per text and one column per entry of
    `stopwords.words('english')`, in that list's order. Each cell is the
    number of tokens of the lower-cased text equal to that stop word.
  """
  vocabulary = stopwords.words('english')
  rows = []
  for headline in texts:
    # Tally every token once; a Counter lookup returns 0 for absent words.
    token_tally = Counter(nltk.word_tokenize(headline.lower()))
    rows.append([token_tally[word] for word in vocabulary])
  return np.array(rows).astype(float)

In [None]:
# Build the stop-word count matrix for every headline in the combined corpus.
stop_words_features = stop_words(all_headlines)



In [None]:
# Sanity check: rows = headlines, columns = stop words in the NLTK English list.
stop_words_features.shape

(31998, 179)

In [None]:
# Evaluate the stop-word features on their own.
# X is the feature matrix (already a NumPy array); Y is the 0/1 label list as a NumPy array.
X = stop_words_features # bag-of-stop-words count matrix
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.8735535323538606


In [None]:
# extract features: POS tags
def POS_tags(texts):
  """Count occurrences of ten part-of-speech tags in each text.

  Args:
    texts: iterable of strings (headlines).

  Returns:
    A float NumPy array with one row per text and one column per tag, in the
    order NN, NNP, DT, IN, JJ, NNS, CC, PRP, VB, VBG.
  """
  tracked_tags = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS','CC','PRP','VB','VBG']
  rows = []
  for headline in texts:
    # NOTE(review): text is lower-cased before tagging, which may affect how
    # proper nouns (NNP) are tagged; confirm this is intended.
    tagged = nltk.pos_tag(nltk.word_tokenize(headline.lower()))
    # pos_tag yields (token, tag) pairs; only the tag is tallied.
    tag_tally = Counter(tag for _, tag in tagged)
    rows.append([tag_tally[tag] for tag in tracked_tags])
  return np.array(rows).astype(float)

In [None]:
# Build the part-of-speech tag count matrix for every headline.
pos_features = POS_tags(all_headlines)

In [None]:
# Sanity check: rows = headlines, columns = the ten tracked POS tags.
pos_features.shape

(31998, 10)

In [None]:
# Evaluate the POS-tag features on their own.
# X is the feature matrix (already a NumPy array); Y is the 0/1 label list as a NumPy array.
X = pos_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.7672044584245077


In [None]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk import ngrams

In [None]:
# extract features: unigrams
def unigrams_lex(texts, top_n=30):
  """Count the corpus-wide most common unigrams in each text.

  The vocabulary is the `top_n` most frequent tokens across all of `texts`
  after lower-casing and removing NLTK English stop words and tokens made up
  entirely of punctuation characters. Each text is then described by how
  often each vocabulary word occurs in it.

  The previous implementation appended the whole headline once per token, so
  its single output column was just the token count of the headline, and the
  stop-word/punctuation filter it computed was never used.

  Args:
    texts: iterable of strings (headlines).
    top_n: size of the vocabulary; defaults to 30.

  Returns:
    A float NumPy array of shape (number of texts, vocabulary size). The
    vocabulary size is `top_n`, or smaller if the corpus has fewer distinct
    tokens after filtering. Columns are ordered from most to least frequent.
  """
  stop = set(stopwords.words('english'))
  punctuation = set(string.punctuation)

  def is_content_token(token):
    # Drop stop words and tokens consisting only of punctuation characters
    # (this also covers multi-character tokens such as "..." or "''").
    if token in stop:
      return False
    return not all(char in punctuation for char in token)

  # Pass 1: tokenize and filter every headline once, accumulating corpus counts.
  filtered_headlines = []
  corpus_counts = Counter()
  for headline in texts:
    # Lower-case so matching against the (lower-case) stop-word list works.
    tokens = nltk.word_tokenize(headline.lower())
    filtered = [token for token in tokens if is_content_token(token)]
    filtered_headlines.append(filtered)
    corpus_counts.update(filtered)

  vocabulary = [word for word, _ in corpus_counts.most_common(top_n)]

  # Pass 2: per-headline counts of each vocabulary word (0 when absent).
  unigrams = []
  for filtered in filtered_headlines:
    headline_counts = Counter(filtered)
    unigrams.append([headline_counts[word] for word in vocabulary])

  # reshape keeps the result two-dimensional even when there are no rows.
  return np.array(unigrams, dtype=float).reshape(len(unigrams), len(vocabulary))

In [None]:
# Build the unigram feature array for every headline.
unigrams_features = unigrams_lex(all_headlines)

In [None]:
# Sanity check on the dimensions of the unigram feature array.
unigrams_features.shape

(31998, 1)

In [None]:
# Evaluate the unigram features on their own.
# X is the feature array returned by unigrams_lex; Y is the 0/1 label list as a NumPy array.
X = unigrams_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.49996874023132226


In [None]:
# NOTE(review): repeats the unigrams_lex call already made for `unigrams_features`
# under a slightly different name; apart from the shape check in the next cell,
# `unigram_features` is not referenced by any other visible cell.
unigram_features = unigrams_lex(all_headlines)

In [None]:
# Sanity check on the dimensions of the recomputed unigram array.
unigram_features.shape

(31998, 1)

In [None]:
# extract features: punctuation mark in string.punctuation
def count_puncs(texts):
  """Count punctuation-mark tokens in each text.

  Args:
    texts: iterable of strings (headlines).

  Returns:
    A float NumPy array with one row per text and one column per character of
    `string.punctuation`, in that string's order. Each cell is the number of
    tokens of the lower-cased text that are exactly that single character.
  """
  rows = []
  for headline in texts:
    # Only tokens equal to a single punctuation character are matched below.
    token_tally = Counter(nltk.word_tokenize(headline.lower()))
    rows.append([token_tally[mark] for mark in string.punctuation])
  return np.array(rows).astype(float)

In [None]:
# Build the punctuation count matrix for every headline.
punc_features = count_puncs(all_headlines)

In [None]:
# Sanity check: rows = headlines, columns = characters in string.punctuation.
punc_features.shape

(31998, 32)

In [None]:
# Evaluate the punctuation features on their own.
# X is the feature matrix (already a NumPy array); Y is the 0/1 label list as a NumPy array.
X = punc_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.5012524812441388


In [None]:
# Join the list of headlines into one string.
headlines_str = '\n'.join(all_headlines) # newline separator keeps each headline on its own line
# (joining with a space instead would produce one long line of headlines)

In [None]:
# Tokenize the whole lower-cased corpus once, producing a single flat token list
# (headline boundaries are not preserved in this list).
tokenized_headlines = nltk.word_tokenize(headlines_str.lower())

In [None]:
def get_avg_char_per_word(headlines):
  """Return the average number of characters per word.

  Args:
    headlines: a sequence of word strings, e.g. the tokens of one headline.

  Returns:
    Total characters across the words divided by the number of words, or 0.0
    when the sequence is empty.
  """
  # Divide by the length of the argument itself. The earlier version divided
  # by the length of the global corpus-wide token list, which scaled every
  # headline's value by the size of the whole corpus.
  num_words = len(headlines)
  if num_words == 0:
    return 0.0
  total_length = sum(len(word) for word in headlines)
  return total_length/num_words

In [None]:
def get_uniquetotal_words(headline):
  """Return the ratio of distinct words to total words.

  Args:
    headline: a sequence of hashable items, e.g. the tokens of one headline.

  Returns:
    Number of distinct items divided by the number of items, or 0.0 when the
    sequence is empty (previously this raised ZeroDivisionError).
  """
  total_words = len(headline)
  if total_words == 0:
    return 0.0
  num_unique_words = len(set(headline))
  return num_unique_words/total_words

In [None]:
def get_num_words(headline):
  """Return how many items `headline` contains (its length)."""
  word_total = len(headline)
  return word_total

In [None]:
def get_long_words(headline): # creating new item for list
  """Return the number of words with six or more characters.

  NOTE: a later cell defines `get_long_words` again; when the notebook is run
  top to bottom, that later definition replaces this one.

  Args:
    headline: an iterable of word strings, e.g. the tokens of one headline.

  Returns:
    The count of words whose length is at least 6 (0 for empty input).
  """
  # Initialize once, before the loop. Resetting it inside the loop discarded
  # the running total on every word and left the name undefined for empty input.
  long_count = 0
  for word in headline:
    if len(word) >= 6:
      long_count += 1
  return long_count

In [None]:
def get_long_words(headline): # creating new item for list
  """Return the number of words in `headline` that have six or more characters.

  The result is a single integer, not a list; it is 0 for empty input.
  """
  return sum(1 for word in headline if len(word) >= 6)

In [None]:
# NOTE(review): all_headlines is a list of whole headline strings, not tokens, so
# this counts headlines that are at least 6 characters long rather than long words.
get_long_words(all_headlines)

31998

In [None]:
# extract features: complexity
def complexity(headlines): # passing headlines, list of str (headlines)
  """Build four complexity features for each headline.

  Each headline is lower-cased and tokenized with NLTK, then described by,
  in column order:
    0. get_avg_char_per_word(tokens)  (average characters per word)
    1. get_uniquetotal_words(tokens)  (unique words / total words)
    2. len(tokens)                    (number of words)
    3. get_long_words(tokens)         (number of long words)

  Args:
    headlines: iterable of headline strings.

  Returns:
    A float NumPy array with one row per headline and the four columns above.
  """
  rows = []
  for text in headlines:
    words = nltk.word_tokenize(text.lower())
    rows.append([
      get_avg_char_per_word(words),
      get_uniquetotal_words(words),
      len(words),
      get_long_words(words),
    ])
  return np.array(rows).astype(float)

# Goal: four complexity features. For every headline, extract the four features and save them.
# There are four helper functions, one per complexity measure.
# Each helper takes a single headline and returns one number.

In [None]:
# Build the four-column complexity feature matrix for every headline.
complexity_features = complexity(all_headlines)

In [None]:
# Sanity check: rows = headlines, columns = the four complexity measures.
complexity_features.shape

(31998, 4)

In [None]:
# Evaluate the complexity features on their own.
# X is the feature matrix (already a NumPy array); Y is the 0/1 label list as a NumPy array.
X = complexity_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.5893177360112536


In [None]:
# extract own features
def crypto_nft(texts):
  """Count a fixed list of crypto/finance keywords in each text.

  Args:
    texts: iterable of strings (headlines).

  Returns:
    A float NumPy array with one row per text and one column per keyword, in
    the order the keywords are listed below. Each cell is the number of tokens
    of the lower-cased text that equal that keyword exactly.
  """
  keywords = ['crypto', 'nft', 'cryptocurrency', 'bitcoin', 'nfts', 'non-fungible','ethereum','blockchain', 'instant', 'profit', 'profits', 'money', 'rich', 'tips', 'investing']
  rows = []
  for headline in texts:
    token_tally = Counter(nltk.word_tokenize(headline.lower()))
    rows.append([token_tally[keyword] for keyword in keywords])
  return np.array(rows).astype(float)

In [None]:
# Build the crypto/finance keyword count matrix for every headline.
crypto_nft_features = crypto_nft(all_headlines)

In [None]:
# Sanity check: rows = headlines, columns = the fifteen keywords.
crypto_nft_features.shape

(31998, 15)

In [None]:
# Evaluate the crypto/finance keyword features on their own.
# X is the feature matrix (already a NumPy array); Y is the 0/1 label list as a NumPy array.
X = crypto_nft_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.5011876465301657


In [None]:
# Gather all six feature sets under short aliases so they can be
# concatenated column-wise in a later cell.
a = stop_words_features
b = pos_features
c = unigrams_features
d = punc_features
e = complexity_features
f = crypto_nft_features


In [None]:
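# Abandoned attempt, kept for reference. np.concatenate joins along axis 0 by default,
# so it requires the arrays' other dimensions to match; only three feature sets were tried.
# args = (a, b, c)
# all_features = np.concatenate(args)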

In [None]:
# Stack the six feature arrays side by side: one row per headline, with the
# columns of every extractor placed next to each other.
# NOTE(review): c (unigrams_features) is created with dtype=object, so the
# stacked array may inherit object dtype; confirm this is acceptable downstream.
all_features = np.column_stack((a, b, c, d, e, f))

In [None]:
# Evaluate all feature sets combined.
# X is the stacked feature matrix; Y is the 0/1 label list as a NumPy array.
X = all_features
Y = np.array(all_labels)

# Multinomial Naive Bayes scored by accuracy over 10 cross-validation folds;
# the mean of the per-fold accuracies is printed.

scores = cross_val_score(MultinomialNB(), X, Y, scoring='accuracy', cv=10)
print(scores.mean())

0.8893370096123789
