In [None]:
# Import Libraries
import pandas as pd
import urllib.request
import requests 
import nltk
import re, string, unicodedata
import httplib2
import json
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
# Fetch the NLTK data packages this notebook relies on: the stop-word lists,
# the Punkt tokenizer models, the averaged-perceptron POS tagger and WordNet.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
def get_URL(url):
  """Return the ``href`` value of every ``<a href=...>`` tag found at ``url``.

  Args:
    url: Address of the page to download.

  Returns:
    A list of href strings, in document order (empty if the page has no links).
  """
  # Open the URL once and close it deterministically. The previous version
  # called urlopen() twice (once for the body, once just to read the charset
  # header) and never closed either response.
  with urllib.request.urlopen(url) as response:
    charset = response.info().get_param('charset')
    markup = response.read()
  # Name the parser explicitly (the same one read_website_to_text uses) so the
  # result does not depend on which parsers happen to be installed.
  soup = BeautifulSoup(markup, 'html5lib', from_encoding = charset)
  return [anchor['href'] for anchor in soup.find_all('a', href = True)]

def read_website_to_text(url):
  """Download ``url`` and return the page text as one string.

  The HTML is parsed with html5lib, ``<script>`` and ``<style>`` elements are
  removed, and every run of newline/tab characters in the remaining text is
  replaced by a single space.
  """
  page = BeautifulSoup(requests.get(url).text, 'html5lib')
  # Strip code and CSS so they do not leak into the extracted text.
  for unwanted in page.find_all(["script", "style"]):
    unwanted.extract()
  fragments = re.split(r'[\n\t]+', page.get_text())
  return " ".join(fragments)
 
def get_metadata(url):
  """Return the ``content`` of every ``<meta name="description">`` tag at ``url``.

  Args:
    url: Address of the page to download.

  Returns:
    A list of description strings (empty if the page declares none).
  """
  # Single request, closed on exit. The previous version opened the URL twice
  # (body + charset header) and leaked both responses.
  with urllib.request.urlopen(url) as response:
    charset = response.info().get_param('charset')
    markup = response.read()
  # Explicit parser, matching the one read_website_to_text already uses.
  soup = BeautifulSoup(markup, 'html5lib', from_encoding = charset)
  # Skip description tags that carry no 'content' attribute instead of
  # raising KeyError on them.
  return [
    meta.attrs['content']
    for meta in soup.find_all('meta')
    if meta.attrs.get('name') == 'description' and 'content' in meta.attrs
  ]
 
# English stop words from the NLTK corpus, held in a set for fast membership
# tests; read by the pre-processing functions and by tf_idf below.
stop_words = set(stopwords.words('english'))

def sort_coo(coo_matrix):
  """Return ``(column index, value)`` pairs of ``coo_matrix`` ordered by value.

  Pairs are built from the matrix's ``col`` and ``data`` attributes and sorted
  in descending order of value; ties are broken by the higher column index
  first.
  """
  pairs = list(zip(coo_matrix.col, coo_matrix.data))
  pairs.sort(key = lambda pair: (pair[1], pair[0]), reverse = True)
  return pairs

def pre_processing(text):
  """Normalize ``text`` and return it as one space-separated string.

  Pipeline: replace disallowed characters with spaces, tokenize, lower-case,
  drop English stop words, Porter-stem, then WordNet-lemmatize each token
  (first as a verb, then as a noun).

  Args:
    text: Raw document text.

  Returns:
    The processed tokens joined by single spaces ('' when nothing survives).
  """
  # Keep letters, digits and the listed punctuation; everything else becomes a
  # space. The previous pattern left '[' and ']' unescaped, so the character
  # class closed early and the remainder was parsed as an anchor plus an
  # alternation that could only match the literal text '}~]' — in practice no
  # special character was ever removed. The unescaped '-' also formed the
  # range '+'..'.', which silently kept commas.
  cleaned = re.sub(r'[^a-zA-Z0-9_.!#$%&()*+\-/:;<=>?@\[\]^`{|}~]', ' ', text)

  # Build the stemmer/lemmatizer once instead of once per token.
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  process_tokens = []
  for token in word_tokenize(cleaned):
    word = token.strip().lower()
    if word in stop_words:  # Removing Stopwords
      continue
    word = stemmer.stem(word)  # Stemming
    word = lemmatizer.lemmatize(word, pos = 'v')  # Lemmatization (verb)
    word = lemmatizer.lemmatize(word, pos = 'n')  # Lemmatization (noun)
    process_tokens.append(word)
  return ' '.join(process_tokens)

def top_n(feature_names, sorted_items, topn):
  """Map the first ``topn`` entries of ``sorted_items`` to ``{feature: score}``.

  ``sorted_items`` holds ``(index, score)`` pairs; each index is looked up in
  ``feature_names`` and each score is rounded to three decimals. Entries keep
  the order in which they appear in ``sorted_items``.
  """
  ranked = {}
  for column, weight in sorted_items[:topn]:
    rounded = round(weight, 3)
    ranked[feature_names[column]] = rounded
  return ranked

def preprocess(text):
  """Normalize ``text``, print its POS tags, and return the token list.

  Pipeline: replace disallowed characters with spaces, tokenize, lower-case,
  drop English stop words, Porter-stem, then WordNet-lemmatize each token
  (first as a verb, then as a noun). The part-of-speech tags of the resulting
  tokens are printed as a side effect.

  Args:
    text: Raw document text.

  Returns:
    The list of processed tokens.
  """
  # Keep letters, digits and the listed punctuation; everything else becomes a
  # space. The previous pattern left '[' and ']' unescaped, so the character
  # class closed early and the remainder was parsed as an anchor plus an
  # alternation that could only match the literal text '}~]' — in practice no
  # special character was ever removed. The unescaped '-' also formed the
  # range '+'..'.', which silently kept commas.
  cleaned = re.sub(r'[^a-zA-Z0-9_.!#$%&()*+\-/:;<=>?@\[\]^`{|}~]', ' ', text)

  # Build the stemmer/lemmatizer once instead of once per token.
  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  process_tokens = []
  for token in word_tokenize(cleaned):
    word = token.strip().lower()
    if word in stop_words:  # Removing Stopwords
      continue
    word = stemmer.stem(word)  # Stemming
    word = lemmatizer.lemmatize(word, pos = 'v')  # Lemmatization (verb)
    word = lemmatizer.lemmatize(word, pos = 'n')  # Lemmatization (noun)
    process_tokens.append(word)

  print(nltk.pos_tag(process_tokens)) # Part-of-Speech Tagging
  return process_tokens

def tf_idf(text, corpus, topn = 300):
  """Score the terms of ``text`` by TF-IDF, with IDF learned from ``corpus``.

  Args:
    text: The (pre-processed) document to score.
    corpus: Iterable of (pre-processed) documents used to fit the vocabulary
      and the IDF weights.
    topn: Maximum number of terms to return (default 300, the value that was
      previously hard-coded).

  Returns:
    A dict mapping term -> TF-IDF score rounded to 3 decimals, highest first.
  """
  # Pass the stop words as a list: recent scikit-learn releases validate this
  # parameter and do not accept a set. Sorting keeps the order deterministic.
  vectorizer = CountVectorizer(stop_words = sorted(stop_words))
  term_counts = vectorizer.fit_transform(corpus)
  # TfidfTransformer to compute the idf
  tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True).fit(term_counts)

  # get_feature_names() was removed in scikit-learn 1.2; use its replacement
  # when available and fall back to the old name on older installations.
  if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
  else:
    feature_names = vectorizer.get_feature_names()

  # generate tf-idf for the given document
  document_vector = tfidf_transformer.transform(vectorizer.transform([text]))
  return top_n(feature_names, sort_coo(document_vector.tocoo()), topn)

def save_jsonfile(filename, data):
  """Serialize ``data`` as JSON into ``filename``, overwriting any existing file."""
  out_file = open(filename, 'w')
  try:
    json.dump(data, out_file)
  finally:
    # Always release the handle, even if serialization fails.
    out_file.close()

def stemming(text):
  """Pair every token of ``text`` with its Porter stem.

  Args:
    text: Text to tokenize and stem.

  Returns:
    A list of strings of the form ``"<token> : <stem>"``, one per token, in
    order of appearance (an empty list when ``text`` has no tokens).
  """
  # Stemming or Morphological Analysis
  # The previous version returned from inside the loop, so only the first
  # token was ever reported (and None for empty text). Each pair is a single
  # string so that it can be written as one line by save_textfile.
  stemmer = PorterStemmer()
  return ["%s : %s" % (w, stemmer.stem(w)) for w in word_tokenize(text)]

def save_textfile(filename, data):
  """Write each item of ``data`` to ``filename`` on its own line.

  Items are formatted with ``%s``; an existing file is overwritten.
  """
  with open(filename, 'w') as out_file:
    out_file.writelines("%s\n" % entry for entry in data)

In [None]:
def main(urls = None):
  """Run the whole pipeline for each URL and save/print every intermediate result.

  For every URL this writes the sentence-split page text, the links and the
  description metadata; for every downloaded document it then writes the
  pre-processed tokens, the stemming report and the TF-IDF scores. Each result
  is also printed.

  Args:
    urls: Optional list of page addresses. Defaults to the two pages this
      notebook was written for.
  """
  if urls is None:
    urls = ["http://www.multimediaeval.org/mediaeval2019/memorability/", "https://sites.google.com/view/siirh2020/"]
  data = []

  # HTML Parsing and saving all the text in data
  for i, url in enumerate(urls):
    # Download each page once and reuse the text (it used to be fetched three
    # times per URL: for the file, for the print-out and for `data`).
    page_text = read_website_to_text(url)
    sentences = sent_tokenize(page_text)
    save_textfile("HTML_Parsed_Document_" + str(i+1), sentences)
    print("HTML_Parsed_Document_" + str(i+1), sentences)
    data.append(page_text)

  print("\n")
  # Saving all the links present in each document
  for i, url in enumerate(urls):
    links = get_URL(url)
    save_textfile("Links_in_Document_" + str(i+1), links)
    print("Links_in_Document_" + str(i+1), links)

  print("\n")
  # Saving meta data present in each link
  # The loop variable used to be named `dat` while the body read the stale
  # `url` left over from the previous loop, so every file described the last
  # URL only.
  for i, url in enumerate(urls):
    metadata = get_metadata(url)
    save_textfile("Meta_Data_in_link_" + str(i+1), metadata)
    print("Meta_Data_in_link_" + str(i+1), metadata)

  print("\n")
  # Saving Pre-processed Data
  for i, dat in enumerate(data):
    print("Part-of-Speech Tagging_" + str(i+1))
    # preprocess() prints the POS tags itself; calling it once avoids the
    # duplicated print-out and the duplicated work.
    tokens = preprocess(dat)
    save_textfile("pre-process_in_Document_" + str(i+1), tokens)
    print("pre-process_in_Document_" + str(i+1), tokens)
    print("\n")

  # Extracting corpus for calculating tf-idf scores
  corpus = [pre_processing(dat) for dat in data]

  # Extracting inflected forms of a word (Stemming)
  for i, dat in enumerate(corpus):
    stems = stemming(dat)
    save_textfile("Stemming_in_Document_" + str(i+1), stems)
    print("Stemming_in_Document_" + str(i+1), stems)

  print("\nMost useful words and phrases for indexing purpose:\n")
  # Saving all the tf-idf scores in each document
  for i, dat in enumerate(corpus):
    scores = tf_idf(dat, corpus)
    save_jsonfile("TF_IDF_scores_in_Document_" + str(i+1)+ ".json", scores)
    print("TF_IDF_scores_in_Document_" + str(i+1), scores)

In [None]:
# Run the pipeline: downloads the pages, writes the result files to the
# current working directory and prints each result.
main()

HTML_Parsed_Document_1 [' Media Memorability MediaEval Benchmarking Initiative for Multimedia Evaluation The "multi" in multimedia: speech, audio, visual content, tags, users, context HomeAbout MediaEvalDatasetsMediaEval 2019Emotion & Themes in MusicEyes & Ears TogetherGameStoryLifelogging for wellbeingMedico MultimediaMultimedia RecSysMultimedia SatelliteNo-audio speech turnsPixel PrivacyMedia MemorabilityScene ChangeSports videoMediaEval 2018MediaEval 2017MediaEval 2016MediaEval 2015MediaEval 2014MediaEval 2013MediaEval 2012MediaEval 2011MediaEval 2010VideoCLEF 2009VideoCLEF 2008Videos about MediaEvalWhy Participate?Who are we?Open ScienceAcknowledgments     The 2019 Predicting Media Memorability TaskTask descriptionThis task focuses on the problem of predicting how memorable a video is to viewers.', 'It requires participants to automatically predict memorability scores for videos that reflect the probability a video will be remembered.', 'Task participants are provided with an exten