## Homework #0 - CSE 572 Fall '23
### Tyler Fichiera

#### Provided code for tokenizing the article text

In [14]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

# Porter stemmer from nltk; a single instance is reused by get_tokens for every token
stemmer = PorterStemmer()

# Translation table for str.translate: every punctuation code point maps to
# None, which makes translate() drop that character from the text.
remove_punctuation_map = {ord(symbol): None for symbol in string.punctuation}

def get_tokens(text):
  """Turn raw article text into a list of stemmed unigrams.

  Steps, in order: lowercase, strip punctuation via remove_punctuation_map,
  tokenize with nltk.word_tokenize, drop English stop words, then stem each
  remaining token with the module-level Porter stemmer.

  Args:
    text: the document as a str.

  Returns:
    A list of stemmed tokens in their original order (duplicates kept).
  """
  # turn document into lowercase
  lowers = text.lower()
  # remove punctuations
  no_punctuation = lowers.translate(remove_punctuation_map)
  # tokenize document
  tokens = nltk.word_tokenize(no_punctuation)
  # Fetch the stop-word list once per call and keep it in a set: the list
  # used to be re-requested and scanned linearly for every single token.
  stop_words = set(stopwords.words('english'))
  # remove stop words
  filtered = [w for w in tokens if w not in stop_words]
  # stemming process -> final unigrams
  return [stemmer.stem(item) for item in filtered]

#### Load the dictionary from the given text file

In [15]:
# Read the vocabulary file: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each entry.
dictionary = []
with open('dictionary.txt', 'r') as dictionary_file:
    for raw_line in dictionary_file:
        dictionary.append(raw_line.strip())

#### Load the news articles from the given CSV file

In [16]:
import csv, math, json
from collections import Counter, defaultdict

# article_id -> category label read from the CSV
article_categories = {}

# article_id -> {dictionary word: number of occurrences in that article}
unigram_occurrences_by_article = {}

# dictionary word -> number of articles that contain it at least once
dictionary_word_article_occcurences = defaultdict(int)

# total number of articles read
N = 0

# newline='' hands line-ending handling to the csv module, so newlines embedded
# in quoted fields are parsed correctly.
with open('news-train-1.csv', 'r', newline='') as csv_file:
  csv_reader = csv.reader(csv_file)

  next(csv_reader) # skip header row

  # every data row is expected to have exactly three columns
  for article_id, article_text, article_category in csv_reader:
    N += 1

    article_unigrams = get_tokens(article_text)
    article_categories[article_id] = article_category

    # Tally all tokens in one pass; looking each dictionary word up in the
    # tally replaces a full rescan of the token list per dictionary word.
    token_counts = Counter(article_unigrams)

    unigram_occurences = {}
    for word in dictionary:
      # Counter returns 0 for words that never occurred in this article
      num_of_occurences = token_counts[word]

      if num_of_occurences > 0:
        dictionary_word_article_occcurences[word] += 1

      unigram_occurences[word] = num_of_occurences

    unigram_occurrences_by_article[article_id] = unigram_occurences

#### Compute the TF-IDF matrix and the top words per category, then write the results to txt/json files

In [17]:
# category -> word -> total raw occurrences over that category's articles
category_word_frequencies = defaultdict(lambda: defaultdict(int))
# category -> word -> list of TF-IDF scores, one entry per article in the category
category_word_tfidf_scores = defaultdict(lambda: defaultdict(list))

# One row per article (iteration order of unigram_occurrences_by_article),
# one column per dictionary word (dictionary order).
TFIDF_matrix = []

# IDF depends only on the word, so compute it once per word instead of once
# per (article, word) pair. A word found in no article gets IDF 0 rather than
# dividing by zero; its TF is 0 in every article, so its score is 0 either way.
IDF_by_word = {}
for word in dictionary:
  article_count = dictionary_word_article_occcurences.get(word, 0)
  IDF_by_word[word] = math.log(N / article_count) if article_count > 0 else 0.0

for article_id in unigram_occurrences_by_article:
  matrix_row = []

  article_unigrams = unigram_occurrences_by_article[article_id]
  article_category = article_categories[article_id]
  # TF is normalised by the most frequent dictionary word in this article.
  # default=0 covers an empty dictionary.
  max_occurences = max(article_unigrams.values(), default=0)

  for word in dictionary:
    # An article with no dictionary words at all has TF 0 for every word
    # (the unguarded division raised ZeroDivisionError here).
    TF_ij = article_unigrams[word] / max_occurences if max_occurences > 0 else 0.0
    IDF_j = IDF_by_word[word]
    TFIDF_ij = TF_ij * IDF_j

    matrix_row.append(TFIDF_ij)

    category_word_tfidf_scores[article_category][word].append(TFIDF_ij)
    category_word_frequencies[article_category][word] += article_unigrams[word]

  TFIDF_matrix.append(matrix_row)

# Persist the TF-IDF matrix as plain text: one line per row, values
# separated by commas.
with open('matrix.txt', 'w') as matrix_file:
  matrix_file.writelines(
    ','.join(str(score) for score in score_row) + '\n'
    for score_row in TFIDF_matrix
  )

# For each category keep the three words with the highest total frequency.
top_words_by_category_frequency = {}
for category, word_frequencies in category_word_frequencies.items():
  ranked_by_frequency = sorted(
    word_frequencies.items(), key=lambda pair: pair[1], reverse=True
  )
  top_words_by_category_frequency[category] = dict(ranked_by_frequency[:3])

# For each category keep the three words with the highest mean TF-IDF score
# (mean taken over the scores collected for that category).
top_words_by_category_scores = {}
for category, word_tfidf_scores in category_word_tfidf_scores.items():
  mean_scores = [
    (word, sum(scores) / len(scores))
    for word, scores in word_tfidf_scores.items()
  ]
  ranked_by_score = sorted(mean_scores, key=lambda pair: pair[1], reverse=True)
  top_words_by_category_scores[category] = dict(ranked_by_score[:3])

# Dump each per-category top-word summary to its own JSON file,
# pretty-printed with a 4-space indent.
json_outputs = (
  ('frequency.json', top_words_by_category_frequency),
  ('scores.json', top_words_by_category_scores),
)
for output_path, payload in json_outputs:
  with open(output_path, 'w') as json_file:
    json.dump(payload, json_file, indent=4)