In [1]:
import nltk
#import sklearn
import string
import pickle	# this is for saving and loading your trained classifiers.
import re 
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.stem import  WordNetLemmatizer
import contractions
from nltk.corpus import stopwords 
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import FreqDist
from nltk.corpus import words
from nltk import NaiveBayesClassifier, ConfusionMatrix



In [2]:
def preprocess(filename):	#filename is of type string. example call: preprocess("philosophy_test.txt")	

	filepath = filename					# you might need to change filepath depending where do you store your data files.
	file = open(filepath, 'r', encoding='utf-8')			# 'r' for read
	lines = file.read().splitlines()	# lines is a list holding each line of your file as strings. e.g. lines = ["This is the 1st line of the file", "This is the 2nd line of the file", ...]
	file.close()

	processed = []		# fill this list with the preprocessed form of the text in the file. you may change to another data structure, if you need. 
						# do not forget to label your documents. 
						# if a document from Class1 becomes "world case speak silent" after basic text processing steps; after labeling, it will look like ("world case speak silent", Class1)
	genre = filename.split("/")[3].split("_")[0]
 
	len_doc = len(lines)
	lemmatizer = WordNetLemmatizer()
	stop_words = set(stopwords.words('english'))
	english_words_set = set(words.words())
	for i in range(0, len_doc, 2):
     
		#-----------------------------remove punctuation-----------------------
		lines[i] = re.sub(r'[\(\)\.\:\!\?\;\,\.\€\-\"\Ã\¦\•\â\Â\”]', ' ',lines[i])
		lines[i+1] = re.sub(r'[\(\)\.\:\!\?\;\,\.\€\-\"\Ã\¦\•\â\Â\”]', ' ',lines[i+1])
		
  		#-----------------------------tokenize---------------------------------
		opened_title = contractions.fix(lines[i]) # he's -> he is
		tokened_title = opened_title.split()
		tokened_title = [word.lower() for word in tokened_title if word.isalnum() and (word.lower() not in stop_words) and (word[0].isupper() or (word.lower() in english_words_set) ) ]
		
		opened_description = contractions.fix(lines[i+1]) # he's -> he is
		tokened_description = opened_description.split()
		tokened_description = [word.lower() for word in tokened_description if word.isalnum() and (word.lower() not in stop_words) and (word[0].isupper() or (word.lower() in english_words_set) ) ]

		#----------------------------- lemmatization---------------------------------
		for word in tokened_title:
			word = lemmatizer.lemmatize(word)
		for word in tokened_description:
			word = lemmatizer.lemmatize(word)
		
		#-----------------------------bag of words---------------------------------
		preprocessed_title = " ".join(tokened_title)
		preprocessed_descr = " ".join(tokened_description)
		processed.append((preprocessed_title + " " + preprocessed_title + " " + " " + preprocessed_descr, genre)) 

	return processed 	# you may change the return value if you need.



In [4]:
######################################################################################################################################################
def create_megadoc(type):
	documents = [f"philosophy_{type}.txt",f"sports_{type}.txt",f"mystery_{type}.txt",f"religion_{type}.txt",f"science_{type}.txt",f"romance_{type}.txt",f"horror_{type}.txt",f"science-fiction_{type}.txt"]
	megadoc = []
	basedir = f"./data/{type}/"

	genre_based_words_dict = dict()
	for filename in documents:
		sentences_for_curr_genre = preprocess(basedir + filename)
		megadoc += sentences_for_curr_genre
		all_sentences, genres = zip(*sentences_for_curr_genre)
		genre_based_words_dict[filename.split("_")[0]] = all_sentences
  
	return megadoc, genre_based_words_dict
	
####################################################################################################################################################


In [5]:
def feature_selection(X_train, y_train, genre_based_words_dict):    
    # ------------------ TF-IDF + chi square ------------------
    tfidf_vectorizer = TfidfVectorizer(max_features=9000)  
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    
    chi2_result = chi2(X_train_tfidf, y_train)
    selected_feature_indices = np.argsort(chi2_result[0])[::-1]
    top_n_features = 1200
    selected_features = set([tfidf_vectorizer.get_feature_names_out()[i] for i in selected_feature_indices[:top_n_features]])
    
    
    # ------------------ FreqDist ------------------ did not work well
    # frequently_used_words = set()
    # for genre, sentences in genre_based_words_dict.items():
    #     all_words = [word for word in " ".join(sentences).split()]
    #     fdist = FreqDist(all_words)
    #     top_n_words = set([w for w, c in fdist.most_common(500)])
    #     frequently_used_words = frequently_used_words.union(top_n_words)
        
    # selected_features = set(selected_features).union(frequently_used_words)
    
    # # ------------------ Anova ------------------ did not work well
    # anova_f = SelectKBest(f_classif, k=600)  
    # anova_f_train = anova_f.fit_transform(X_train_tfidf, y_train)

    # selected_feature_indices = anova_f.get_support(indices=True)
    # feature_names = tfidf_vectorizer.get_feature_names_out()

    # # Get the selected words
    # anova_selected_words = set([feature_names[i] for i in selected_feature_indices])
    # selected_features = selected_features.union(anova_selected_words)
    eliminate_this = {'ice','dr', 'ever','art','ex','ty','eve', 'one', 'us','de', 'thing', 'age', 'han', 'old','young','get'} #these words are obtained from development set
    selected_features = selected_features.difference(eliminate_this)
    
    return list(selected_features)

In [6]:
def extract_features(X_sth, y_sth, selected_features):		# megadoc can be either training_megadoc for training phase or test_megadoc for testing phase.
	extracted_features = list()
	x_and_y = list(zip(X_sth, y_sth))
	for sentence, genre in x_and_y:
		curr_sent_features = dict()
		for feature in selected_features:
			curr_sent_features[feature] = feature in sentence
		extracted_features.append((curr_sent_features,genre))

	return extracted_features

In [7]:
def save_classifier(classifier, filename):	#filename should end with .pickle and type(filename)=string
	with open(filename, "wb") as f:
		pickle.dump(classifier, f)
	return
	
	
def load_classifier(filename):	#filename should end with .pickle and type(filename)=string
	classifier_file = open(filename, "rb")
	classifier = pickle.load(classifier_file)
	classifier_file.close()
	return classifier


In [8]:
def train(classifier, train_features):	# classifier is either nltk.NaiveBayesClassifier or SklearnClassifier(SVC()). Example call: train(SklearnClassifier(SVC()), training_set)
	return classifier.train(train_features)

	
def test(classifier, test_features):	# classifier is either nltk.NaiveBayesClassifier or SklearnClassifier(SVC()). Example call: test(SklearnClassifier(SVC()), test_set)
	features_dict, genres = zip(*test_features)
	y_pred = classifier.classify_many(features_dict)
	genres = list(genres)
	y_pred = list(y_pred)
	confusion_matrix = ConfusionMatrix(genres,y_pred)
	accuracy = nltk.scores.accuracy(genres, y_pred)
	print(f"Accuracy: {accuracy}")
	return confusion_matrix

def examine_confusion(confusion_matrix):
	print(confusion_matrix.pretty_format(sort_by_count=True))
	print(confusion_matrix.evaluate())


In [9]:
#--------------main--------------
train_megadoc, genre_based_words_dict_train = create_megadoc("train")
X_train, y_train = zip(*train_megadoc)
dev_megadoc, genre_based_words_dict_dev = create_megadoc("dev")
X_dev, y_dev = zip(*dev_megadoc)
test_megadoc, genre_based_words_dict_test = create_megadoc("test")
X_test, y_test = zip(*test_megadoc)


In [10]:
print("size train: ", len(train_megadoc))
print("size dev: ", len(dev_megadoc))
print("size test: ", len(test_megadoc))

size train:  6536
size dev:  933
size test:  1865


In [11]:
train_megadoc[0] #example of a document

('republic republic  presented form dialogue socrates three different classic text enquiry notion perfect community ideal individual within conversation raised goodness reality knowledge republic also purpose education role men people remarkable lucidity deft use allegory plato depiction state bound harmony philosopher',
 'philosophy')

In [12]:
selected_fatures = feature_selection(X_train, y_train, genre_based_words_dict_train)

In [14]:
len(selected_fatures)

1184

In [15]:
train_features = extract_features(X_train, y_train, selected_fatures)
dev_features = extract_features(X_dev,y_dev, selected_fatures)
test_features = extract_features(X_test, y_test, selected_fatures)

In [16]:
nbc_classifier = train(NaiveBayesClassifier,train_features)

save_classifier(nbc_classifier, "naive_bayes.pickle")

nbc_confusion_matrix = test(nbc_classifier,dev_features)

examine_confusion(nbc_confusion_matrix)

Accuracy: 0.7341907824222936
                |     s                   |
                |     c                   |
                |     i                   |
                |     e                   |
                |     n                   |
                |     c              p    |
                |     e              h    |
                |     -        r     i    |
                |  m  f        e  s  l  r |
                |  y  i  h  s  l  c  o  o |
                |  s  c  o  p  i  i  s  m |
                |  t  t  r  o  g  e  o  a |
                |  e  i  r  r  i  n  p  n |
                |  r  o  o  t  o  c  h  c |
                |  y  n  r  s  n  e  y  e |
----------------+-------------------------+
        mystery |<91> 3 14  1  .  .  . 11 |
science-fiction |  4<85>19  2  1  2  .  7 |
         horror | 14  8<82> 1  .  .  1 12 |
         sports |  .  .  1<99> 1  .  . 16 |
       religion |  2  5  6  .<78> 3 18  3 |
        science |  2 14  8  .  2<85> 4  . |
   

In [79]:
nbc_test_cm = test(nbc_classifier,test_features)
examine_confusion(nbc_test_cm)

Accuracy: 0.7190348525469169
                |       s                         |
                |       c                         |
                |       i                         |
                |       e                         |
                |       n                         |
                |       c                   p     |
                |       e                   h     |
                |       -           r       i     |
                |   m   f           e   s   l   r |
                |   y   i   s   h   l   c   o   o |
                |   s   c   p   o   i   i   s   m |
                |   t   t   o   r   g   e   o   a |
                |   e   i   r   r   i   n   p   n |
                |   r   o   t   o   o   c   h   c |
                |   y   n   s   r   n   e   y   e |
----------------+---------------------------------+
        mystery |<178>  6   .  36   3   2   .  15 |
science-fiction |   9<153>  .  45   3   8   4  18 |
         sports |   1   5<181>  7  

## Test features examination

In [81]:
features_dict, genres = zip(*test_features)
size_genres = len(genres)
horror_dict = dict()
mys_dict = dict()
philosophy_dict = dict()
for i in range(size_genres):
    if genres[i] == "horror":
        curr_dict = features_dict[i]
        for key in curr_dict.keys():
            if curr_dict[key] == True:
                if key in horror_dict.keys():
                    horror_dict[key] += 1
                else:
                    horror_dict[key] = 1
                    
    if genres[i] == "mystery":
        curr_dict = features_dict[i]
        for key in curr_dict.keys():
            if curr_dict[key] == True:
                if key in mys_dict.keys():
                    mys_dict[key] += 1
                else:
                    mys_dict[key] = 1
                    
    if genres[i] == "philosophy":
        curr_dict = features_dict[i]
        for key in curr_dict.keys():
            if curr_dict[key] == True:
                if key in philosophy_dict.keys():
                    philosophy_dict[key] += 1
                else:
                    philosophy_dict[key] = 1

In [82]:
sorted_dict_mys = sorted(mys_dict.items(), key=lambda x: x[1])
sorted_dict_horro = sorted(horror_dict.items(), key=lambda x: x[1])
sorted_dict_philo = sorted(philosophy_dict.items(), key=lambda x: x[1])

In [88]:
print(list(sorted_dict_mys[-10:]))

[('found', 57), ('ring', 59), ('win', 60), ('death', 61), ('dead', 64), ('life', 64), ('king', 67), ('kill', 68), ('murder', 73), ('pro', 74)]


In [89]:
print(list(sorted_dict_horro[-10:]))

[('ali', 53), ('dead', 54), ('pro', 56), ('night', 56), ('evil', 57), ('life', 59), ('dark', 60), ('war', 60), ('world', 63), ('king', 71)]


In [90]:
print(list(sorted_dict_philo[-10:]) )

[('king', 61), ('vol', 62), ('life', 75), ('world', 76), ('philosophy', 82), ('du', 95), ('ali', 97), ('book', 97), ('work', 102), ('pro', 112)]


In [86]:
svc_classifier = train(SklearnClassifier(SVC(kernel='rbf')),train_features)
save_classifier(svc_classifier, "svc_classifier.pickle")
svc_confusion_matrix = test(svc_classifier,dev_features)
examine_confusion(svc_confusion_matrix)


Accuracy: 0.7063236870310825
                |     s                   |
                |     c                   |
                |     i                   |
                |     e                   |
                |     n                   |
                |     c              p    |
                |     e              h    |
                |     -        r     i    |
                |  m  f        e  s  l  r |
                |  y  i  h  s  l  c  o  o |
                |  s  c  o  p  i  i  s  m |
                |  t  t  r  o  g  e  o  a |
                |  e  i  r  r  i  n  p  n |
                |  r  o  o  t  o  c  h  c |
                |  y  n  r  s  n  e  y  e |
----------------+-------------------------+
        mystery |<86> 6 16  1  .  .  . 11 |
science-fiction |  7<85>17  1  1  2  .  7 |
         horror | 15 16<73> 3  1  .  2  8 |
         sports |  .  .  1<99> 1  .  1 15 |
       religion |  2  4  4  .<81> 6 14  4 |
        science |  . 11  4  2  5<88> 4  1 |
   

In [87]:
svc_confusion_matrix_test = test(svc_classifier,test_features)
examine_confusion(svc_confusion_matrix_test)

Accuracy: 0.7045576407506703
                |       s                         |
                |       c                         |
                |       i                         |
                |       e                         |
                |       n                         |
                |       c                   p     |
                |       e                   h     |
                |       -           r       i     |
                |   m   f           e   s   l   r |
                |   y   i   s   h   l   c   o   o |
                |   s   c   p   o   i   i   s   m |
                |   t   t   o   r   g   e   o   a |
                |   e   i   r   r   i   n   p   n |
                |   r   o   t   o   o   c   h   c |
                |   y   n   s   r   n   e   y   e |
----------------+---------------------------------+
        mystery |<174> 11   3  33   4   1   .  14 |
science-fiction |  11<157>  .  41   3   6   4  18 |
         sports |   1   5<184>  4  