In [120]:
import os
import numpy as np
import gensim
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import string
import pandas as pd
import sklearn

In [303]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        with open(path) as f:
            docs.append(f.read())
    return docs

def parse_tags(tags):
    result = []
    for doc in tags:
        doc = doc.strip('\n').split('\n')
        cat_it = []
        if doc[0] == '':
            result.append('no tag')
        else:
            for tag in doc:
                split_tag = tag.split(':')
                cat_it.append(split_tag[0])
                cat_it.append(split_tag[1])
            parsed = (' ').join(list(set(cat_it)))
            result.append(parsed)
    return np.array(result)

In [304]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct
        
        data[i] = data[i].lower()

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(set(word_tokens))
        
    return data

# CREATE THE BAG OF WORDS DICTIONARY
def create_bow(strings):
    BOW = {}
    for description in strings:
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            BOW[word] = 0
    BOW['null'] = 0
    return BOW

def create_bow_vectors(data, BOW):
    feature_vectors = []
    for description in data:
        feat_vec = BOW.copy()
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            if word in feat_vec:
                feat_vec[word] += 1   
        feature_vectors.append(feat_vec)

    # TURN DICTIONARIES INTO A MATRIX with each row as one description
    feature_vector_matrix = []
    for feature_vec in feature_vectors:
        feature_vector_matrix.append(list(feature_vec.values()))

    # NORMALIZE THE FEATURES
    feature_vector_matrix = sklearn.preprocessing.normalize(feature_vector_matrix) # default is L2 norm
    return feature_vector_matrix

In [305]:
train_descs = parse_descriptions('cs5785-fall19-final/descriptions_train', 10000)
test_descs  = parse_descriptions('cs5785-fall19-final/descriptions_test', 2000)
train_descs = preprocessing(train_descs)
test_descs  = preprocessing(test_descs)
train_tags = parse_tags(parse_descriptions("cs5785-fall19-final/tags_train", num_doc=10000))
test_tags = parse_tags(parse_descriptions("cs5785-fall19-final/tags_test", num_doc=2000))

In [306]:
BoW = create_bow(np.array(list(train_descs)+list(test_descs)))
train_desc_vecs = create_bow_vectors(train_descs, BoW)
test_desc_vecs = create_bow_vectors(test_descs, BoW) 
train_tag_vecs = create_bow_vectors(train_tags, BoW)
test_tag_vects = create_bow_vectors(test_tags,BoW)

In [364]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain, ytest = train_test_split(train_desc_vecs, train_tag_vecs)

In [365]:
s = sklearn.metrics.pairwise_distances(xtest, ytest)

In [366]:
best = []
for desc in s:
    best.append(np.argsort(desc)[:20])

In [367]:
def scores(i):
    return (20+1-(i+1))/20

def eval_accuracy(neighbors):
    # EVALUATE THE MODEL USING THE MEAN AVERAGE PRECISION AT 20
    scs = []
    for i in range(len(neighbors)):
        good = False
        for j, n in enumerate(neighbors[i]):
            if i == n:
                scs.append(scores(j))
                good = True
                break
        if good == False:
            scs.append(0)
    return sum(scs)/len(neighbors)

In [368]:
eval_accuracy(best)

0.3406599999999997