## Classification with KNN

In this lecture we will use KNN to classify the NYT article and compare the results with Naive Bayes

## Loading and preprocessing articles

In [None]:
import operator
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Define text preprocessing function

In [3]:
def remove_punctuation(text):
    punctuations = ".,\"-\\/#!?$%\^&\*;:{}=\-_'~()"    
    filtered_text = [token for token in text if (not token in punctuations)]
    return filtered_text

def apply_stopwording(text, min_len):
    filtered_text = [token for token in text if (not token in stopwords.words('english') and len(token)>min_len)]
    return filtered_text

def apply_stemming(text):
    stemmer = nltk.PorterStemmer()
    normalized_text = [stemmer.stem(token) for token in text]
    return normalized_text

def apply_lemmatization(text):
    lemmatizer = nltk.WordNetLemmatizer()
    normalized_text = [lemmatizer.lemmatize(token) for token in text]
    return normalized_text

Load and preprocess the articles in the training set

In [5]:
in_path = 'C:\\tmp\\'
filePrefix = 'training_'
categories=['ARTS','SPORTS']
training = {}

for category in categories:
    index=1
    # Read articles within category
    fileName=in_path+filePrefix+category.lower()
    f=open(fileName,'r')
    lines=f.readlines()
    for line in lines:
        text = line.replace('\n',' ').lower()
        tokens = nltk.word_tokenize(text)
        training[str(apply_lemmatization(apply_stopwording(remove_punctuation(nltk.Text(tokens)), 3)))]=category
    f.close()

for key in training.keys():
    print (key)

['excerpt', 'interview', 'rapper', 'kendrick', 'lamar', 'discus', 'critically', 'acclaimed', 'album', 'pimp', 'butterfly', 'recent', 'grammy', 'nomination']
['york', 'time', 'critic', 'manohla', 'dargis', 'scott', 'stephen', 'holden', 'list', 'pick', 'oscar-worthy', 'film', 'performance', '2015']
['caramanica', 'profile', 'blogger', 'stephen', 'carbone', 'posting', 'spoiler', 'reality', 'show', 'bachelor', 'past', 'four', 'year', 'website', 'realitysteve.com', 'reflects', 'show', 'begin', '20th', 'season', 'carbone', 'continues', 'information', 'source']
['adam', 'mckay', 'director', 'co-writer', 'caper', 'movie', 'short', 'discus', 'used', 'humor', 'explain', 'complicated', 'aspect', '2008', 'housing', 'banking', 'crisis']
['zachary', 'woolfe', 'offer', 'highlight', 'season', 'amazon', 'original', 'series', 'mozart', 'jungle', 'starring', 'gael', 'garcia', 'bernal', 'show', 'feature', 'behind-the-scenes', 'look', 'semi-fictional', 'york', 'symphony', 'orchestra']
['pareles', 'review',

Load and preprocess the testing set

In [7]:
f=open('C:\\tmp\\testing.txt','r')
lines=f.readlines()
testing=[]
for line in lines:
    article = line.replace('\n',' ').split('\t')
    text = article[0].lower()
    category = article[1]
    tokens = nltk.word_tokenize(text)
    key = str(apply_lemmatization(apply_stopwording(remove_punctuation(nltk.Text(tokens)), 3)))
    training[key]='NONE-'+category
    testing.append(key)
f.close

<function TextIOWrapper.close()>

## Creating TF-IDF weights

In [8]:
tfidf_vectorizer = TfidfVectorizer(min_df=1)
articles = training.keys()
tfidf = tfidf_vectorizer.fit_transform(articles)

In [31]:
print (tfidf[0])

  (0, 672)	0.244136553915
  (0, 982)	0.211548746862
  (0, 1511)	0.265935660043
  (0, 1040)	0.30320147811
  (0, 1069)	0.30320147811
  (0, 589)	0.18814464958
  (0, 503)	0.281402371982
  (0, 86)	0.265935660043
  (0, 115)	0.211548746862
  (0, 1409)	0.30320147811
  (0, 323)	0.30320147811
  (0, 1532)	0.253938746228
  (0, 838)	0.30320147811
  (0, 1298)	0.265935660043


In [34]:
cosine_similarity(tfidf[0], tfidf[5])

array([[ 0.03542598]])

# Sorting a dictionary based on its values

In [35]:
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=operator.itemgetter(1),reverse=True)
print (sorted_x[0][1])

4


# Using cosine similarity implement the KNN algorithm

In [40]:
def set_top(cosine,category,tops,k):
    keys = tops.keys()
    
    if len(keys)<k:
        tops[cosine] = category
    else:
        for key in keys:
            if cosine>key:
                tops.pop(key)
                tops[cosine] = category
            break;
    return tops

def get_category(neighbors):
    count={}
    for category in neighbors:
        if (category in count.keys()):
            count[category] += 1
        else:
            count[category] = 1
    
    sorted_count = sorted(count.items(), key=operator.itemgetter(1),reverse=True)
    print (sorted_count)
    return sorted_count[0][0]

def knn (training,testing, tfidf, k):
    for doc in testing:
        keys = training.keys()
        index_unknw = list(keys).index(doc)
        neighbors = {}
        for key in keys:
            if (not training[key].startswith('NONE')):
                index = list(keys).index(key)
                cosine = cosine_similarity(tfidf[index_unknw:index_unknw+1], tfidf[index:index+1])[0][0]
                neighbors = set_top(cosine,training[key],neighbors,k)
                
        for key in neighbors:
            print ('%s [%s]' % (key,neighbors[key]))
        print ('prediction = %s' % get_category(neighbors.values()))
        print ('category   = %s' % training[doc][5:])
        print ('\n')
        
knn(training,testing,tfidf,3)

0.10278515778 [SPORTS]
0.192238017693 [SPORTS]
0.115250344163 [SPORTS]
[('SPORTS', 3)]
prediction = SPORTS
category   = SPORTS 


0.0885157152753 [ARTS]
0.0988206600417 [SPORTS]
0.0822300749936 [SPORTS]
[('SPORTS', 2), ('ARTS', 1)]
prediction = SPORTS
category   = SPORTS 


0.0830415831227 [ARTS]
0.0706610457625 [SPORTS]
0.05103333634 [SPORTS]
[('SPORTS', 2), ('ARTS', 1)]
prediction = SPORTS
category   = SPORTS 


0.220457093854 [SPORTS]
0.151390567347 [SPORTS]
0.0919252018526 [SPORTS]
[('SPORTS', 3)]
prediction = SPORTS
category   = SPORTS 


0.0625619433805 [SPORTS]
0.0613347325872 [SPORTS]
0.080447248163 [SPORTS]
[('SPORTS', 3)]
prediction = SPORTS
category   = SPORTS 


0.0612025765068 [ARTS]
0.134909892245 [SPORTS]
0.0934321555493 [SPORTS]
[('SPORTS', 2), ('ARTS', 1)]
prediction = SPORTS
category   = ARTS 


0.0939350844199 [ARTS]
0.059172856273 [ARTS]
0.0691552934454 [ARTS]
[('ARTS', 3)]
prediction = ARTS
category   = ARTS 


0.0840197822632 [ARTS]
0.050857599278 [ARTS]
0.0806253