## Idea of the Repository: (Work in Progress)

The idea of the blog/repository is to:

1) Explore PCA for text visualizations -- words, sentences, documents.

2) Understand how using PCA helps improve text classification.

3) Explore Spotify's Annoy library (approximate nearest-neighbour search) and test whether it improves text classification.

4) (Optional) Explore t-SNE (via scikit-learn's `TSNE`) for visualization and insights -- words, sentences, documents, text classification outputs.

## Importing Required Libraries

In [1]:
import os
import re
import math
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk import FreqDist
from nltk.util import ngrams
from functools import reduce
from collections import Counter
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, sent_tokenize
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

## Exploring the Data

In [15]:
from sklearn.datasets import fetch_20newsgroups

In [16]:
# Load both splits of the 20 Newsgroups corpus (fetched over the network on first use).
train, test = (fetch_20newsgroups(subset=split) for split in ("train", "test"))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [27]:
# List the newsgroup category names available in the training split.
train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [36]:
# Separate the raw post texts (features) from the integer class labels for each split.
X_train, y_train = train.data, train.target
X_test, y_test = test.data, test.target

In [37]:
# Wrap the raw training posts in a single-column DataFrame for easier inspection.
df_train = pd.DataFrame({'message': X_train})

In [38]:
# Attach the class label of every post as a `target` column.
df_train = df_train.assign(target=y_train)

In [39]:
# Preview the first rows: raw message text next to its integer target label.
df_train.head()

Unnamed: 0,message,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [41]:
# Build the test frame in one step: raw post text plus its class label.
df_test = pd.DataFrame({'message': X_test, 'target': y_test})

In [50]:
# Show the complete raw text of the training post at position 1 (header lines followed by the body).
df_train['message'].iloc[1]

"From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 11\nNNTP-Posting-Host: carson.u.washington.edu\n\nA fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks.\n\nGuy Kuo <guykuo@u.washington.edu>\n"

In [74]:
def clean_text(text):
    """Lowercase ``text`` and blank out everything that is not plain alphanumeric.

    Two kinds of spans are each replaced by a single space:

    * an ``@`` followed by a run of ASCII letters/digits (the run is removed
      together with the ``@``);
    * any single character that is not an ASCII letter, a digit, a space or a
      tab (this includes punctuation and newlines).

    Consecutive replacements are not collapsed, so the result may contain runs
    of several spaces.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lowercased text with the spans above replaced by spaces.
    """
    noise = r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])'
    lowered = text.lower()
    return re.sub(noise, " ", lowered)

In [77]:
# Normalise every raw post with clean_text and keep the result beside the original.
df_train['cleaned_message'] = df_train['message'].apply(clean_text)

In [79]:
# Preview the frame again to compare `cleaned_message` with the raw `message` text.
df_train.head()

Unnamed: 0,message,target,cleaned_message
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,from lerxst umd edu where s my thing subje...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,from guykuo u washington edu guy kuo subje...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,from twillis ecn purdue edu thomas e willis...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,from jgreen joe green subject re weitek ...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,from jcm cfa harvard edu jonathan mcdowell ...


In [80]:
# Character count of each cleaned post (includes the spaces left behind by cleaning).
df_train['len'] = df_train['cleaned_message'].str.len()

In [81]:
# Preview the frame once more to check the character counts in the new `len` column.
df_train.head()

Unnamed: 0,message,target,cleaned_message,len
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,from lerxst umd edu where s my thing subje...,718
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,from guykuo u washington edu guy kuo subje...,851
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,from twillis ecn purdue edu thomas e willis...,1976
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,from jgreen joe green subject re weitek ...,790
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,from jcm cfa harvard edu jonathan mcdowell ...,1098


In [None]:
# vectorizer = CountVectorizer(ngram_range=(2,2))

# file=open('data/test.ft.txt','r',encoding="utf-8")

# g=file.read()

# sentences=sent_tokenize(g)
# # 
# wn=WordNetLemmatizer()

In [None]:

# def preprocess_text(text):
#     tokens = word_tokenize(text)
#     stop_words=stopwords.words('english')
#     stop_words.extend(['__label__1','__label__2'])
#     tokens=[token.lower() for token in tokens if token not in stop_words]
#     tokens=[re.sub(r'[^A-Za-z]+','',token) for token in tokens]
#     tokens=[wn.lemmatize(token) for token in tokens]
#     return tokens  


# text_tokens=[]
# for item in sentences[0:1000]:
#     tokens = preprocess_text(item)
#     temp = " ".join(tokens)
#     text_tokens.append(temp)    



# word_dist = FreqDist()
# for s in text_tokens:
#     word_dist.update(s.split())

# ########################################################################################



# text=''
# for sent in text_tokens:
#     text=text+sent

# tokens=word_tokenize(text)    
# bigrams = ngrams(tokens,2)

# bigram_dict=dict(Counter(bigrams))

# final_bigram_dict={}
# for key,value in bigram_dict.items():
#     new_key=" ".join(key)
#     final_bigram_dict[new_key]=value

# unigram_index= CountVectorizer(ngram_range=(1,1))
# unigram_index.fit_transform(text_tokens)
# unigram_dist = unigram_index.vocabulary_


# def pmi(word1, word2 ,unigram_freq, bigram_freq):
#     #print(word1,word2)
#     prob_word1 = unigram_freq[word1]/float(sum(unigram_freq.values()))
#     #print(prob_word1)
#     prob_word2 = unigram_freq[word2]/float(sum(unigram_freq.values()))
#     #print(prob_word2)
#     prob_word1_word2 = bigram_freq[" ".join([word1,word2])]/float(sum(bigram_freq.values()))
#     #print(prob_word1_word2)
#     ratio = prob_word1_word2/float(prob_word1*prob_word2)
#     #print(word1,word2,prob_word1,prob_word2)
#     if ratio==0:
#         return 0
#     else:
#         return math.log(ratio,2)

# pmi_dict={}
# for key in final_bigram_dict.keys():
#     first_word = key.split()[0]
#     second_word = key.split()[1]
#     if (first_word in word_dist.keys()) and (second_word in word_dist.keys()):
#         pmi_dict[key]=pmi(key.split()[0],key.split()[1],word_dist,final_bigram_dict)
#     else:
#         pmi_dict[key]=0


# start = '\s'
# end= '\e'

    
# context_word_pairs={}
# for story_id in range(0,len(text_tokens)):
#     text_tokens[story_id] = start +' '+ text_tokens[story_id] +' '+ end
#     list_of_words=text_tokens[story_id].split()
#     context_word_pairs[story_id] = {}
#     for word_index in range(1,len(list_of_words)-1):
#         context_word_pairs[story_id][list_of_words[word_index]]=[list_of_words[word_index-1],list_of_words[word_index+1]]
# #    
# #tokens=[]
# #for s in text_tokens:
# #    tokens.extend(word_tokenize(s))
# #
# #unique_words=set(tokens)
# #
# list_cw_pairs=[]
# for i in range(0,len(context_word_pairs)):
#     list_cw_pairs.append(context_word_pairs[i])
# #
# #for word in unique_words:
# #    print(context_word_pairs.get(word))
    


# def foo(r, d):
#     for k in d:
#         r[k].append(d[k])
    
# d = reduce(lambda r, d: foo(r, d) or r, list_cw_pairs, defaultdict(list))    
    
    
# final_dict={}
# for k,v in d.items():
#     tmp_list_before=[]
#     tmp_list_after=[]
#     for x in range(0,len(v)):
#         tmp_list_before.append(v[x][0])
#         tmp_list_after.append(v[x][1])
#     final_dict[k]=[tmp_list_before,tmp_list_after]
    
# def create_vectors(word,context):
#     #print(word)
#     #print(context)
#     vector = np.zeros((len(word_dist.keys()),)) 
#     for x in range(0,len(context[0])):
#         temp_word_1 = context[0][x]+" " + word
#         if temp_word_1 in pmi_dict.keys():
#             vector[word_dist[context[0][x]]] = pmi_dict[temp_word_1]
#     for y in range(0,len(context[1])):
#         temp_word_2 = word + " " + context[1][y]
#         if temp_word_2 in pmi_dict.keys():
#             vector[word_dist[context[1][y]]] = pmi_dict[temp_word_2]
#     return vector

# word_vectors=[]
# word_list=[]
# for w,v in final_dict.items():
#     if w!="":
#         word_vectors.append(create_vectors(w,v))
#         word_list.append(w)
 
 
        
        


# pca=PCA(n_components=300)

# principalComponents=pca.fit_transform(word_vectors)

# '''
# new_pca = PCA(n_components = 2)
# n=new_pca.fit_transform(principalComponents)
# plt.scatter(n[:,0],n[:,1])

# for i,word in enumerate(word_list[0:300]):
#     plt.annotate(word,xy=(n[i,0],n[i,1]))
# plt.show()

# '''
# labels = []
# tokens = []

# for word in range(0,len(word_list)):
#     tokens.append(word_vectors[word])
#     labels.append(word_list[word])

# tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
# new_values = tsne_model.fit_transform(tokens)

# x = []
# y = []
# for value in new_values:
#     x.append(value[0])
#     y.append(value[1])
    
# plt.figure(figsize=(16, 16)) 
# for i in range(len(x)):
#     plt.scatter(x[i],y[i])
#     plt.annotate(labels[i],
#                  xy=(x[i], y[i]),
#                  xytext=(5, 2),
#                  textcoords='offset points',
#                  ha='right',
#                  va='bottom')
# plt.show()
# '''
# from annoy import AnnoyIndex

# num=300
# t = AnnoyIndex(num)

# for i in range(0,len(principalComponents)):
#     t.add_item(i,principalComponents[i])
    
# t.build(10)
    
# print(word_list[0])
# #print(t.get_nns_by_item(0,5))
# for i in t.get_nns_by_item(0,5):
#     print(word_list[i])
# #print(t.get_distance(1,4))

# #print(t.get_distance(18,24))
