In [None]:
# Importing libraries
from collections import Counter

import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Loading data: the labelled text corpus is read straight from its GitHub-hosted CSV
DATA_URL = 'https://raw.githubusercontent.com/myamullaciencia/open_into_datos/master/lsh_assignment_data.csv'
df_text_raw = pd.read_csv(DATA_URL)

In [None]:
# Preview the first five rows of the raw dataframe
df_text_raw.head(n=5)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [None]:
# Number of documents in each category
df_text_raw.loc[:, 'category'].value_counts()

sport            509
business         508
politics         415
tech             399
entertainment    384
Name: category, dtype: int64

In [None]:
# Hold out the last rows of the file as the test set; everything before them is training data
N_TEST_ROWS = 10
train_text_data = df_text_raw.iloc[:-N_TEST_ROWS]
test_text_data = df_text_raw.iloc[-N_TEST_ROWS:]

In [None]:
# Report the (rows, cols) shape of each split
split_shape_report = f'The training dataset contains row and cols as: {train_text_data.shape} \nand The testing dataset contains rows and cols as: {test_text_data.shape}'
print(split_shape_report)

The training dataset contains row and cols as: (2215, 2) 
and The testing dataset contains rows and cols as: (10, 2)


In [None]:
# TF-IDF over 2- and 3-grams, keeping at most 4000 features and dropping
# n-grams that appear in fewer than 10 documents
text_vectorizer = TfidfVectorizer(
    min_df=10,
    max_features=4000,
    ngram_range=(2, 3),
)

In [None]:
# Show the vectorizer configuration
vectorizer_report = f'The TFIDF Vectorizer as: \n{text_vectorizer}'
print(vectorizer_report)

The TFIDF Vectorizer as: 
TfidfVectorizer(max_features=4000, min_df=10, ngram_range=(2, 3))


In [None]:
# Fit the vectorizer on the training text only, so the test rows do not influence it
train_corpus = train_text_data['text']
my_text_model_fit = text_vectorizer.fit(train_corpus)

In [None]:
# Map both splits into the feature space of the fitted vectorizer (train first, then test)
my_text_features, my_text_features_test = (
    my_text_model_fit.transform(split['text'])
    for split in (train_text_data, test_text_data)
)

In [None]:
# Peek at the TF-IDF values. Only the first five rows are densified: calling
# .toarray() on the whole sparse matrix allocates rows x features floats just
# to print a truncated view of it.
print(f'TF-IDFs as: \n{my_text_features[:5].toarray()}')

In [None]:
def generate_hyperplanes(n,tot,seed=0):
    """Draw `n` random hyperplane normal vectors of dimension `tot`.

    Every component is sampled from the standard normal distribution N(0, 1).

    Parameters
    ----------
    n : int
        Number of hyperplanes (rows of the result).
    tot : int
        Dimension of each hyperplane (columns of the result).
    seed : int, optional
        Seed of the private random generator. The default, 0, yields the same
        values as drawing after ``np.random.seed(0)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, tot)``.
    """
    # A private RandomState keeps the draw reproducible without reseeding
    # NumPy's global generator as a side effect of calling this function.
    rng = np.random.RandomState(seed)
    return rng.normal(0, 1, (n, tot))

In [None]:
# Five random hyperplanes, one coordinate per TF-IDF feature column
hypers = generate_hyperplanes(n=5, tot=my_text_features.shape[1])

In [None]:
def wt_trans_x(sparse_mat,hyper_array):
    """Project every row of `sparse_mat` onto every hyperplane (W^T x).

    Parameters
    ----------
    sparse_mat : SciPy sparse matrix or 2-D numpy.ndarray, shape (n_rows, n_features)
        Feature rows to project.
    hyper_array : numpy.ndarray, shape (n_planes, n_features)
        One hyperplane normal vector per row.

    Returns
    -------
    list of list of float
        One inner list per input row, holding that row's dot product with each
        hyperplane, in the order of `hyper_array`'s rows.
    """
    # A single matrix product replaces the former row-by-row Python loop over
    # the sparse matrix and, unlike that loop, also works for dense 2-D arrays.
    projections = sparse_mat.dot(hyper_array.T)
    return np.asarray(projections).tolist()

In [None]:
def hash_key(vector):
    """Convert a sequence of numbers into a tuple of sign bits.

    Strictly positive entries map to 1; zero and negative entries map to 0.
    """
    return tuple(1 if component > 0 else 0 for component in vector)

In [None]:
def create_hash_key(vec):
    """Compute the hash key of a feature row.

    `vec` is projected onto the module-level `hypers` hyperplanes with
    `wt_trans_x`; the first projected row is then turned into a tuple of
    0/1 bits by `hash_key`.
    """
    projected_rows = wt_trans_x(vec, hypers)
    return hash_key(projected_rows[0])

In [None]:
def create_hash_table(arr):
    """Group the row positions of `arr` by their hash key.

    Parameters
    ----------
    arr : iterable of sequences of numbers
        One projected vector per row, e.g. the output of `wt_trans_x`.

    Returns
    -------
    dict
        Maps each key produced by `hash_key` to the list of row positions
        (in ascending order) whose vector hashes to that key.
    """
    my_hash_table = {}
    for idx, vec in enumerate(arr):
        # setdefault creates the bucket the first time a key is seen, so one
        # pass is enough and every vector is hashed exactly once.
        my_hash_table.setdefault(hash_key(vec), []).append(idx)
    return my_hash_table

In [None]:
# Project the training features onto the hyperplanes, then bucket the rows by hash key
x_train = wt_trans_x(sparse_mat=my_text_features, hyper_array=hypers)
x_train_hast_table = create_hash_table(arr=x_train)

In [None]:
def pred_nearest_neighbor_lsh_labels(train_data,train_features,test_features,x_hash_table,num_of_nbrs):
    """Predict a label for every test row by majority vote among its LSH neighbours.

    For each test row the hash key is computed with `create_hash_key`, the
    training rows stored under that key in `x_hash_table` are ranked by cosine
    similarity to the test row, and the most frequent label among the top
    `num_of_nbrs` of them is the prediction.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame; labels are read positionally from its first column.
    train_features : SciPy sparse matrix
        Training feature rows, aligned with `train_data` by row position.
    test_features : SciPy sparse matrix
        Feature rows to classify.
    x_hash_table : dict
        Maps hash keys to lists of training row positions
        (as built by `create_hash_table`).
    num_of_nbrs : int
        Number of nearest neighbours that take part in the vote.

    Returns
    -------
    list
        One predicted label per row of `test_features`, in order.

    Raises
    ------
    ValueError
        If a test row has no training rows to be compared with.
    """
    pred_labels = []

    for fet in test_features:
        bucket = x_hash_table.get(create_hash_key(fet))
        if bucket is not None and len(bucket) > 0:
            candidates = np.asarray(bucket)
        else:
            # No training row shares this hash key: fall back to an exact
            # search over all training rows instead of failing with KeyError.
            candidates = np.arange(train_features.shape[0])
        if candidates.size == 0:
            raise ValueError("no training rows available to search for neighbours")

        # The test vector and its norm are the same for every candidate.
        fet_col = fet.T
        fet_norm = norm(fet_col.toarray())

        cosine_similarities = []
        for nbr in candidates:
            train_row = train_features[nbr]
            denom = norm(train_row.toarray()) * fet_norm
            if denom > 0:
                cos_sim = train_row.dot(fet_col).toarray().item() / denom
            else:
                # An all-zero vector would give 0/0 = NaN, which the descending
                # argsort below would rank first; treat it as "no similarity".
                cos_sim = 0.0
            cosine_similarities.append(cos_sim)

        nearest = candidates[np.argsort(cosine_similarities)[::-1][:num_of_nbrs]]

        # Majority vote; on a tie the label seen first (most similar) wins.
        votes = Counter(list(train_data.iloc[nearest, 0]))
        pred_labels.append(max(votes, key=lambda label: votes[label]))

    return pred_labels

In [None]:
# Predict the test labels from the 11 most similar training rows in each test row's hash bucket
my_pred_labels = pred_nearest_neighbor_lsh_labels(
    train_data=train_text_data,
    train_features=my_text_features,
    test_features=my_text_features_test,
    x_hash_table=x_train_hast_table,
    num_of_nbrs=11,
)

In [None]:
# Display the predicted label of each test row
my_pred_labels

In [None]:
###########################################
## GRADER CELL: Do NOT Change this.
# This cell prints "Success" if the predictions stored in `my_pred_labels` reach an accuracy of at least 80%.
# Otherwise it prints "Failed" together with the reference and predicted labels.
###########################################


# custom array to store the predicted labels
Y_custom = np.array(my_pred_labels)

# Reference grader array - DO NOT MODIFY IT
Y_grader = np.array(['tech', 'entertainment', 'tech', 'sport', 'business', 'business', 'politics', 'entertainment', 'politics', 'sport'])

# Calculating accuracy by comparing Y_grader and Y_custom
# NOTE: the factor 10 turns the number of matches into a percentage only because
# the reference array holds exactly 10 labels (each match is worth 10%).
accuracy = np.sum(Y_grader==Y_custom) * 10

if accuracy >= 80:
  print("******** Success ********","Accuracy Achieved = ", accuracy,'%')
else:
  print("####### Failed #######","Accuracy Achieved = ", accuracy,'%')
  print("\nY_grader = \n\n", Y_grader)
  print("\n","*"*50)
  print("\nY_custom = \n\n", Y_custom)


******** Success ******** Accuracy Achieved =  90 %


In [None]:
# Side experiment: hash a toy corpus into a fixed 16-column feature space.
# HashingVectorizer is imported with the other libraries in the first cell.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)
print(X.shape)

(4, 16)


In [None]:
# Sparse printout of the hashed features of the second document (row index 1):
# each line shows a (row, column) position followed by the stored value
print(X[[1]])

  (0, 0)	-0.8164965809277261
  (0, 11)	0.4082482904638631
  (0, 13)	0.4082482904638631
  (0, 14)	0.0


In [None]:
# Print the hashed feature row of every document in turn
for doc_row in X:
    print(doc_row)