In [8]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 
import util
from nltk.corpus import stopwords
import pandas as pd



In [9]:
def create_stem_cache(cleaned_description):
    tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))
    concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
    stem_cache = {}
    ps = PorterStemmer()
    for s in concated:
        stem_cache[s] = ps.stem(s)
    return stem_cache

In [10]:
def create_custom_preprocessor(stop_dict, stem_cache):
    def custom_preprocessor(s) :
        ps = PorterStemmer()
        s = re.sub(r'[^A-Za-z]', ' ', s)
        s = re.sub(r'\s+', ' ' , s)
        s = word_tokenize(s)
        s = list(set(s) - stop_dict)
        s = [word for word in s if len(word) > 2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = ' '.join(s)
        return s
    return custom_preprocessor

In [11]:
def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_processor)
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    # print(query)
    print(len(vectorizer.get_feature_names_out()))
    print(vectorizer.inverse_transform(query))
    return vectorizer.inverse_transform(query)

In [12]:
cleaned_description = util.get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words('English'))
# Python, Java, Simpler. Ordering is not included 


In [13]:
sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict,stem_cache)


30513
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


[array(['java', 'python', 'simpler'], dtype='<U124'),
 array(['java', 'python', 'simpler'], dtype='<U124')]

# Ngram Technique

In [14]:
my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)

In [15]:
unigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(1,1))
unigram_vectorizer.fit(cleaned_description)
print(len(unigram_vectorizer.get_feature_names_out()))

30513


In [16]:
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(2,2))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))

303254


In [17]:
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(3,3))
trigram_vectorizer.fit(cleaned_description)
print(len(trigram_vectorizer.get_feature_names_out()))

731544


In [18]:
# DF displays the frequency the token appear
X = unigram_vectorizer.transform(cleaned_description)
X_df = pd.DataFrame(X.toarray(), columns= unigram_vectorizer.get_feature_names_out())
X_df[X_df.sum().sort_values()[-10:].index].iloc[:5]

Unnamed: 0,experi,system,technolog,team,test,design,requir,work,applic,develop
0,1,0,1,1,3,2,1,1,2,3
1,1,0,2,1,0,1,1,1,1,2
2,1,1,0,1,2,0,1,2,2,2
3,1,1,0,2,1,0,0,1,0,2
4,1,0,2,1,0,1,1,1,1,2


In [19]:
def bigram_sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(1,2))
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    # print(vectorizer.inverse_transform(query))
    return vectorizer.inverse_transform(query)

Y = bigram_sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict,stem_cache)
# Y_df[Y_df.sum().sort_values()[-10:].index].iloc[:5]

  (0, 158418)	1
  (0, 158607)	1
  (0, 237984)	1
  (0, 272230)	1
  (1, 158418)	1
  (1, 158607)	1
  (1, 237984)	1
  (1, 272230)	1


In [20]:
# Magic! Ordering still be problem but we can count the nearby token by using ngram technique
Y_df = pd.DataFrame(Y)
Y_df


Unnamed: 0,0,1,2,3
0,java,java python,python,simpler
1,java,java python,python,simpler


# tf technique

In [21]:
import scipy.sparse as sparse
def log10_1p(x: np.matrix) -> np.matrix:
   return np.log1p(x) / np.log(10)

X = unigram_vectorizer.transform(cleaned_description)
N = len(cleaned_description)

# Weight of word 
df = X.sum(axis=0)
idf = log10_1p((N / df))
tf = log10_1p(X)

X = tf.multiply(idf)

# print(X.toarray())

X_df = pd.DataFrame(X.toarray(), columns=unigram_vectorizer.get_feature_names_out())
max_term = X_df.sum().sort_values()[-20:].sort_index().index
X_df[max_term]

Unnamed: 0,applic,code,design,experi,includ,job,requir,respons,servic,skill,softwar,solut,support,system,team,technolog,test,use,work,year
0,0.100111,0.163237,0.125068,0.090182,0.098447,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.088475,0.172229,0.147954,0.063584,0.154111
1,0.063163,0.000000,0.078909,0.090182,0.000000,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.140229,0.000000,0.000000,0.063584,0.097233
2,0.100111,0.163237,0.000000,0.090182,0.000000,0.122499,0.068209,0.000000,0.000000,0.105038,0.09451,0.109941,0.000000,0.089253,0.087032,0.000000,0.136488,0.000000,0.100779,0.097233
3,0.000000,0.163237,0.000000,0.090182,0.000000,0.122499,0.000000,0.000000,0.000000,0.000000,0.09451,0.109941,0.101059,0.089253,0.137942,0.000000,0.086114,0.000000,0.063584,0.000000
4,0.063163,0.000000,0.078909,0.090182,0.000000,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.140229,0.000000,0.000000,0.063584,0.097233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7578,0.100111,0.163237,0.078909,0.090182,0.196894,0.000000,0.108109,0.149363,0.105238,0.105038,0.09451,0.109941,0.160174,0.141463,0.087032,0.140229,0.136488,0.000000,0.100779,0.097233
7579,0.126326,0.102991,0.125068,0.000000,0.098447,0.122499,0.108109,0.094237,0.105238,0.105038,0.09451,0.174253,0.202117,0.141463,0.087032,0.140229,0.086114,0.000000,0.063584,0.097233
7580,0.063163,0.000000,0.000000,0.090182,0.098447,0.122499,0.068209,0.000000,0.105238,0.105038,0.09451,0.000000,0.101059,0.089253,0.087032,0.088475,0.086114,0.000000,0.000000,0.097233
7581,0.063163,0.102991,0.125068,0.090182,0.098447,0.122499,0.108109,0.094237,0.105238,0.000000,0.09451,0.109941,0.101059,0.089253,0.137942,0.140229,0.000000,0.000000,0.100779,0.000000


In [22]:
X = bigram_vectorizer.transform(cleaned_description)
N = len(cleaned_description)

# Weight of word 
df = X.sum(axis=0)
idf = log10_1p((N / df))
tf = log10_1p(X)

X = tf.multiply(idf)

In [23]:
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=bigram_vectorizer.get_feature_names_out())
max_term = X_df.sum().sort_values()[-20:].index
# print(max_term)
X_df[max_term].sort_values(max_term[0], ascending=False)[:20]

Unnamed: 0,team year,relat appli,softwar abil,support candid,servic experi,databas system,inform knowledg,experi servic,orient protect,religion nation,develop solut,regard race,use requir,maintain softwar,applic team,softwar work,develop commun,abil work,design includ,experi work
1133,0.285538,0.0,0.0,0.0,0.0,0.0,0.0,0.265641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1242,0.285538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249881,0.248452,0.244689,0.0,0.0,0.232864,0.0,0.0,0.20941,0.205361,0.174504
1252,0.285538,0.0,0.0,0.283036,0.0,0.267545,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.225454,0.0,0.0,0.205361,0.174504
6478,0.285538,0.0,0.0,0.0,0.0,0.267545,0.0,0.0,0.0,0.0,0.0,0.0,0.238704,0.237656,0.0,0.225454,0.0,0.0,0.0,0.0
3057,0.285538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238704,0.0,0.0,0.0,0.0,0.0,0.0,0.174504
4885,0.285538,0.0,0.0,0.0,0.0,0.0,0.266538,0.265641,0.0,0.0,0.0,0.0,0.0,0.237656,0.0,0.0,0.0,0.0,0.0,0.0
4884,0.285538,0.285538,0.28422,0.0,0.0,0.267545,0.0,0.0,0.253327,0.0,0.248452,0.0,0.0,0.0,0.0,0.0,0.0,0.20941,0.205361,0.174504
6483,0.285538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.211675,0.20941,0.0,0.0
4883,0.285538,0.285538,0.28422,0.0,0.0,0.267545,0.0,0.0,0.253327,0.0,0.248452,0.0,0.0,0.0,0.0,0.0,0.0,0.20941,0.205361,0.174504
4881,0.285538,0.285538,0.28422,0.0,0.0,0.267545,0.0,0.0,0.253327,0.0,0.248452,0.0,0.0,0.0,0.0,0.0,0.0,0.20941,0.205361,0.174504
