In [62]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import util
from nltk.corpus import stopwords
import pandas as pd



In [63]:
def create_stem_cache(cleaned_description):
    tokenized_description = cleaned_description.apply(lambda s: word_tokenize(s))
    concated = np.unique(np.concatenate([s for s in tokenized_description.values]))
    stem_cache = {}
    ps = PorterStemmer()
    for s in concated:
        stem_cache[s] = ps.stem(s)
    return stem_cache

In [64]:
def create_custom_preprocessor(stop_dict, stem_cache):
    def custom_preprocessor(s) :
        ps = PorterStemmer()
        s = re.sub(r'[^A-Za-z]', ' ', s)
        s = re.sub(r'\s+', ' ' , s)
        s = word_tokenize(s)
        s = list(set(s) - stop_dict)
        s = [word for word in s if len(word) > 2]
        s = [stem_cache[w] if w in stem_cache else ps.stem(w) for w in s]
        s = ' '.join(s)
        return s
    return custom_preprocessor

In [65]:
def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_processor)
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    print(len(vectorizer.get_feature_names_out()))
    print(vectorizer.inverse_transform(query))

In [66]:
cleaned_description = util.get_and_clean_data()
stem_cache = create_stem_cache(cleaned_description)
stop_dict = set(stopwords.words('English'))
# Python, Java, Simpler. Ordering is not included 


In [67]:
sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict,stem_cache)


  (0, 13947)	1
  (0, 21383)	1
  (0, 24234)	1
  (1, 13947)	1
  (1, 21383)	1
  (1, 24234)	1
30513
[array(['java', 'python', 'simpler'], dtype='<U124'), array(['java', 'python', 'simpler'], dtype='<U124')]


# Ngram Technique

In [68]:
my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)

In [69]:
unigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(1,1))
unigram_vectorizer.fit(cleaned_description)
print(len(unigram_vectorizer.get_feature_names_out()))

30513


In [70]:
bigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(2,2))
bigram_vectorizer.fit(cleaned_description)
print(len(bigram_vectorizer.get_feature_names_out()))

304950


In [71]:
trigram_vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(3,3))
trigram_vectorizer.fit(cleaned_description)
print(len(trigram_vectorizer.get_feature_names_out()))

731890


In [72]:
# DF displays the frequency the token appear
X = unigram_vectorizer.transform(cleaned_description)
X_df = pd.DataFrame(X.toarray(), columns= unigram_vectorizer.get_feature_names_out())
X_df[X_df.sum().sort_values()[-10:].index].iloc[:5]

Unnamed: 0,experi,system,technolog,team,test,design,requir,work,applic,develop
0,1,0,1,1,3,2,1,1,2,3
1,1,0,2,1,0,1,1,1,1,2
2,1,1,0,1,2,0,1,2,2,2
3,1,1,0,2,1,0,0,1,0,2
4,1,0,2,1,0,1,1,1,1,2


In [73]:
def bigram_sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    my_custom_processor = create_custom_preprocessor(stop_dict, stem_cache)
    vectorizer = CountVectorizer(preprocessor=my_custom_processor, ngram_range=(1,2))
    vectorizer.fit(cleaned_description)
    query = vectorizer.transform(texts)
    print(query)
    # print(vectorizer.inverse_transform(query))
    return vectorizer.inverse_transform(query)

Y = bigram_sk_vectorize(['python is simpler than java', 'java is simpler than python'], cleaned_description, stop_dict,stem_cache)
# Y_df[Y_df.sum().sort_values()[-10:].index].iloc[:5]

  (0, 158640)	1
  (0, 239528)	1
  (0, 273730)	1
  (1, 158640)	1
  (1, 239528)	1
  (1, 273730)	1


In [74]:
# Magic! Ordering still be problem but we can count the nearby token by using ngram technique
Y_df = pd.DataFrame(Y)
Y_df


Unnamed: 0,0,1,2
0,java,python,simpler
1,java,python,simpler


# tf technique

In [75]:
import scipy.sparse as sparse
def log10_1p(x: np.matrix) -> np.matrix:
   return np.log1p(x) / np.log(10)

X = unigram_vectorizer.transform(cleaned_description)
N = len(cleaned_description)

# Weight of word 
df = X.sum(axis=0)
idf = log10_1p((N / df))
tf = log10_1p(X)

X = tf.multiply(idf)

# print(X.toarray())

X_df = pd.DataFrame(X.toarray(), columns=unigram_vectorizer.get_feature_names_out())
max_term = X_df.sum().sort_values()[-20:].sort_index().index
X_df[max_term]

Unnamed: 0,applic,code,design,experi,includ,job,requir,respons,servic,skill,softwar,solut,support,system,team,technolog,test,use,work,year
0,0.100111,0.163237,0.125068,0.090182,0.098447,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.088475,0.172229,0.147954,0.063584,0.154111
1,0.063163,0.000000,0.078909,0.090182,0.000000,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.140229,0.000000,0.000000,0.063584,0.097233
2,0.100111,0.163237,0.000000,0.090182,0.000000,0.122499,0.068209,0.000000,0.000000,0.105038,0.09451,0.109941,0.000000,0.089253,0.087032,0.000000,0.136488,0.000000,0.100779,0.097233
3,0.000000,0.163237,0.000000,0.090182,0.000000,0.122499,0.000000,0.000000,0.000000,0.000000,0.09451,0.109941,0.101059,0.089253,0.137942,0.000000,0.086114,0.000000,0.063584,0.000000
4,0.063163,0.000000,0.078909,0.090182,0.000000,0.122499,0.068209,0.094237,0.105238,0.105038,0.09451,0.109941,0.000000,0.000000,0.087032,0.140229,0.000000,0.000000,0.063584,0.097233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7578,0.100111,0.163237,0.078909,0.090182,0.196894,0.000000,0.108109,0.149363,0.105238,0.105038,0.09451,0.109941,0.160174,0.141463,0.087032,0.140229,0.136488,0.000000,0.100779,0.097233
7579,0.126326,0.102991,0.125068,0.000000,0.098447,0.122499,0.108109,0.094237,0.105238,0.105038,0.09451,0.174253,0.202117,0.141463,0.087032,0.140229,0.086114,0.000000,0.063584,0.097233
7580,0.063163,0.000000,0.000000,0.090182,0.098447,0.122499,0.068209,0.000000,0.105238,0.105038,0.09451,0.000000,0.101059,0.089253,0.087032,0.088475,0.086114,0.000000,0.000000,0.097233
7581,0.063163,0.102991,0.125068,0.090182,0.098447,0.122499,0.108109,0.094237,0.105238,0.000000,0.09451,0.109941,0.101059,0.089253,0.137942,0.140229,0.000000,0.000000,0.100779,0.000000


In [76]:


X = bigram_vectorizer.transform(cleaned_description)
N = len(cleaned_description)

# Weight of word 
df = X.sum(axis=0)
idf = log10_1p((N / df))
tf = log10_1p(X)

X = tf.multiply(idf)

In [77]:
X_df = pd.DataFrame.sparse.from_spmatrix(X, columns=bigram_vectorizer.get_feature_names_out())
max_term = X_df.sum().sort_values()[-20:].index
# print(max_term)
X_df[max_term].sort_values(max_term[0], ascending=False)[:20]

Unnamed: 0,inform nation,bachelor softwar,applic standard,understand technolog,receiv origin,sql well,comput religion,excel applic,opportun applic,develop compani,prefer design,team qualif,experi equal,orient develop,develop year,develop test,respons develop,softwar work,abil includ,system system
4832,0.288971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252367,0.0,0.0,0.0,0.0,0.239765,0.0,0.0,0.0,0.0,0.204894,0.202094
6831,0.288971,0.0,0.0,0.276092,0.273807,0.268665,0.263382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2689,0.288971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252367,0.0,0.0,0.0,0.243494,0.239765,0.0,0.0,0.0,0.205726,0.0,0.202094
856,0.288971,0.0,0.0,0.0,0.273807,0.0,0.263382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209029,0.0,0.0,0.202094
6824,0.288971,0.0,0.0,0.0,0.0,0.268665,0.263382,0.0,0.252367,0.0,0.0,0.0,0.0,0.239765,0.222942,0.215742,0.0,0.0,0.204894,0.202094
4633,0.288971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215742,0.0,0.0,0.0,0.202094
4632,0.288971,0.0,0.0,0.0,0.273807,0.268665,0.0,0.257081,0.0,0.0,0.0,0.0,0.0,0.239765,0.0,0.0,0.0,0.0,0.204894,0.0
5241,0.288971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.252367,0.0,0.248369,0.246226,0.243494,0.239765,0.0,0.215742,0.0,0.0,0.0,0.0
3377,0.288971,0.285418,0.0,0.0,0.0,0.0,0.263382,0.257081,0.252367,0.0,0.0,0.246226,0.243494,0.0,0.0,0.0,0.0,0.205726,0.0,0.0
4637,0.288971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.243494,0.239765,0.0,0.0,0.209029,0.0,0.204894,0.0


## Sum-up
Bigram has more data than Unigram