# Feature Engineering

This notebook prepares data for model training.

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
cd drive/MyDrive/ML_Trending_Topics/

[Errno 2] No such file or directory: 'drive/MyDrive/ML_Trending_Topics/'
/content/drive/MyDrive/ML_Trending_Topics


In [58]:
pip install mlxtend


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [59]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from mlxtend.preprocessing import DenseTransformer
from sklearn.cluster import KMeans
from tqdm import tqdm

## Load Data

In [60]:
# load pre-processed data
df = pd.read_csv('extracted_files/data_cleaned.csv')
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2021,2021,566.928668,January,"Motion segmentation,Computer vision,Transmissi...",Many real-world video sequences cannot be conv...,Many real world video sequence cannot convenie...,1594,"['Motion segmentation', 'Computer vision', 'Tr...",11,1
1,2021,2021,566.928668,January,"Generative adversarial networks,Generators,Gal...",Generative adversarial networks (GAN) are trai...,Generative adversarial network GAN trained m...,955,"['Generative adversarial networks', 'Generator...",11,1
2,2021,2021,566.928668,January,"Convolution,Task analysis,Image resolution,Acc...",Many different deep networks have been used to...,Many different deep network used approximate ...,1393,"['Convolution', 'Task analysis', 'Image resolu...",11,1
3,2021,2021,566.928668,January,"Ellipsoids,Shape,Rendering ,computer graphics,...","This paper presents a precise, stable, and inv...",This paper present precise stable invertible...,914,"['Ellipsoids', 'Shape', 'Rendering ', 'compute...",12,1
4,2021,2021,566.928668,January,"Bayes methods,Principal component analysis,Ada...",Robust tensor factorization is a fundamental p...,Robust tensor factorization fundamental proble...,1300,"['Bayes methods', 'Principal component analysi...",11,1


### Improved - Keyword Cleaning

In [61]:
keywords = (df.Keywords_Cleaned).tolist()

In [62]:
keywords_cleaned = []
a = []
for i in range(0,len(keywords)):
  a = []
  for j in range(0,len(keywords[i].split(','))):
    a.append(keywords[i].strip('[]').split(',')[j].strip(" '"))
  keywords_cleaned.append(a)

In [63]:
keywords_cleaned[1]

['Generative adversarial networks',
 'Generators',
 'Gallium nitride',
 'Logistics',
 'Training',
 'Image generation',
 'Random variables',
 'Generative adversarial models',
 'functional gradient learning',
 'neural networks',
 'image generation']

In [64]:
# temp = []
# for i in range(0,len(keywords_cleaned)):
#   a = []
#   for j in range(0,len(keywords_cleaned[i])):
#     a = a + keywords_cleaned[i][j].split()
#   temp.append(a)

In [65]:
# keywords_cleaned = temp

In [66]:
# keywords_cleaned[1]

### Improved - Abstract Tokenization

In [67]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [68]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [69]:
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
abstracts = (df.Abstract_Cleaned).tolist()

In [71]:
# abs_cleaned = []
# for i in abstracts:
#   a = word_tokenize(i)
#   b = [word for word in a if not word in stopwords.words()]
#   abs_cleaned.append(' '.join(b))

In [72]:
abstracts = [sentence.split() for sentence in abstracts]

In [73]:
len(abstracts)

4604

In [74]:
pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


##Build abstract vector using BERT

1. Abstract embedding

In [75]:
'''from re import IGNORECASE
# BERT embedding
from sentence_transformers import SentenceTransformer

try :
  model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
  doc_embedding = model.encode(abstracts)
except:
  IGNORECASE'''

"from re import IGNORECASE\n# BERT embedding\nfrom sentence_transformers import SentenceTransformer\n\ntry :\n  model = SentenceTransformer('paraphrase-MiniLM-L6-v2')\n  doc_embedding = model.encode(abstracts)\nexcept:\n  IGNORECASE"

In [76]:
#doc_embedding

In [77]:
#doc_embedding.shape

In [78]:
'''# build a data pipeline
pipeline = Pipeline([('svd', TruncatedSVD(n_components=10, random_state=42)),     # components reduction
                     ('to_dense', DenseTransformer())])                           # data transform

# build data vector representation of abstracts
x_vector = pipeline.fit_transform(doc_embedding)'''

"# build a data pipeline\npipeline = Pipeline([('svd', TruncatedSVD(n_components=10, random_state=42)),     # components reduction\n                     ('to_dense', DenseTransformer())])                           # data transform\n\n# build data vector representation of abstracts\nx_vector = pipeline.fit_transform(doc_embedding)"

In [79]:
#x_vector.shape

In [80]:
'''# normalize x_vector
x_vector = Normalizer().fit_transform(x_vector)
df_x_vector = pd.DataFrame(x_vector, index=None)
df_x_vector = df_x_vector / df_x_vector.std()
x_vector = df_x_vector.to_numpy()

# display x_vector
x_vector'''

'# normalize x_vector\nx_vector = Normalizer().fit_transform(x_vector)\ndf_x_vector = pd.DataFrame(x_vector, index=None)\ndf_x_vector = df_x_vector / df_x_vector.std()\nx_vector = df_x_vector.to_numpy()\n\n# display x_vector\nx_vector'

In [81]:
#x_vector.shape

In [82]:
'''# save x_vector for model development
with open('x_vector.npy', 'wb') as file:
    np.save(file, x_vector)'''

"# save x_vector for model development\nwith open('x_vector.npy', 'wb') as file:\n    np.save(file, x_vector)"

2. Combined Embedding

In [83]:
tokenized = []

In [84]:
for i in range(0,len(abstracts)):
  tokenized.append(list(set(abstracts[i] + keywords_cleaned[i])))
# ----------------------------------------------
# for i in range(0,len(keywords_cleaned)):
#   tokenized.append(list(set(keywords_cleaned[i])))

In [85]:
tokenized = [[x for x in lst if x] for lst in tokenized]

In [86]:
tokenized

[['containing',
  'perform',
  'multiview learning',
  'conveniently',
  'motion segmentation',
  'confronted',
  'referred',
  'degenerate',
  'framework',
  'difficulty',
  'independently',
  'problem',
  'achieving',
  'them',
  'improved',
  'essentially',
  'data',
  'also',
  'information',
  'object',
  'forward',
  'dataset',
  'estimated',
  'adapted',
  'discus',
  'potential',
  'world',
  'together',
  'real',
  'KITTI',
  'multiple',
  'tasks',
  'For',
  'criterion',
  'of',
  'testing',
  'challenging',
  'set',
  'often',
  'model selection',
  'Threedimensional displays',
  'forth',
  'research',
  'homography',
  'would',
  'propose',
  'work',
  'Adaptation models',
  'In',
  'categorized',
  'paper',
  'cases',
  'number',
  'realistic',
  'We',
  'model',
  'several',
  'false',
  'the',
  'complexity',
  'realized',
  'considerations',
  'suffers',
  'harness',
  'effect',
  'unknown',
  'still',
  'state',
  'put',
  'combine',
  'benchmark',
  'performance',
  '

In [87]:
from re import IGNORECASE
# BERT embedding
from sentence_transformers import SentenceTransformer


model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
tok_embedding = model.encode(tokenized)

In [88]:
tok_embedding

array([[ 0.11513356,  0.46989807,  0.14632007, ..., -0.16158095,
         1.1907963 , -0.00831211],
       [ 0.08166245,  0.57499087, -0.07233939, ...,  0.07028683,
         0.43576398, -0.1019099 ],
       [-0.13500656,  0.45277184,  0.17979583, ..., -0.39921564,
        -0.13462426,  0.37231073],
       ...,
       [-0.20381372, -0.40962538, -0.10468902, ..., -0.13807276,
         0.36705306, -0.5386535 ],
       [ 0.3611882 ,  0.38009053, -0.53395313, ...,  0.44646454,
         0.21628161,  0.3942225 ],
       [ 0.18488294, -0.2518096 ,  0.02634307, ...,  0.13250493,
         0.058273  ,  0.18340962]], dtype=float32)

In [89]:
# build a data pipeline
pipeline = Pipeline([('svd', TruncatedSVD(n_components=10, random_state=42)),     # components reduction
                     ('to_dense', DenseTransformer())])                           # data transform

# build data vector representation of abstracts
x_vector = pipeline.fit_transform(tok_embedding)

In [90]:
x_vector.shape

(4604, 10)

In [91]:
# normalize x_vector
x_vector = Normalizer().fit_transform(x_vector)
df_x_vector = pd.DataFrame(x_vector, index=None)
df_x_vector = df_x_vector / df_x_vector.std()
x_vector = df_x_vector.to_numpy()

# display x_vector
x_vector

array([[ 5.7877073 , -1.2808461 ,  0.30172443, ...,  1.2249545 ,
         0.8514291 , -1.0591381 ],
       [ 3.9428904 ,  0.06019333, -0.6484549 , ...,  1.7985374 ,
        -0.18234675,  0.37881103],
       [ 5.0641937 , -0.28964907, -0.2586722 , ..., -0.41435927,
        -0.5494748 ,  0.81200343],
       ...,
       [ 4.318283  ,  0.44188514, -1.1331853 , ..., -0.6678869 ,
         2.1000936 ,  0.01697124],
       [ 3.1724648 , -0.45694104,  1.5345641 , ...,  1.0362966 ,
        -2.5506935 ,  0.00665936],
       [ 3.4736555 ,  1.1121761 ,  0.8411666 , ...,  0.6281944 ,
        -0.2770136 ,  0.05307548]], dtype=float32)

In [92]:
x_vector.shape

(4604, 10)

In [93]:
# save x_vector for model development
with open('extracted_files/x_vector.npy', 'wb') as file:
    np.save(file, x_vector)

## Built sparse matrix

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse
import json

# create CountVectorizer object
vectorizer = CountVectorizer()

# check if Abstract_Cleaned is a list or iterable of strings
if not all(isinstance(text, str) for text in df.Keywords_Cleaned):
    raise TypeError("Keywords_Cleaned should be a list of strings")

# fit the vectorizer on the text data
vectorizer.fit(df.Keywords_Cleaned)

# build terms matrix for entire corpus
terms_sparse_matrix = vectorizer.transform(df.Keywords_Cleaned)

# save terms sparse matrix
scipy.sparse.save_npz('extracted_files/terms_sparse_matrix.npz', terms_sparse_matrix)

# get term label for each item in the term matrix and save result to a text file
terms_label = vectorizer.get_feature_names_out()

# save term labels
with open("extracted_files/terms_label.txt", "w") as fp:
    json.dump(list(terms_label), fp)

In [95]:
terms_sparse_matrix

<4604x14669 sparse matrix of type '<class 'numpy.int64'>'
	with 70455 stored elements in Compressed Sparse Row format>

In [96]:
type(terms_sparse_matrix)

scipy.sparse._csr.csr_matrix

In [97]:
terms_label

array(['197942', '1class', '1d', ..., 'ℓ1regularization', 'ℓ1regularized',
       'ℓnorm'], dtype=object)

In [98]:
len(terms_label)

14669