# Features Extraction

This notebook performs feature extraction on preprocessed journal data.

In [69]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
cd drive/MyDrive/ML_Trending_Topics/

[Errno 2] No such file or directory: 'drive/MyDrive/ML_Trending_Topics/'
/content/drive/MyDrive/ML_Trending_Topics


In [83]:
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [72]:
df = pd.read_csv('extracted_files/data_cleaned.csv')
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2021,2021,566.928668,January,"Motion segmentation,Computer vision,Transmissi...",Many real-world video sequences cannot be conv...,Many real world video sequence cannot convenie...,1594,"['Motion segmentation', 'Computer vision', 'Tr...",11,1
1,2021,2021,566.928668,January,"Generative adversarial networks,Generators,Gal...",Generative adversarial networks (GAN) are trai...,Generative adversarial network GAN trained m...,955,"['Generative adversarial networks', 'Generator...",11,1
2,2021,2021,566.928668,January,"Convolution,Task analysis,Image resolution,Acc...",Many different deep networks have been used to...,Many different deep network used approximate ...,1393,"['Convolution', 'Task analysis', 'Image resolu...",11,1
3,2021,2021,566.928668,January,"Ellipsoids,Shape,Rendering ,computer graphics,...","This paper presents a precise, stable, and inv...",This paper present precise stable invertible...,914,"['Ellipsoids', 'Shape', 'Rendering ', 'compute...",12,1
4,2021,2021,566.928668,January,"Bayes methods,Principal component analysis,Ada...",Robust tensor factorization is a fundamental p...,Robust tensor factorization fundamental proble...,1300,"['Bayes methods', 'Principal component analysi...",11,1


## Keywords

In [73]:
# Build keywords dataframe
years_list = list(df['Year'].unique())
keywords_list = []
kw_count = []
kw_uniques = []
kw_uniques_count = []

# iterate through each year
for year in years_list:
    # get all keywords for the selected year
    keywords = []
    years_kws_temp = df.loc[(df['Year'] == year) & (df['Number of Keywords'] > 0)]['Keywords_Cleaned']
    for kw in years_kws_temp:
        if len(kw) == 2:
            continue
            
        # kw's format: ["'keyword_1', ..., 'keywords_xx']
        # remove apostrophe, double quotation marks and the square bracket
        kw_str = kw.replace('\'', "")[1:-1].replace('“', '').replace('”', '').replace('"', '').strip()
        keywords.extend(kw_str.split(', '))
    
    # store all keywords for selected year
    keywords_str = ', '.join(keywords)
    keywords_list.append(keywords_str)
    kw_count.append(len(keywords))
    
    # get a list of unique keywords
    unique_kw = list(set(keywords))
    unique_kw = [kw.strip() for kw in unique_kw if kw.strip()]
    kw_uniques.append(', '.join(unique_kw))
    kw_uniques_count.append(len(unique_kw))

# create dataframe of keywords per year
years_keywords = pd.DataFrame(dict({'Year': years_list, 
                                    'Keywords': keywords_list,
                                    'Unique Keywords': kw_uniques,
                                    'Number of Keywords': kw_count,
                                    'Number of Unique Keywords': kw_uniques_count}))

# sort the dataframe by year
years_keywords.sort_values(by='Year', inplace=True)

# look at data
years_keywords['Unique Keywords'] = years_keywords['Unique Keywords'].str.replace(r'(?<=\w)-', ' - ').str.replace(';', ', ').str.replace(r'[^\w\s,]', '')
years_keywords['Unique Keywords'] = years_keywords['Unique Keywords'].str.replace(r',\s*,', ',').str.replace(r'[^\w\s,]', '')
years_keywords['Keywords'] = years_keywords['Keywords'].str.replace(r'(?<=\w)-', ' - ').str.replace(';', ', ').str.replace(r'[^\w\s,]', '')
years_keywords['Keywords'] = years_keywords['Keywords'].str.replace(r',\s*,', ',').str.replace(r'[^\w\s,]', '')
years_keywords

  years_keywords['Unique Keywords'] = years_keywords['Unique Keywords'].str.replace(r'(?<=\w)-', ' - ').str.replace(';', ', ').str.replace(r'[^\w\s,]', '')
  years_keywords['Unique Keywords'] = years_keywords['Unique Keywords'].str.replace(r',\s*,', ',').str.replace(r'[^\w\s,]', '')
  years_keywords['Keywords'] = years_keywords['Keywords'].str.replace(r'(?<=\w)-', ' - ').str.replace(';', ', ').str.replace(r'[^\w\s,]', '')
  years_keywords['Keywords'] = years_keywords['Keywords'].str.replace(r',\s*,', ',').str.replace(r'[^\w\s,]', '')


Unnamed: 0,Year,Keywords,Unique Keywords,Number of Keywords,Number of Unique Keywords
8,2010,"Vocabulary, Frequency, Iterative algorithms, C...","Gaussian processes, Implicit polynomials, Comp...",2727,1379
9,2011,"Image reconstruction, Shape, Engineering drawi...","point matching, Gaussian processes, Computer e...",2385,1243
10,2012,"Biometrics , access control, Lighting, Bayesia...","Color, Quantization, scene understanding, appe...",2234,1154
11,2013,"Image reconstruction, Three dimensional displa...","point matching, Active diagnosis, Gaussian pro...",2623,1338
12,2014,"Surface morphology, Level set, Threedimensiona...","Gaussian processes, single image camera calibr...",2523,1262
6,2015,"Fasteners, FCC, Probabilistic logic, Vectors, ...","Hierarchical Conditional Random Field, Gaussia...",3023,1511
7,2016,"Shape, Trajectory, Skeleton, Threedimensional ...","Gaussian processes, homeland security, Feature...",2882,1454
4,2017,"Clustering algorithms, Clustering methods, Sem...","Gaussian processes, pdf, Mathematical model, C...",2309,1280
5,2018,"Face, Face recognition, Training, Neural netwo...",CamerasThreedimensional displaysRobustnessOpti...,2628,1391
3,2019,"Image classification, Image representation, Pa...","Gaussian processes, average consensus, anchor ...",2758,1478


In [74]:
years_keywords['Unique Keywords'][0]

'Gaussian processes, GAN, Portable document format, KernelEstimationProbability distributionIndependent component analysisDeconvolutionImage reconstructionNoise measurementLinear timedensity estimationdensity derivativeprojection pursuitindependent component analysisnonparametric regressionimage deconvolutionimage denoisingimage reconstruction, Mathematical model, imagebased localization, dense motion estimation, part location, Bit rateLossy image compressionconvolutional networksarithmetic codings, FaceThreedimensional displaysTwo dimensional displaysSolid modelingTrackingCamerasVideo stabilizationface modeling, Data visualization, energybased models, unsupervised video object segmentation, TensorsMinimizationImage color analysisPrincipal component analysisPeriodic structuresSparse matricesLinear programmingTensor singular value decompositionrobust principal component analysismultidimensional image denoising, Waste materials, Pose estimationThreedimensional displaysTwo dimensional dis

### Preprocess Keywords

In [75]:
def tokenize(text):
    return text.split(', ')

In [76]:
import nltk
nltk.download('stopwords')
# get unique English stopwords
stopwords = set(nltk.corpus.stopwords.words("english"))    

def remove_stop(tokens):
    # remove stopwords from tokens
    return [t for t in tokens if t not in stopwords]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
# create a data pipeline to perform tokenization, stopwords removal, and stemming
pipeline = [tokenize, remove_stop]

def prepare(text, pipeline):
    tokens = text
    
    for transform in pipeline:
        tokens = transform(tokens)
    return tokens

In [84]:
from nltk.stem import WordNetLemmatizer

# create a lemmatizer  object for lemmatizing
lemmatizer = WordNetLemmatizer()   

# tokenize keywords
years_keywords['Keyword Tokens'] = years_keywords['Unique Keywords'].apply(prepare, pipeline=pipeline)
years_keywords['Keyword Tokens'] = years_keywords['Keyword Tokens'].map(lambda x:
                                                                        [lemmatizer.lemmatize(token) for token in x])
years_keywords.tail()

Unnamed: 0,Year,Keywords,Unique Keywords,Number of Keywords,Number of Unique Keywords,Keyword Tokens
3,2019,"Image classification, Image representation, Pa...","Gaussian processes, average consensus, anchor ...",2758,1478,"[Gaussian processes, average consensus, anchor..."
2,2020,"Trajectory, Manifolds, Threedimensional displa...","Gaussian processes, multiple datasets, unsuper...",2813,1480,"[Gaussian processes, multiple datasets, unsupe..."
0,2021,"Motion segmentation, Computer vision, Transmis...","Gaussian processes, GAN, Portable document for...",3462,1741,"[Gaussian processes, GAN, Portable document fo..."
1,2022,"Jacobian matrices, Explosions, Measurement, Bi...","Gaussian processes, Future instance segmentati...",3315,1661,"[Gaussian processes, Future instance segmentat..."
13,2023,"Web search, Estimation, Industrial Internet of...","artificial intelligence, Hair, Object detectio...",97,75,"[artificial intelligence, Hair, Object detecti..."


### Keyword Frequency Distribution per Year

In [89]:
from collections import Counter

# a list of dataframe freq_kw
freq_kw_list = []

# get a list of years that have keywords
years_list = years_keywords[years_keywords['Number of Keywords'] > 0]['Year'].to_list()

# iterate through years and get the top count tokens
for year in years_list:
    token_kw_list = years_keywords.loc[(years_keywords['Year'] == year) & \
                                       (years_keywords['Number of Keywords'] > 0)]['Keyword Tokens'].values[0]
    counter = Counter(token_kw_list)
    
    # transform counter into a DataFrame
    freq_kw = pd.DataFrame.from_dict(counter, orient='index', columns=['Frequency'])
    freq_kw['Year'] = year
    freq_kw.index.name = 'Token'
    freq_kw.reset_index(inplace=True)
    freq_kw.sort_values(by='Frequency', ascending=False, inplace=True)
    
    freq_kw_list.append(freq_kw)

print('Length of freq_kw_list:', len(freq_kw_list))

# merge all freq_kw dataframes 
years_freq_kw = pd.concat(freq_kw_list, axis=0, ignore_index=True)
years_freq_kw.sort_values(by='Year', inplace=True)
print('Shape of dataframe:', years_freq_kw.shape)
years_freq_kw

Length of freq_kw_list: 14
Shape of dataframe: (18401, 3)


Unnamed: 0,Token,Frequency,Year
0,Markov random fields,2,2010
922,compositionality,1,2010
921,Mirrors,1,2010
920,Mathematical model,1,2010
919,halfoccluded surface region,1,2010
...,...,...,...
18348,annotated training data,1,2023
18347,AI services,1,2023
18346,MR,1,2023
18344,Surveillance,1,2023


### Total Count per Keyword

In [90]:
# get keyword counts
keyword_counts = pd.DataFrame(years_freq_kw.groupby('Token')['Frequency'].sum().sort_values(ascending=False))
keyword_counts.reset_index(inplace=True)
keyword_counts.rename(columns={'Token':'Keyword'}, inplace=True)

print(keyword_counts.shape)
keyword_counts.head()

(11404, 2)


Unnamed: 0,Keyword,Frequency
0,Biometrics,16
1,Object detection,14
2,Data models,14
3,Image segmentation,14
4,Visualization,14


### Keywords Distribution per Year

In [93]:
keywords_df_list = []       # a list of data frame of keyword distribution
keywords_timeline = []      # a list of year that keyword occurs

for token in keyword_counts['Keyword'].values:
    # get keywords distribution per year
    temp_kw_df = years_freq_kw[years_freq_kw['Token'] == token]
    keywords_df_list.append(temp_kw_df)
    keywords_timeline.append(list(temp_kw_df['Year'].values))
    
# merge all dataframes 
kw_dist = pd.concat(keywords_df_list, axis=0, ignore_index=True)
print('Data shape:', kw_dist.shape)
kw_dist.head()

Data shape: (18401, 3)


Unnamed: 0,Token,Frequency,Year
0,Biometrics,1,2010
1,Biometrics,1,2011
2,Biometrics,2,2012
3,Biometrics,1,2013
4,Biometrics,1,2014


### Keywords Timeline

In [94]:
# build timeline list for each Top 20 Keywords
keywords_timeline_str = []


for years_list in keywords_timeline:
    min_year = []      # lower bound of timeline
    max_year = []      # upper bound of timeline
    
    # build lower bound and upper bound for timeline
    for i, year in enumerate(years_list):
        if i == 0: 
            # set value for lower bound if this is the first item in years_list
            min_year.append(year)
        else:
            if len(max_year) == 0:
                if year == min_year[len(min_year) - 1] + 1:
                    # if current year equals previous min year + 1
                    # set value for upper bound
                    max_year.append(year)
                else:
                    # if current year is not an increment of previous min year (non-consecutive year)
                    max_year.append(0)      # set upper bound to zero to indicate a gap in the timeline
                    min_year.append(year)   # set value for the next sequence in the timeline (lower bound)
            elif len(min_year) > len(max_year) and year == min_year[len(min_year) - 1] + 1:
                # if there exists a lower bound but no upper bound value
                # and current year is an increment of the previous min_year
                # set upper bound value to current year
                max_year.append(year)
            elif len(min_year) > len(max_year) and year > min_year[len(min_year) - 1] + 1:
                # if there exists a lower bound but no upper bound value
                # and current year is NOT an increment of the previous min_year
                # append 0 to max_year to indicate there is a gap in year
                max_year.append(0)
                min_year.append(year)
            elif len(min_year) == len(max_year) and year == max_year[len(max_year) - 1] + 1:
                # if there exists a timeline for current year
                # update the upper bound to current year
                max_year[len(max_year) - 1] = year
            elif len(min_year) == len(max_year) and year > max_year[len(max_year) - 1] + 1:
                # if current year is not an increment of the upper bound of the current timeline
                # add year to the new lower bound timeline
                min_year.append(year)
    
    # if len of lower bound and len of upper bound are not equal
    # set the last item in upper to zero to signify the end of timeline
    if len(min_year) > len(max_year):
        max_year.append(0)
    
    # iterate through min_year
    text = ''
    for j, yr in enumerate(min_year):
        if len(text) == 0:
            text = str(yr)
        else:
            text = text + ', ' + str(yr)

        if max_year[j] > 0:
            text = text + '-' + str(max_year[j])
            
    keywords_timeline_str.append(text)


# add timeline to keyword_counts dataframe
keyword_counts['Timeline'] = keywords_timeline_str
keyword_counts.head()

Unnamed: 0,Keyword,Frequency,Timeline
0,Biometrics,16,2010-2021
1,Object detection,14,2010-2023
2,Data models,14,2010-2023
3,Image segmentation,14,2010-2023
4,Visualization,14,2010-2023


## Export Data

In [95]:
# keywords per year
years_keywords.to_csv('extracted_files/keywords_per_year.csv', index=False)

# keywords distribution per year
years_freq_kw.to_csv('extracted_files/keywords_dist_per_year.csv', index=False)

# keyword counts (all years)
keyword_counts.to_csv('extracted_files/keyword_counts.csv', index=False)