# Data Preprocessing

This notebook builds a master dataframe of financial journals, clean the data, and export data to csv files.

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
cd drive/MyDrive/ML_Trending_Topics/

/content/drive/MyDrive/ML_Trending_Topics


In [3]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import StandardScaler
from nltk.stem import WordNetLemmatizer
import regex as re
import numpy as np
import pandas as pd
import warnings
import glob
warnings.filterwarnings('ignore')

## Load Dataset

### Improved

In [4]:
# Get data file names
path = 'Dataset'
filenames = glob.glob(path + "/ieee*.csv")

dfs = []
for filename in filenames:
    dfs.append(pd.read_csv(filename, error_bad_lines=False))

# Concatenate all data into one DataFrame
big_frame_csv = pd.concat(dfs, ignore_index=True)

b'Skipping line 4: expected 1 fields, saw 6\nSkipping line 5: expected 1 fields, saw 6\nSkipping line 6: expected 1 fields, saw 6\nSkipping line 7: expected 1 fields, saw 6\nSkipping line 8: expected 1 fields, saw 6\nSkipping line 9: expected 1 fields, saw 6\nSkipping line 10: expected 1 fields, saw 6\nSkipping line 11: expected 1 fields, saw 6\nSkipping line 12: expected 1 fields, saw 6\nSkipping line 13: expected 1 fields, saw 6\nSkipping line 14: expected 1 fields, saw 6\nSkipping line 15: expected 1 fields, saw 6\nSkipping line 16: expected 1 fields, saw 6\nSkipping line 17: expected 1 fields, saw 6\nSkipping line 18: expected 1 fields, saw 6\nSkipping line 19: expected 1 fields, saw 6\nSkipping line 20: expected 1 fields, saw 6\nSkipping line 21: expected 1 fields, saw 6\nSkipping line 22: expected 1 fields, saw 6\nSkipping line 23: expected 1 fields, saw 6\nSkipping line 24: expected 1 fields, saw 6\nSkipping line 25: expected 1 fields, saw 6\nSkipping line 26: expected 1 fields,

In [5]:
# Get data file names
path = 'Dataset'
filenames = glob.glob(path + "/ieee*.xlsx")

dfs = []
for filename in filenames:
    dfs.append(pd.read_excel(filename))

# Concatenate all data into one DataFrame
big_frame_xl = pd.concat(dfs, ignore_index=True)

In [6]:
filenames

['Dataset/ieeexplore-2023.xlsx',
 'Dataset/ieeexplore2010-2014.xlsx',
 'Dataset/ieee2010-2022-2.csv.xlsx',
 'Dataset/ieeexplore2010-2022.xlsx',
 'Dataset/ieee2015-2022-6.xlsx',
 'Dataset/ieee2015-2022-5.xlsx']

In [7]:
big_frame_xl['year'].unique()

array([2023., 2022., 2010., 2011., 2012., 2013., 2014., 2021., 2020.,
       2019., 2017., 2018., 2015., 2016.])

In [8]:
big_frame_xl['year'] = big_frame_xl['year'].astype(int)
big_frame_xl

Unnamed: 0,Authors,Title,year,month,Abstract,Keywords,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Chen Zhang; Haodi Zhang; Qifan Li; Kaishun Wu;...,Burstiness-Aware Web Search Analysis on Differ...,2023,March,Personalizing the analysis for web search pote...,"Web search , Biological system modeling , Comp...",,,,,
1,Hongjie Cai; Yue Gao; Manhua Liu,Graph Transformer Geometric Learning of Brain ...,2023,February,Brain age is considered as an important biomar...,"Estimation , Diffusion tensor imaging , Transf...",,,,,
2,Premkumar Chithaluru; Fadi AL-Turjman; Manoj K...,Computational Intelligence Inspired Adaptive O...,2023,January,The major issues and challenges of the Industr...,"Industrial Internet of Things , Peer-to-peer c...",,,,,
3,Mads Olsen; Jamie M. Zeitzer; Risa N. Richards...,A Flexible Deep Learning Architecture for Temp...,2023,January,Wrist-worn consumer sleep technologies (CST) t...,"Recording , Sleep apnea , Deep learning , Deco...",,,,,
4,Jeongseok Hyun; Myunggu Kang; Dongyoon Wee; Di...,Detection Recovery in Online Multi-Object Trac...,2023,February,In existing joint detection and tracking metho...,"Computer vision , Target tracking , Codes , Im...",,,,,
...,...,...,...,...,...,...,...,...,...,...,...
10199,"Henter GE,Kleijn WB",Minimum Entropy Rate Simplification of Stochas...,2016,December,We propose minimum entropy rate simplification...,Markov processes;Stochastic processes;Gaussian...,,,,,
10200,"Wang T,Gong S,Zhu X,Wang S",Person Re-Identification by Discriminative Sel...,2016,December,Current person re-identification (ReID) method...,Image sequences;Cameras;Gait recognition;Visua...,,,,,
10201,"Türetken E,Benmansour F,Andres B,Głowacki P,Pf...",Reconstructing Curvilinear Networks Using Path...,2016,December,We propose a novel approach to automated delin...,Linear programming;Image reconstruction;Curvil...,,,,,
10202,"Venkataraman V,Turaga P",Shape Distributions of Nonlinear Dynamical Sys...,2016,December,This paper presents a shape-theoretic framewor...,Chaos theory;Hidden Markov models;Analytical m...,,,,,


In [9]:
big_frame_csv.drop(columns = ['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', '# CSV-File created with merge-csv.com'], inplace=True)

In [10]:
big_frame_csv

Unnamed: 0,Authors,Title,year,month,Abstract,Keywords
0,"Xu X,Cheong LF,Li Z",3D Rigid Motion Segmentation with Mixed and Un...,2021.0,January,Many real-world video sequences cannot be conv...,"Motion segmentation,Computer vision,Transmissi..."
1,"Johnson R,Zhang T",A Framework of Composite Functional Gradient M...,2021.0,January,Generative adversarial networks (GAN) are trai...,"Generative adversarial networks,Generators,Gal..."
2,"Fan Q,Chen D,Yuan L,Hua G,Yu N,Chen B",A General Decoupled Learning Framework for Par...,2021.0,January,Many different deep networks have been used to...,"Convolution,Task analysis,Image resolution,Acc..."
3,"Chen L,Zheng Y,Shi B,Subpa-asa A,Sato I",A Microfacet-Based Model for Photometric Stere...,2021.0,January,"This paper presents a precise, stable, and inv...","Ellipsoids,Shape,Rendering (computer graphics)..."
4,"Zhou Y,Cheung YM",Bayesian Low-Tubal-Rank Robust Tensor Factoriz...,2021.0,January,Robust tensor factorization is a fundamental p...,"Bayes methods,Principal component analysis,Ada..."
...,...,...,...,...,...,...
10101,"Wang T,Gong S,Zhu X,Wang S",Person Re-Identification by Discriminative Sel...,2016.0,December,Current person re-identification (ReID) method...,"Image sequences,Cameras,Gait recognition,Visua..."
10102,"Türetken E,Benmansour F,Andres B,Głowacki P,Pf...",Reconstructing Curvilinear Networks Using Path...,2016.0,December,We propose a novel approach to automated delin...,"Linear programming,Image reconstruction,Curvil..."
10103,"Venkataraman V,Turaga P",Shape Distributions of Nonlinear Dynamical Sys...,2016.0,December,This paper presents a shape-theoretic framewor...,"Chaos theory,Hidden Markov models,Analytical m..."
10104,"Leifman G,Shtrom E,Tal A",Surface Regions of Interest for Viewpoint Sele...,2016.0,December,While the detection of the interesting regions...,"Three-dimensional displays,Surface treatment,H..."


In [11]:
big_frame_xl.drop(columns = ['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10'], inplace=True)

In [12]:
list_df = pd.concat([big_frame_csv, big_frame_xl], ignore_index=True)

In [13]:
list_df.drop_duplicates(inplace=True)

### Original

In [14]:
'''# get a list of csv file in folder Dataset
dataset_p = 'Dataset/ieee2015-2022-6.csv'

# read each csv file into a Pandas dataframe
# and store the dataframes into a list
list_df = pd.read_csv(dataset_p, encoding="ISO-8859-1") 

# display number of dataframes created
# print("Number of Data Frames:", len(list_df))

# look at the first dataframe
list_df'''

'# get a list of csv file in folder Dataset\ndataset_p = \'Dataset/ieee2015-2022-6.csv\'\n\n# read each csv file into a Pandas dataframe\n# and store the dataframes into a list\nlist_df = pd.read_csv(dataset_p, encoding="ISO-8859-1") \n\n# display number of dataframes created\n# print("Number of Data Frames:", len(list_df))\n\n# look at the first dataframe\nlist_df'

In [15]:
list_df.head()

Unnamed: 0,Authors,Title,year,month,Abstract,Keywords
0,"Xu X,Cheong LF,Li Z",3D Rigid Motion Segmentation with Mixed and Un...,2021.0,January,Many real-world video sequences cannot be conv...,"Motion segmentation,Computer vision,Transmissi..."
1,"Johnson R,Zhang T",A Framework of Composite Functional Gradient M...,2021.0,January,Generative adversarial networks (GAN) are trai...,"Generative adversarial networks,Generators,Gal..."
2,"Fan Q,Chen D,Yuan L,Hua G,Yu N,Chen B",A General Decoupled Learning Framework for Par...,2021.0,January,Many different deep networks have been used to...,"Convolution,Task analysis,Image resolution,Acc..."
3,"Chen L,Zheng Y,Shi B,Subpa-asa A,Sato I",A Microfacet-Based Model for Photometric Stere...,2021.0,January,"This paper presents a precise, stable, and inv...","Ellipsoids,Shape,Rendering (computer graphics)..."
4,"Zhou Y,Cheung YM",Bayesian Low-Tubal-Rank Robust Tensor Factoriz...,2021.0,January,Robust tensor factorization is a fundamental p...,"Bayes methods,Principal component analysis,Ada..."


In [16]:
# list_df.drop('Unnamed: 10', axis=1, inplace=True)

In [17]:
list_df

Unnamed: 0,Authors,Title,year,month,Abstract,Keywords
0,"Xu X,Cheong LF,Li Z",3D Rigid Motion Segmentation with Mixed and Un...,2021.0,January,Many real-world video sequences cannot be conv...,"Motion segmentation,Computer vision,Transmissi..."
1,"Johnson R,Zhang T",A Framework of Composite Functional Gradient M...,2021.0,January,Generative adversarial networks (GAN) are trai...,"Generative adversarial networks,Generators,Gal..."
2,"Fan Q,Chen D,Yuan L,Hua G,Yu N,Chen B",A General Decoupled Learning Framework for Par...,2021.0,January,Many different deep networks have been used to...,"Convolution,Task analysis,Image resolution,Acc..."
3,"Chen L,Zheng Y,Shi B,Subpa-asa A,Sato I",A Microfacet-Based Model for Photometric Stere...,2021.0,January,"This paper presents a precise, stable, and inv...","Ellipsoids,Shape,Rendering (computer graphics)..."
4,"Zhou Y,Cheung YM",Bayesian Low-Tubal-Rank Robust Tensor Factoriz...,2021.0,January,Robust tensor factorization is a fundamental p...,"Bayes methods,Principal component analysis,Ada..."
...,...,...,...,...,...,...
20305,"Henter GE,Kleijn WB",Minimum Entropy Rate Simplification of Stochas...,2016.0,December,We propose minimum entropy rate simplification...,Markov processes;Stochastic processes;Gaussian...
20306,"Wang T,Gong S,Zhu X,Wang S",Person Re-Identification by Discriminative Sel...,2016.0,December,Current person re-identification (ReID) method...,Image sequences;Cameras;Gait recognition;Visua...
20307,"Türetken E,Benmansour F,Andres B,Głowacki P,Pf...",Reconstructing Curvilinear Networks Using Path...,2016.0,December,We propose a novel approach to automated delin...,Linear programming;Image reconstruction;Curvil...
20308,"Venkataraman V,Turaga P",Shape Distributions of Nonlinear Dynamical Sys...,2016.0,December,This paper presents a shape-theoretic framewor...,Chaos theory;Hidden Markov models;Analytical m...


In [18]:
list_df['year_con'] = pd.to_numeric(list_df.year,errors='coerce')

In [19]:
list_df['year_con'].std()


3.5765036792015263

In [20]:
# merge all dataframes into a master dataframe
df_master = list_df

# drop rows that have missing values
df_master.dropna(axis=0, how='any',inplace=True)

# get the name of first author
df_master['author_first'] = df_master.Authors.apply(lambda x: x[1:-1].split(',')[0][1:-1])

# duplicate Year
df_master['year_master'] = df_master['year_con'].copy()

# compute the standard deviation for Year
df_master['year_std'] = df_master['year_con'] / df_master['year_con'].std()

# apply standard scaler on Year
df_master['year_con'] = StandardScaler().fit_transform(df_master['year_con'].values.reshape(-1, 1))

# display the shape of df_masters
print("Data's shape: ", df_master.shape)

# look at df_master
df_master.head()

Data's shape:  (4604, 10)


Unnamed: 0,Authors,Title,year,month,Abstract,Keywords,year_con,author_first,year_master,year_std
0,"Xu X,Cheong LF,Li Z",3D Rigid Motion Segmentation with Mixed and Un...,2021.0,January,Many real-world video sequences cannot be conv...,"Motion segmentation,Computer vision,Transmissi...",0.999594,,2021.0,566.928668
1,"Johnson R,Zhang T",A Framework of Composite Functional Gradient M...,2021.0,January,Generative adversarial networks (GAN) are trai...,"Generative adversarial networks,Generators,Gal...",0.999594,hnson,2021.0,566.928668
2,"Fan Q,Chen D,Yuan L,Hua G,Yu N,Chen B",A General Decoupled Learning Framework for Par...,2021.0,January,Many different deep networks have been used to...,"Convolution,Task analysis,Image resolution,Acc...",0.999594,n,2021.0,566.928668
3,"Chen L,Zheng Y,Shi B,Subpa-asa A,Sato I",A Microfacet-Based Model for Photometric Stere...,2021.0,January,"This paper presents a precise, stable, and inv...","Ellipsoids,Shape,Rendering (computer graphics)...",0.999594,en,2021.0,566.928668
4,"Zhou Y,Cheung YM",Bayesian Low-Tubal-Rank Robust Tensor Factoriz...,2021.0,January,Robust tensor factorization is a fundamental p...,"Bayes methods,Principal component analysis,Ada...",0.999594,ou,2021.0,566.928668


## Data Preprocessing & Cleaning

#### Original

In [21]:
'''import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))  
p = re.compile(r'[^\w\s]+')
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
df_master['t'] = df_master.Abstract.map(lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))
df_master['t'] = df_master.t.map(lambda x: ' '.join([y for y in x.split(' ') if not y in stops]))
df_master['t'] = df_master.t.map(lambda x: ' '.join([p.sub('', x) for x in x.split(' ')]))
df_master.head()'''

'import nltk\nnltk.download(\'stopwords\')\nfrom nltk.corpus import stopwords\nstops = set(stopwords.words("english"))  \np = re.compile(r\'[^\\w\\s]+\')\nlemmatizer = WordNetLemmatizer()\nstemmer = SnowballStemmer("english")\ndf_master[\'t\'] = df_master.Abstract.map(lambda x: \' \'.join([stemmer.stem(y) for y in x.split(\' \')]))\ndf_master[\'t\'] = df_master.t.map(lambda x: \' \'.join([y for y in x.split(\' \') if not y in stops]))\ndf_master[\'t\'] = df_master.t.map(lambda x: \' \'.join([p.sub(\'\', x) for x in x.split(\' \')]))\ndf_master.head()'

#### Improved

In [22]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))  
p = re.compile(r'[^\w\s]+')
lemmatizer = WordNetLemmatizer()
df_master['t'] = df_master.Abstract.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))
df_master['t'] = df_master.t.map(lambda x: ' '.join([y for y in x.split(' ') if not y in stops]))
df_master['t'] = df_master.t.map(lambda x: ' '.join([p.sub(' ', x) for x in x.split(' ')]))
df_master.head()

Unnamed: 0,Authors,Title,year,month,Abstract,Keywords,year_con,author_first,year_master,year_std,t
0,"Xu X,Cheong LF,Li Z",3D Rigid Motion Segmentation with Mixed and Un...,2021.0,January,Many real-world video sequences cannot be conv...,"Motion segmentation,Computer vision,Transmissi...",0.999594,,2021.0,566.928668,Many real world video sequence cannot convenie...
1,"Johnson R,Zhang T",A Framework of Composite Functional Gradient M...,2021.0,January,Generative adversarial networks (GAN) are trai...,"Generative adversarial networks,Generators,Gal...",0.999594,hnson,2021.0,566.928668,Generative adversarial network GAN trained m...
2,"Fan Q,Chen D,Yuan L,Hua G,Yu N,Chen B",A General Decoupled Learning Framework for Par...,2021.0,January,Many different deep networks have been used to...,"Convolution,Task analysis,Image resolution,Acc...",0.999594,n,2021.0,566.928668,Many different deep network used approximate ...
3,"Chen L,Zheng Y,Shi B,Subpa-asa A,Sato I",A Microfacet-Based Model for Photometric Stere...,2021.0,January,"This paper presents a precise, stable, and inv...","Ellipsoids,Shape,Rendering (computer graphics)...",0.999594,en,2021.0,566.928668,This paper present precise stable invertible...
4,"Zhou Y,Cheung YM",Bayesian Low-Tubal-Rank Robust Tensor Factoriz...,2021.0,January,Robust tensor factorization is a fundamental p...,"Bayes methods,Principal component analysis,Ada...",0.999594,ou,2021.0,566.928668,Robust tensor factorization fundamental proble...


#### Original

In [24]:
'''import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

stops = set(stopwords.words("english"))    # get unique English stopwords
regex = re.compile(r'[^\w\s]+')            # regular expression pattern
stemmer = SnowballStemmer("english")       # create a Snowball Stemmer object for stemming

  
lemmatizer = WordNetLemmatizer()
  
# duplicate a subset of the master dataframe
df = df_master[['year_master', 'year', 'year_std','month', 'Keywords', 'Abstract']]
df.rename(columns={'year_master': 'Year', 'year': 'Year_Scaled', 
                   'year_std': 'Year_STD','month': 'Month',  'Keywords': 'Keywords',
                   'abstract': 'Abstract'}, inplace=True)

# clean the abstract
df['Abstract_Cleaned'] = df.Abstract.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))
df['Abstract_Cleaned'] = df.Abstract_Cleaned.map(lambda x: ' '.join([y for y in x.split(' ') if not y in stops]))
# df['Abstract_Cleaned'] = df.Abstract_Cleaned.map(lambda x: ' '.join([regex.sub('', x) for x in x.split(' ')]))

# get the length of the abstract
df['Abstract Length'] = df.Abstract.str.len()'''

'import nltk\nnltk.download(\'stopwords\')\nfrom nltk.corpus import stopwords\nimport re\nimport nltk\nnltk.download(\'wordnet\')\nnltk.download(\'omw-1.4\')\n\nstops = set(stopwords.words("english"))    # get unique English stopwords\nregex = re.compile(r\'[^\\w\\s]+\')            # regular expression pattern\nstemmer = SnowballStemmer("english")       # create a Snowball Stemmer object for stemming\n\n  \nlemmatizer = WordNetLemmatizer()\n  \n# duplicate a subset of the master dataframe\ndf = df_master[[\'year_master\', \'year\', \'year_std\',\'month\', \'Keywords\', \'Abstract\']]\ndf.rename(columns={\'year_master\': \'Year\', \'year\': \'Year_Scaled\', \n                   \'year_std\': \'Year_STD\',\'month\': \'Month\',  \'Keywords\': \'Keywords\',\n                   \'abstract\': \'Abstract\'}, inplace=True)\n\n# clean the abstract\ndf[\'Abstract_Cleaned\'] = df.Abstract.map(lambda x: \' \'.join([lemmatizer.lemmatize(y) for y in x.split(\' \')]))\ndf[\'Abstract_Cleaned\'] = df.A

#### Improved

In [25]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

stops = set(stopwords.words("english"))    # get unique English stopwords
regex = re.compile(r'[^\w\s]+')            # regular expression pattern
stemmer = SnowballStemmer("english")       # create a Snowball Stemmer object for stemming

  
lemmatizer = WordNetLemmatizer()
  
# duplicate a subset of the master dataframe
df = df_master[['year_master', 'year', 'year_std','month', 'Keywords', 'Abstract']]
df.rename(columns={'year_master': 'Year', 'year': 'Year_Scaled', 
                   'year_std': 'Year_STD','month': 'Month',  'Keywords': 'Keywords',
                   'abstract': 'Abstract'}, inplace=True)

# clean the abstract
df['Abstract_Cleaned'] = df.Abstract.map(lambda x: ' '.join([lemmatizer.lemmatize(y) for y in x.split(' ')]))
df['Abstract_Cleaned'] = df.Abstract_Cleaned.map(lambda x: ' '.join([y for y in x.split(' ') if not y in stops]))
df['Abstract_Cleaned'] = df.Abstract_Cleaned.map(lambda x: ' '.join([regex.sub('', x) for x in x.split(' ')]))
df['Keywords'] = df['Keywords'].replace('[()]', ',', regex=True)
df['Keywords'] = df['Keywords'].replace('\$360\^\\\\circ\s360\sâ\\x88\\x98', ',', regex=True)
# get the length of the abstract
df['Abstract Length'] = df.Abstract.str.len()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [26]:
df

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length
0,2021.0,2021.0,566.928668,January,"Motion segmentation,Computer vision,Transmissi...",Many real-world video sequences cannot be conv...,Many realworld video sequence cannot convenien...,1594
1,2021.0,2021.0,566.928668,January,"Generative adversarial networks,Generators,Gal...",Generative adversarial networks (GAN) are trai...,Generative adversarial network GAN trained min...,955
2,2021.0,2021.0,566.928668,January,"Convolution,Task analysis,Image resolution,Acc...",Many different deep networks have been used to...,Many different deep network used approximate a...,1393
3,2021.0,2021.0,566.928668,January,"Ellipsoids,Shape,Rendering ,computer graphics,...","This paper presents a precise, stable, and inv...",This paper present precise stable invertible r...,914
4,2021.0,2021.0,566.928668,January,"Bayes methods,Principal component analysis,Ada...",Robust tensor factorization is a fundamental p...,Robust tensor factorization fundamental proble...,1300
...,...,...,...,...,...,...,...,...
20305,2016.0,2016.0,565.526073,December,Markov processes;Stochastic processes;Gaussian...,We propose minimum entropy rate simplification...,We propose minimum entropy rate simplification...,1298
20306,2016.0,2016.0,565.526073,December,Image sequences;Cameras;Gait recognition;Visua...,Current person re-identification (ReID) method...,Current person reidentification ReID method ty...,1095
20307,2016.0,2016.0,565.526073,December,Linear programming;Image reconstruction;Curvil...,We propose a novel approach to automated delin...,We propose novel approach automated delineatio...,951
20308,2016.0,2016.0,565.526073,December,Chaos theory;Hidden Markov models;Analytical m...,This paper presents a shape-theoretic framewor...,This paper present shapetheoretic framework dy...,1701


In [27]:
df['Abstract_Cleaned']=df_master['t']

#### Original

In [28]:
'''def stem_keywords(keywords):
    # create an empty list of keywords
    kw_list = []
    
    # iterate through each keyword in the list and apply stemming
    for kw in keywords[1:-1].split(', '):
        # stem each word in the  keyword
        stemmed_kw = ' '.join(lemmatizer.lemmatize(word) for word in kw.split(' '))
        kw_list.append(stemmed_kw)
    
    return kw_list


# apply stemming to Keywords
df['Keywords_Cleaned'] = df['Keywords'].apply(stem_keywords)'''

"def stem_keywords(keywords):\n    # create an empty list of keywords\n    kw_list = []\n    \n    # iterate through each keyword in the list and apply stemming\n    for kw in keywords[1:-1].split(', '):\n        # stem each word in the  keyword\n        stemmed_kw = ' '.join(lemmatizer.lemmatize(word) for word in kw.split(' '))\n        kw_list.append(stemmed_kw)\n    \n    return kw_list\n\n\n# apply stemming to Keywords\ndf['Keywords_Cleaned'] = df['Keywords'].apply(stem_keywords)"

#### Improved

In [33]:
def stem_keywords(keywords):
    # create an empty list of keywords
    kw_list = []
    
    # iterate through each keyword in the list and apply stemming
    for kw in keywords[0:].split(', '):
        # stem each word in the  keyword
        stemmed_kw = ' '.join(lemmatizer.lemmatize(word) for word in kw.split(' '))
        kw_list.append(kw)
    
    return kw_list


# apply stemming to Keywords
df['Keywords_Cleaned'] = df['Keywords'].apply(stem_keywords)
df['Keywords_Cleaned'] = df['Keywords_Cleaned'].str.replace(r'(?<=\w)-', ' - ').str.replace(';', ', ').str.replace(r'[^\w\s,]', '')
df['Keywords_Cleaned']= df['Keywords_Cleaned'].str.replace(r',\s*,', ',').str.replace(r'[^\w\s,]', '')

#### Improved - Dataframe index and number of keywords were inconsistent

In [34]:
df = df.reset_index(drop=True)

In [35]:
for i in range(0,len(df)):
  df['Keywords_Cleaned'][i] = df['Keywords_Cleaned'][i][0].split(',')

In [36]:
len(df['Keywords_Cleaned'][0])

11

In [37]:
len(df['Keywords_Cleaned'][0])

11

In [38]:
# count number of keywords
df['Number of Keywords'] = df['Keywords_Cleaned'].map(lambda keywords: \
                                                      0 if str(keywords) == "['']" else len(keywords))

In [39]:
# def stem_authors(authors):
#     # create an empty list of keywords
#     kw_list = []
    
#     # iterate through each keyword in the list and apply stemming
#     for kw in authors[1:-1].split(', '):
#         # stem each word in the  keyword
#         stemmed_kw = ' '.join(kw.split(' '))
#         kw_list.append(stemmed_kw)
    
#     return kw_list


# # apply stemming to Keywords
# df['Authors'] = df['Authors'].apply(stem_authors)

### Month

In [40]:
df['Month'] = df['Month'].replace('â\x80\x93', '-', regex=True)

In [41]:
# display unique value of month
df['Month'].unique()

array(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December'],
      dtype=object)

In [42]:
import calendar

def process_month(month):
    '''
    - Convert month name to numeric value
    - Set any invalid month value to -1
    '''
    
    # extract month from a string
    month_extracted = month.split('-')[-1]
    
    # assume month's value is invalid
    month_numeric = -1
    
    if month_extracted.isnumeric() and int(month_extracted) in np.arange(1, 13):
        # month_extracted has value between 1 and 12
        month_numeric = int(month_extracted)
    elif not month_extracted.isnumeric():
        # month_extracted have text value "January" to "December"
        month_numeric = list(calendar.month_name).index(month_extracted)
        
    return month_numeric

In [43]:
# clean month value
df['Month_Cleaned'] = df['Month'].map(process_month)

# take a look at data
df['Year'] = df['Year'].astype(int)
df['Year_Scaled'] = df['Year_Scaled'].astype(int)
df.head()

Unnamed: 0,Year,Year_Scaled,Year_STD,Month,Keywords,Abstract,Abstract_Cleaned,Abstract Length,Keywords_Cleaned,Number of Keywords,Month_Cleaned
0,2021,2021,566.928668,January,"Motion segmentation,Computer vision,Transmissi...",Many real-world video sequences cannot be conv...,Many real world video sequence cannot convenie...,1594,"[Motion segmentation, Computer vision, Transmi...",11,1
1,2021,2021,566.928668,January,"Generative adversarial networks,Generators,Gal...",Generative adversarial networks (GAN) are trai...,Generative adversarial network GAN trained m...,955,"[Generative adversarial networks, Generators, ...",11,1
2,2021,2021,566.928668,January,"Convolution,Task analysis,Image resolution,Acc...",Many different deep networks have been used to...,Many different deep network used approximate ...,1393,"[Convolution, Task analysis, Image resolution,...",11,1
3,2021,2021,566.928668,January,"Ellipsoids,Shape,Rendering ,computer graphics,...","This paper presents a precise, stable, and inv...",This paper present precise stable invertible...,914,"[Ellipsoids, Shape, Rendering , computer graph...",12,1
4,2021,2021,566.928668,January,"Bayes methods,Principal component analysis,Ada...",Robust tensor factorization is a fundamental p...,Robust tensor factorization fundamental proble...,1300,"[Bayes methods, Principal component analysis, ...",11,1


## Export Data

In [44]:
df['Keywords_Cleaned'][0]

['Motion segmentation',
 'Computer vision',
 'Transmission line matrix methods',
 'Threedimensional displays',
 'Adaptation models',
 'Solid modeling',
 'Data models',
 'Spectral clustering',
 'model selection',
 'motion segmentation',
 'multiview learning']

In [45]:
#df_master.to_csv('extracted_files/data_master.csv', index=False)               # save master data
#df.to_csv('extracted_files/data_cleaned.csv', index=False)                    # save cleaned data

In [46]:
df_master.to_csv('extracted_files/data_master.csv', index=False)               # save master data
df.to_csv('extracted_files/data_cleaned.csv', index=False)                    # save cleaned data