In [1]:
import warnings
warnings.filterwarnings('ignore')

import spacy
import gensim
import pandas as pd
from textblob import Word
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS

In [2]:
file = 'total.xlsx'
df = pd.read_excel(file, 'Sheet1', header=0, engine='openpyxl')
print(df.category.value_counts())
df

Networks and Communications                                                                        3417
Information Systems, Search, Information Retrieval, Database Systems, Data Mining, Data Science    3376
Artificial Intelligence, Machine Learning, Computer Vision, Natural language processing            3240
Web, Mobile and Multimedia Technologies                                                            3127
Security and Privacy                                                                               3092
Name: category, dtype: int64


Unnamed: 0,category,title,abstract
0,"Artificial Intelligence, Machine Learning, Com...",Replicated Computations Results (RCR) Report f...,“A Holistic Approach for Collaborative Workloa...
1,"Artificial Intelligence, Machine Learning, Com...",Understanding Assimilation-contrast Effects in...,"“Unbiasedness,” which is an important property..."
2,"Artificial Intelligence, Machine Learning, Com...",Seasonal-Periodic Subgraph Mining in Temporal ...,\emphSeasonal periodicity is a frequent phenom...
3,"Artificial Intelligence, Machine Learning, Com...",Pose estimation of anime/manga characters: a c...,2D articulated pose estimation is the task of ...
4,"Artificial Intelligence, Machine Learning, Com...",One shot 3D photography,3D photography is a new medium that allows vie...
...,...,...,...
16247,"Web, Mobile and Multimedia Technologies",Young people's values: identifying trends in R...,The paper focuses on changes that have taken p...
16248,"Web, Mobile and Multimedia Technologies",Effective solution for a medical tourism aggre...,This paper analyzes the current active Medical...
16249,"Web, Mobile and Multimedia Technologies",A hands-on approach to the web of things: the ...,Internet of Things (IoT) devices are becoming ...
16250,"Web, Mobile and Multimedia Technologies",Applying a tendency to be well retweeted to fa...,While a lot of useful information can be found...


In [3]:
def singularize(words):
    word = Word(words)
    word = word.lemmatize()
    return word

In [4]:
def make_bigrams(texts):
    return bigram_mod[texts]

In [5]:
def make_trigrams(texts):
    return trigram_mod[bigram_mod[texts]]

In [6]:
# nltk.download('stopwords')
ss = spacy.load('en_core_web_sm')

sp_stopwords = ss.Defaults.stop_words
gensim_stopwords = STOPWORDS
stopwords = stopwords.words('english')
stopwords.extend(sp_stopwords)
stopwords.extend(gensim_stopwords)
stop_words = list(set(stopwords))

In [7]:
# title + abstract
df['data'] = df['title'] + ' ' + df['abstract']

# lowercase
df['data'] = df['data'].str.lower()

# tokenize
df['data'] = df['data'].map(gensim.utils.simple_preprocess)

# remove punctuations
df['data'] = df['data'].apply(lambda x: [item.replace("[^a-zA-Z]", "") for item in x])

# remove too short words (less than 3 letters)
df['data'] = df['data'].apply(lambda x: [w for w in x if len(w) > 3])

texts = df['data']
texts = texts.apply(lambda x: [item for item in x if item not in stop_words])
texts = texts.apply(lambda x: [singularize(item) for item in x]) 

bigram = gensim.models.Phrases(texts, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
texts = texts.map(make_bigrams)

trigram = gensim.models.Phrases(texts, threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)
texts = texts.map(make_trigrams)

for text in texts:
    print(text)

['replicated', 'computation', 'result', 'report', 'holistic', 'approach', 'collaborative', 'workload', 'execution', 'volunteer', 'cloud', 'holistic', 'approach', 'collaborative', 'workload', 'execution', 'volunteer', 'cloud', 'proposes', 'novel', 'approach', 'task', 'scheduling', 'volunteer', 'cloud', 'volunteer', 'cloud', 'decentralized', 'cloud', 'system', 'based', 'collaborative', 'task', 'execution', 'client', 'voluntarily', 'share', 'unused', 'computational', 'resource', 'simulation', 'based', 'statistical', 'analysis', 'technique', 'particular', 'statistical', 'model', 'checking', 'author', 'approach', 'outperform', 'existing', 'distributed', 'task', 'scheduling', 'algorithm', 'case', 'computation', 'intensive', 'workload', 'analysis', 'considered', 'realistic', 'workload', 'benchmark', 'provided', 'google', 'replicated', 'computation', 'result', 'report', 'focus', 'prototypical', 'tool', 'implementation', 'article', 'perform', 'analysis', 'software', 'straightforward', 'install'

['indoor', 'quality', 'position', 'visual', 'assessment', 'crowdsourced', 'fingerprint', 'map', 'internet', 'based', 'indoor_navigation', 'architecture', 'organize', 'signal', 'collected', 'crowdsourcers', 'fingerprint', 'map', 'improve', 'localization', 'given', 'satellite', 'based', 'technology', 'operate', 'accurately', 'indoor', 'space', 'people', 'spend', 'time', 'article', 'study', 'quality', 'position', 'assessment', 'problem', 'aim', 'ass', 'offline', 'manner', 'localization', 'accuracy', 'obtained', 'user', 'aim', 'localize', 'particularly', 'proposed', 'acces', 'framework', 'us', 'generic', 'interpolation', 'method', 'gaussian', 'process', 'navigability', 'score', 'location', 'derived', 'cramer', 'lower_bound', 'crlb', 'derive', 'adaptation', 'acces', 'magnetic', 'data', 'implement', 'complete', 'visual', 'assessment', 'environment', 'incorporated', 'anyplace', 'open', 'source', 'experimental', 'evaluation', 'acces', 'anyplace', 'suggests', 'high', 'qualitative_quantitative',

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [34]:
clean = []
for text in texts:
    sentence = ' '.join(text)
    clean.append(sentence)

In [36]:
new_df = pd.DataFrame(columns=['short', 'category', 'content'])

new_df['category'] = df.category
new_df['content'] = clean
new_df['short'] = df.category.str.replace('Artificial Intelligence, Machine Learning, Computer Vision, Natural language processing', 'ai')
new_df['short'] = new_df.short.str.replace('Information Systems, Search, Information Retrieval, Database Systems, Data Mining, Data Science', 'is')
new_df['short'] = new_df.short.str.replace('Networks and Communications', 'network')
new_df['short'] = new_df.short.str.replace('Security and Privacy', 'security')
new_df['short'] = new_df.short.str.replace('Web, Mobile and Multimedia Technologies', 'mobile')

new_df.to_excel('LSTM.xlsx', index=False)