# Scopus

In [1]:
import pandas as pd

scopus_1 = pd.read_csv('raw_data/scopus_1.csv')
scopus_2 = pd.read_csv('raw_data/scopus_2.csv')

scopus_tot = pd.concat((scopus_1, scopus_2))
len(scopus_tot)

4000

In [2]:
scopus_tot.drop_duplicates(inplace=True)
len(scopus_tot)

2987

In [3]:
scopus_tot['Title'].iloc[0]

'Secure User Authentication Leveraging Keystroke Dynamics via Wi-Fi Sensing'

In [4]:
scopus_tot['References'].iloc[0]

'Hanamsagar, A., Woo, S.S., Kanich, C., Mirkovic, J., Leveraging semantic transformation to investigate password habits and their causes (2018) Proc. Chi Conf. Hum. Factors Comput. Syst, , Art 570; Data-breach (2020) Data-breach, , https://www.idagent.com/blog/10-facts-About-passwords-That-you-need-To-see-now/; Gu, Y., Zhan, J., Ji, Y., Li, J., Ren, F., Gao, S., MoSense: An RF-based motion detection system via off-The-shelf Wi-Fi devices Ieee Internet Things J, 4 (6), pp. 2326-2341. , Dec. 2017; Gu, Y., Zhang, X., Liu, Z., Ren, F., BeSense: LeveragingWiFi channel data and computational intelligence for behavior analysis Ieee Comput. Intell. Mag, 14 (4), pp. 31-41. , Nov. 2019; Liu, X., Cao, J., Tang, S., Wen, J., Guo, P., Contactless respiration monitoring via off-The-shelfWiFi devices (2016) Ieee Trans. Mobile Comput, 15 (10), pp. 2466-2479. , Oct; Roth, J., Liu, X., Metaxas, D., On continuous user authentication via typing behavior (2014) Ieee Trans. Image Process, 23 (10), pp. 4611-

In [5]:
scopus_tot.columns

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume',
       'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
       'DOI', 'Link', 'Abstract', 'Author Keywords', 'Index Keywords',
       'References', 'Document Type', 'Publication Stage', 'Open Access',
       'Source', 'EID'],
      dtype='object')

In [6]:
scopus_rel_columns = ['Title', 'Year', 'Cited by', 'Abstract', 'Author Keywords', 'Authors', 'Document Type']
scopus_tot = scopus_tot[scopus_rel_columns]

# Wos

In [7]:
wos_1 = pd.read_excel('raw_data/wos_1.xls')
wos_2 = pd.read_excel('raw_data/wos_2.xls')

wos_tot = pd.concat((wos_1, wos_2))
len(wos_tot)

2000

In [8]:
wos_tot.drop_duplicates(inplace=True)
len(wos_tot)

1885

In [9]:
wos_tot.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Text',
       'Cited References', 'Cited Reference Count', 'Times Cited, WoS Core',
       'Times Cited, All Databases', '180 Day Usage Count',
       'Since 2013 Usage Count', 'Publisher', 'Publisher City',
       'Publisher Address', 'ISSN', 'eISSN', 'ISBN', 'Journal Abbreviation',
       'Journal ISO Abbreviation', 'Publication Date', 'Publication Year',
       'Volume', 'Issue', 'Part Number', 'Supplement', 

In [10]:
wos_rel_columns = ['Article Title', 'Author Keywords', 'Abstract', 'Times Cited, All Databases', 'Publication Year', 'Authors', 'Document Type']
wos_tot = wos_tot[wos_rel_columns]

In [11]:
wos_tot.iloc[0]['Authors'], scopus_tot.iloc[0]['Authors']

('Mukherjee, S; Dalmia, S; Mutnury, B; Swaminathan, M',
 'Gu Y., Wang Y., Wang M., Pan Z., Hu Z., Liu Z., Shi F., Dong M.')

In [12]:
# convert authors name as in Scopus
def conv_auth_names(x):
    if type(x) != str:
        return x
    authors = x.split(';')
    res = ''
    for i, el in enumerate(authors):
        # horrible, but we must avoid names without initials
        if ',' not in el:
            el += ','
        surn, inits = el.split(',')
        inits = inits[1:]   # remove initial space
        new_inits = ''.join([c + '.' for c in inits])
        res += f'{surn} {new_inits}'
        if i != len(authors) - 1:
            res += ','  # additional space not needed 
    return res

wos_tot['Authors'] = wos_tot['Authors'].map(lambda x: conv_auth_names(x))

In [13]:
# scopus rel columns ['Title', 'Year', 'Cited by', 'Abstract', 'Author Keywords', 'Authors', 'Document Type']
wos_tot.rename(columns={'Article Title':'Title', 'Times Cited, All Databases':'Cited by', 'Publication Year':'Year'}, inplace=True)

wos_tot = wos_tot[scopus_rel_columns]
wos_tot.head()

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type
0,Layout-level synthesis of RF bandpass filter o...,2004.0,1,A fast and accurate layout-level synthesis tec...,,"Mukherjee S., Dalmia S., Mutnury B., Swaminath...",Proceedings Paper
1,Architectural issues in multi-channel informat...,2004.0,0,"Using mobile phones, palms, and other fancy wi...",multi-channel; adaptive; quality of services,"Adorni M., Arcelli F., Raibulet C., Tisato F.",Proceedings Paper
2,E-activities via wireless network: A case at c...,2004.0,0,"Reasonably, the rapid growth of telecommunicat...",e-activities; wireless network,Sirirangsi P.,Proceedings Paper
3,WIMAX between reality and fiction,2004.0,0,Broadband wireless access has assumed an incre...,,Esmat B.,Proceedings Paper
4,An airport network for mobiles surveillance,2004.0,1,Due to the continuous increase of airport traf...,wi-fi; TETRA; geographical information systems...,"Casaca A., Presutto F., Rebelo I., Pestana G.,...",Proceedings Paper


# IEEE

In [14]:
ieee = pd.read_csv('raw_data/IEEE.csv')

ieee.head()

Unnamed: 0,Document Title,Authors,Author Affiliations,Publication Title,Date Added To Xplore,Publication Year,Volume,Issue,Start Page,End Page,...,Mesh_Terms,Article Citation Count,Patent Citation Count,Reference Count,License,Online Date,Issue Date,Meeting Date,Publisher,Document Identifier
0,Wi-Fi Based Indoor Positioning System with Usi...,S. Gïney; A. Erdoğan; M. Aktaş; M. Ergün,"Başkent University,Department of Electrical an...",2020 43rd International Conference on Telecomm...,11 Aug 2020,2020,,,225,228,...,,2.0,,12.0,,11 Aug 2020,,,IEEE,IEEE Conferences
1,Machine Learning for the Estimation of WiFi Fi...,G. Tognola; D. Plets; E. Chiaramello; S. Gallu...,"National Research Council, Institute of Electr...",2021 XXXIVth General Assembly and Scientific S...,14 Oct 2021,2021,,,1,4,...,,1.0,,16.0,,14 Oct 2021,,,IEEE,IEEE Conferences
2,Multi-Detector Deep Neural Network for High Ac...,C. -Y. Chen; A. I. -C. Lai; R. -B. Wu,"National Taiwan University,Dept. Electrical En...",2021 IEEE Topical Conference on Wireless Senso...,27 Apr 2021,2021,,,37,39,...,,1.0,,10.0,,27 Apr 2021,,,IEEE,IEEE Conferences
3,Wi-Fi intrusion detection using weighted-featu...,M. E. Aminanto; H. C. Tanuwidjaja; P. D. Yoo; ...,"School of Computing, KAIST, Daejeon, South Kor...",2017 International Workshop on Big Data and In...,1 Feb 2018,2017,,,99,104,...,,3.0,,25.0,,1 Feb 2018,,,IEEE,IEEE Conferences
4,A Neural Network Based Handover for Multi-RAT ...,M. Rihani; M. Mroue; J. Prevotct; F. Nouvel; Y...,"IETR-INSA, Rennes, France; Faculty of Engineer...",2018 13th International Symposium on Reconfigu...,30 Aug 2018,2018,,,1,6,...,,1.0,,12.0,,30 Aug 2018,,,IEEE,IEEE Conferences


In [15]:
len(ieee)

1355

In [16]:
ieee['Document Identifier'].value_counts()

IEEE Conferences                   1028
IEEE Journals                       259
IEEE Early Access Articles           30
IEEE Magazines                       25
Wiley-IEEE Press eBook Chapters       5
VDE Conferences                       3
IET Conferences                       2
KICS Journals                         1
PTP Journals                          1
TUP Journals                          1
Name: Document Identifier, dtype: int64

In [17]:
ieee.columns

Index(['Document Title', 'Authors', 'Author Affiliations', 'Publication Title',
       'Date Added To Xplore', 'Publication Year', 'Volume', 'Issue',
       'Start Page', 'End Page', 'Abstract', 'ISSN', 'ISBNs', 'DOI',
       'Funding Information', 'PDF Link', 'Author Keywords', 'IEEE Terms',
       'INSPEC Controlled Terms', 'INSPEC Non-Controlled Terms', 'Mesh_Terms',
       'Article Citation Count', 'Patent Citation Count', 'Reference Count',
       'License', 'Online Date', 'Issue Date', 'Meeting Date', 'Publisher',
       'Document Identifier'],
      dtype='object')

In [18]:
ieee.iloc[0]['Authors'], scopus_tot.iloc[0]['Authors']

('S. Gïney; A. Erdoğan; M. Aktaş; M. Ergün',
 'Gu Y., Wang Y., Wang M., Pan Z., Hu Z., Liu Z., Shi F., Dong M.')

In [19]:
# convert authors name as in Scopus
def conv_auth_names_scop(x):
    if type(x) != str:
        return x
    authors = x.split(';')
    res = ''
    for i, el in enumerate(authors):
        last_space_idx = el.rfind(' ')
        if i == 0:
            new_auth = f'{el[last_space_idx:]} {el[:last_space_idx]}'   # there is no space before initials
        else:
            new_auth = el[last_space_idx:] + el[:last_space_idx]
        res += new_auth
        if i != len(authors) - 1:
            res += ','  # additional space not needed 
    return res

conv_auth_names_scop(ieee.iloc[0]['Authors'])

' Gïney S., Erdoğan A., Aktaş M., Ergün M.'

In [20]:
ieee['Authors'] = ieee['Authors'].map(lambda x: conv_auth_names(x))

In [21]:
# scopus rel columns ['Title', 'Year', 'Cited by', 'Abstract', 'Author Keywords', 'Authors', 'Document Type']
ieee.rename(columns={'Document Title':'Title', 'Publication Year':'Year', 'Article Citation Count':'Cited by', 'Document Identifier':'Document Type'}, inplace=True)

ieee = ieee[scopus_rel_columns]
ieee.head()

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type
0,Wi-Fi Based Indoor Positioning System with Usi...,2020,2.0,Indoor positioning is one of the major challen...,Indoor Positioning;Deep Neural Networks;Classi...,"S. Gïney , A. Erdoğan , M. Aktaş , M. Ergün",IEEE Conferences
1,Machine Learning for the Estimation of WiFi Fi...,2021,1.0,This paper presents the preliminary results on...,,"G. Tognola , D. Plets , E. Chiaramello , S. Ga...",IEEE Conferences
2,Multi-Detector Deep Neural Network for High Ac...,2021,1.0,A Deep Neural Network (DNN)-based positioning ...,machine learning;deep neural network;indoor na...,"C. -Y. Chen , A. I. -C. Lai , R. -B. Wu",IEEE Conferences
3,Wi-Fi intrusion detection using weighted-featu...,2017,3.0,Feature learning plays an important role in im...,Intrusion detection system;Wi-Fi network;featu...,"M. E. Aminanto , H. C. Tanuwidjaja , P. D. Yoo...",IEEE Conferences
4,A Neural Network Based Handover for Multi-RAT ...,2018,1.0,The wireless communication networks continue t...,Heterogeneous Networks;Reconfigurable Architec...,"M. Rihani , M. Mroue , J. Prevotct , F. Nouvel...",IEEE Conferences


# Merge DFs

In [22]:
scopus_tot.head()

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type
0,Secure User Authentication Leveraging Keystrok...,2022,,User authentication plays a critical role in a...,Behavioral features; channel state information...,"Gu Y., Wang Y., Wang M., Pan Z., Hu Z., Liu Z....",Article
1,Data-driven based HVAC optimisation approaches...,2022,1.0,"Improving the energy efficiency of Heating, Ve...",Air conditioning (HVAC) systems; Artificial in...,"Ala'raj M., Radi M., Abbod M.F., Majdalawieh M...",Review
2,Enabling efficient WiFi-based occupant behavio...,2022,,To launch energy-efficient and occupant-centri...,Deep learning; Internet of things; Occupant be...,"Zhou Q., Yang Q., Xing J.",Article
3,I-WKNN: Fast-speed and high-accuracy WIFI posi...,2022,,Based on various existing wireless fingerprint...,AP selection; Asymmetric Gaussian filtering; I...,"Zhao Z., Lou Z., Wang R., Li Q., Xu X.",Article
4,Using WiFi connection counts and camera-based ...,2022,1.0,Accurate occupancy information can help in opt...,Building occupancy counts; Day-ahead occupancy...,"Alishahi N., Ouf M.M., Nik-Bakht M.",Article


In [23]:
df = pd.concat((scopus_tot, wos_tot, ieee))
df['Title'] = df['Title'].map(lambda x: x.lower())

len(df)

6227

In [24]:
df.drop_duplicates(subset='Title', inplace=True)
len(df)

3609

# Drop Conferences and similar

In [25]:
df.sort_values(by='Title')

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type
1093,(relbt): a reinforcement learning-enabled list...,2020.0,12.0,The emergence of Internet of Things (IoT) has ...,Listen before talk (LBT); LTE-LAA; LTE-LAA WiF...,"Ali R., Kim B., Kim S.W., Kim H.S., Ishmanov F.",Article
1669,10th eai international conference on broadband...,2019.0,,The proceedings contain 19 papers. The special...,,[No author name available],Conference Review
1682,10th eai international conference on mobile co...,2019.0,,The proceedings contain 17 papers. The special...,,[No author name available],Conference Review
1756,10th international conference on ad hoc networ...,2019.0,,The proceedings contain 27 papers. The special...,,[No author name available],Conference Review
888,"11th international conference on security, pri...",2018.0,,The proceedings contain 45 papers. The special...,,[No author name available],Conference Review
...,...,...,...,...,...,...,...
1810,zero-cost and map-free shop-level localization...,2018.0,4.0,"In recent years, with the rapid development of...",Crowdsourcing fingerprints; Ensemble learning;...,"Wei J., Zhou X., Zhao F., Luo H., Ye L.",Conference Paper
318,zifind: exploiting cross-technology interferen...,2013.0,59.0,Indoor localization becomes increasingly impor...,,"Gao Y., Niu J., Zhou R., Xing G.",Conference Paper
507,zigbee home automation localization system,2016.0,,"In this paper, a localization system of mobile...",Artificial neural network; RF fingerprint; RSS...,"Rillo H., Marco Á., Blasco R., Casas R.",Conference Paper
58,zigbee wireless network application research c...,2009.0,0.0,Using Zigbee techniques as a back bone to deve...,Zigbee; U-campus; Wireless net work,Huang C.P.,Proceedings Paper


In [26]:
def check_conference(title):
    names = ['conference', 'workshop', 'symposium', 'meeting', 'forum']
    for el in names:
        if el in title:
            return True
    return False

idxs = [not check_conference(title) for title in df['Title']]

df = df.loc[idxs]
len(df)

3449

In [27]:
df[['Title', 'Abstract', 'Author Keywords']] = df[['Title', 'Abstract', 'Author Keywords']].fillna('')
df['Cited by'] = df['Cited by'].fillna(0)
df['text'] = df['Title'] + ' ' + df['Abstract'] + ' ' + df['Author Keywords']

In [28]:
df.head()

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type,text
0,secure user authentication leveraging keystrok...,2022.0,0.0,User authentication plays a critical role in a...,Behavioral features; channel state information...,"Gu Y., Wang Y., Wang M., Pan Z., Hu Z., Liu Z....",Article,secure user authentication leveraging keystrok...
1,data-driven based hvac optimisation approaches...,2022.0,1.0,"Improving the energy efficiency of Heating, Ve...",Air conditioning (HVAC) systems; Artificial in...,"Ala'raj M., Radi M., Abbod M.F., Majdalawieh M...",Review,data-driven based hvac optimisation approaches...
2,enabling efficient wifi-based occupant behavio...,2022.0,0.0,To launch energy-efficient and occupant-centri...,Deep learning; Internet of things; Occupant be...,"Zhou Q., Yang Q., Xing J.",Article,enabling efficient wifi-based occupant behavio...
3,i-wknn: fast-speed and high-accuracy wifi posi...,2022.0,0.0,Based on various existing wireless fingerprint...,AP selection; Asymmetric Gaussian filtering; I...,"Zhao Z., Lou Z., Wang R., Li Q., Xu X.",Article,i-wknn: fast-speed and high-accuracy wifi posi...
4,using wifi connection counts and camera-based ...,2022.0,1.0,Accurate occupancy information can help in opt...,Building occupancy counts; Day-ahead occupancy...,"Alishahi N., Ouf M.M., Nik-Bakht M.",Article,using wifi connection counts and camera-based ...


# Preprocess the text

In [29]:
config = {
    'remove_punct' : True,
    'remove_num' : True,
    'remove_stopwords' : True,
    'custom_stopwords' : ["proceeding", "proceedings", "proceed", "learn", "learning", "technique", "paper", "papers", "study", "conference", "analysis", "research", "wi", "fi", "wifi", "wi-fi", "ieee", "performance", "©", "all", "rights", "right", "reserve", "reserved", "method", "datum", "data", "network", "networks", "topic", "contain", "wireless", "propose", "system", "base", "network", "declaration", "interest"],
    'lemmatize' : True
}

In [30]:
import spacy

nlp = spacy.load('en_core_web_sm')

def preprocess_txt(text):
    text = text.lower()
    doc = nlp(text)
    if config['remove_punct']:
        doc = [token for token in doc if not token.is_punct]
    if config['remove_num']:
        doc = [token for token in doc if not token.is_digit]
    if config['remove_stopwords']:
        doc = [token for token in doc if not token.is_stop and token.text not in config['custom_stopwords']]
    if config['lemmatize']:
        doc = [token.lemma_ for token in doc]   # .lemma_ is a string
    
    result = ''
    for text in doc:
        result += text + ' '
    
    return result.strip()

  from .autonotebook import tqdm as notebook_tqdm


In [31]:
df['text'] = df['text'].apply(lambda text: preprocess_txt(text))

In [32]:
df

Unnamed: 0,Title,Year,Cited by,Abstract,Author Keywords,Authors,Document Type,text
0,secure user authentication leveraging keystrok...,2022.0,0.0,User authentication plays a critical role in a...,Behavioral features; channel state information...,"Gu Y., Wang Y., Wang M., Pan Z., Hu Z., Liu Z....",Article,secure user authentication leverage keystroke ...
1,data-driven based hvac optimisation approaches...,2022.0,1.0,"Improving the energy efficiency of Heating, Ve...",Air conditioning (HVAC) systems; Artificial in...,"Ala'raj M., Radi M., Abbod M.F., Majdalawieh M...",Review,drive base hvac optimisation approach systemat...
2,enabling efficient wifi-based occupant behavio...,2022.0,0.0,To launch energy-efficient and occupant-centri...,Deep learning; Internet of things; Occupant be...,"Zhou Q., Yang Q., Xing J.",Article,enable efficient base occupant behavior recogn...
3,i-wknn: fast-speed and high-accuracy wifi posi...,2022.0,0.0,Based on various existing wireless fingerprint...,AP selection; Asymmetric Gaussian filtering; I...,"Zhao Z., Lou Z., Wang R., Li Q., Xu X.",Article,wknn fast speed high accuracy positioning inte...
4,using wifi connection counts and camera-based ...,2022.0,1.0,Accurate occupancy information can help in opt...,Building occupancy counts; Day-ahead occupancy...,"Alishahi N., Ouf M.M., Nik-Bakht M.",Article,connection count camera base occupancy count e...
...,...,...,...,...,...,...,...,...
1340,table of contents,2016.0,0.0,The following topics are dealt with: power-awa...,,,IEEE Conferences,table content follow topic deal power aware st...
1341,[front cover],2007.0,0.0,The following topics are dealt with: image ind...,,,IEEE Conferences,cover follow topic deal image indexing content...
1345,contents,2015.0,0.0,The following topics are dealt with: language ...,,,IEEE Conferences,content follow topic deal language processing ...
1348,[front and back cover],2012.0,0.0,The following topics are dealt with: medical i...,,,IEEE Conferences,cover follow topic deal medical image watermar...


# Save the result

In [33]:
df.to_csv('ML_WIFI_preprocessed.csv', index=False)