# Data analysis and data cleaning
The data set is checked for completeness and cleaned.

In [1]:
# imports
import pandas as pd
import regex as re

In [2]:
# loads the article data set into a dataframe; the file's unnamed first column
# ends up in the frame under the name 'Unnamed: 0'
df = pd.read_csv(filepath_or_buffer='fulltext_articles.csv')

In [12]:
# previews the first five rows of the dataframe
df.head(n=5)

Unnamed: 0,url,title,abstract,fulltext
0,https://www.microbiologyresearch.org/content/j...,"Aliidiomarina halalkaliphila sp. nov., a haloa...",A haloalkaliphilic strain (IM 1326T) was isola...,"ANI, average nucleotide identity; ANIb, averag..."
1,https://www.microbiologyresearch.org/content/j...,"Pseudomonas germanica sp. nov., isolated from ...",Through bacterial plant–endophyte extraction f...,
2,https://www.microbiologyresearch.org/content/j...,"Halomonas antri sp. nov., a carotenoid-produci...","A Gram-negative, moderately halophilic bacteri...",
3,https://www.microbiologyresearch.org/content/j...,"Mameliella sediminis sp. nov., a novel polyhyd...","A Gram-stain-negative, strictly aerobic, non-m...","ANI, average nucleotide identity; dDDH, digita..."
4,https://www.microbiologyresearch.org/content/j...,"Geobacter benzoatilyticus sp. nov., a novel be...","A strictly anaerobic bacterial strain, designa...",


## Dataset analysis

In [13]:
# row count of the dataframe
df.shape[0]

17006

In [14]:
# counts duplicate rows based on the article content only.
# The 'Unnamed: 0' column appears to be a running row number, so with it included
# no two rows can ever compare equal and the check would always report 0.
# errors='ignore' keeps the cell working if that column is not present.
print(df.drop(columns='Unnamed: 0', errors='ignore').duplicated().sum())

0


In [15]:
# counts the missing (NaN) entries of each column
url_NaN, title_NaN, abstract_NaN, fulltext_NaN = (
    df[column].isna().sum()
    for column in ('url', 'title', 'abstract', 'fulltext')
)

In [16]:
# prints the NaN counts for url, title, abstract and fulltext, one per line.
# sep='\n' replaces the explicit '\n' arguments, which combined with the default
# space separator left stray spaces around every line break
print(url_NaN, title_NaN, abstract_NaN, fulltext_NaN, sep='\n')

0 
 5 
 1680 
 2375


In [17]:
# number of rows that actually hold a value, printed per column
# (order: url, title, abstract, fulltext)
n_rows = len(df)
for missing in (url_NaN, title_NaN, abstract_NaN, fulltext_NaN):
    print(n_rows - missing)

17006
17001
15326
14631


In [18]:
# share of missing values per column: NaN count divided by the total row count
total_rows = len(df)
url_NaN_ratio = url_NaN / total_rows
title_NaN_ratio = title_NaN / total_rows
abstract_NaN_ratio = abstract_NaN / total_rows
fulltext_NaN_ratio = fulltext_NaN / total_rows

In [19]:
# prints the NaN ratios for url, title, abstract and fulltext, one per line.
# sep='\n' replaces the explicit '\n' arguments, which combined with the default
# space separator left stray spaces around every line break
print(url_NaN_ratio, title_NaN_ratio, abstract_NaN_ratio, fulltext_NaN_ratio, sep='\n')

0.0 
 0.00029401387745501585 
 0.09878866282488534 
 0.13965659179113254


## Dataset cleaning

In [21]:
# removes citation brackets: every whitespace character followed by a span that
# opens with '(' or '[' and runs to the nearest ')' or ']' on the same line is
# deleted (this also hits bracketed text that is not a citation).
# Missing fulltexts are passed through unchanged; calling str() on them would
# turn NaN into the literal text 'nan' and hide the fact that the value is missing.
bracket_pattern = r'\s[\(\[].*?[\)\]]'
df['fulltext'] = [
    text if pd.isna(text) else re.sub(bracket_pattern, '', str(text))
    for text in df['fulltext']
]

In [22]:
# removes dirty data like '.,,,' from the text: a period followed by two or more
# commas becomes '. '.
# The pattern is a raw string so that '\.' reaches the regex engine as written
# instead of being an invalid string escape (a SyntaxWarning on Python 3.12+).
df['fulltext'] = df['fulltext'].replace(to_replace=r'\.,{2,}', value='. ', regex=True)

In [23]:
# repairs '.,' that is directly followed by a word character: the comma is
# replaced by a space and the character is kept, e.g. 'end.,Next' -> 'end. Next'.
# The pattern is a raw string so that '\.' and '\w' reach the regex engine as
# written instead of being invalid string escapes (a SyntaxWarning on Python 3.12+).
df['fulltext'] = df['fulltext'].replace(to_replace=r'\.,(\w)', value=r'. \1', regex=True)

In [24]:
# writes the cleaned dataframe to a new csv file, without the dataframe index
df.to_csv(path_or_buf='cleaned_fulltext_articles.csv', index=False)