In [1]:
import pandas as pd
dr = pd.read_pickle('data/covid19_reference_count.pickle')
title = pd.read_pickle('data/covid19_title.pickle')

In [2]:
# pick up reference count more than 30 times
title30 = title[dr>=30]
title30.keys

<bound method Series.keys of 1        [Population-Based Study of the Influence of th...
13       [Immunophenotyping of COVID-19 and influenza h...
18       [Incidental COVID-19 related lung apical findi...
32       [Studies of Novel Coronavirus Disease 19 (COVI...
42       [Infodemiological study on COVID-19 epidemic a...
58       [Clinical Features of COVID-19 in Patients Wit...
67       [Coping with COVID-19: Exposure to COVID-19 an...
95       [Characteristics associated with hospitalisati...
104      [The Need for the Right Socio-Economic and Cul...
109      [COVID-19, Australia: Epidemiology Report 7: R...
114      [Detection dogs as a help in the detection of ...
117      [COVID-19, Australia: Epidemiology Report 3: R...
125      [COVID-19, Australia: Epidemiology Report 19 (...
141                  [A scientometric overview of CORD-19]
150      [COVID-19: to be or not to be; that is the dia...
170                       [Emerging Neurology of COVID-19]
171          [COVID-19 and 

## Text pre-processing - data cleaning part 1

* Make text all lower case
* Remove punctuation
* Remove numerical values
* Remove common non-sensical text (/n)

In [3]:
data_text = title30.astype('str')

In [4]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [5]:
# Let's take a look at the updated text
clean_data1 = data_text.apply(round1)

In [6]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [7]:
# Let's take a look at the updated text
clean_data2 = clean_data1.apply(round2)

## Organizing the data

We will organise data into two standard text format:

1. Corpus - a collection of text
2. Document-Term Matrix - words counts in matrix format

### Corpus

We already created a corpus in an earlier step. The definition of a corpus is a collection of texts, and they are all put together neatly in a pandas series here.

In [11]:
# Let's pickle it for later use
data_text.to_pickle("data/title30_corpus.pkl")

### Document-Term Matrix

We need to tokenize text, meaning break down into smaller pieces. The most common tokenization technique is to break down text into words. We can do this using scikit-learn's CountVectorizer, where every row will represent a different document and every column will represent a different word.

In addition, with CountVectorizer, we can remove stop words. Stop words are common words that add no additional meaning to text such as 'a', 'the', etc.

In [16]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(clean_data2)
data_cv

<4720x7786 sparse matrix of type '<class 'numpy.int64'>'
	with 37280 stored elements in Compressed Sparse Row format>

In [20]:
# convert to a numerical array
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = clean_data2.index
# Let's pickle it for later use
data_dtm.to_pickle("data/title30_dtm.pkl")

In [21]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
clean_data2.to_pickle('data/title30_clean_data2.pkl')
pickle.dump(cv, open("data/title30_cv.pkl", "wb"))