### Imports

In [44]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import re
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Accessing Paper Titles

Unlabelled data

In [24]:
# Define the path to OAuth2 credentials JSON file
creds_path = r'C:\Users\cathe\practical-data-science-tutorial\src\data\credentials.json'

# Define the name of Google Sheet
google_sheet_name = 'Draft-dataset'

# Define the name of the specific sheet within the Google Sheet
specific_sheet_names = ['Health and medical sciences', 'Social sciences', 'Business, economics and management']

# Initialize the Google Sheets client
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(creds_path, scope)
client = gspread.authorize(creds)

papers_unlabelled = []

for specific_sheet_name in specific_sheet_names:
    # Open the Google Sheet
    sheet = client.open(google_sheet_name).worksheet(specific_sheet_name)

    # Get list of article names from the sheet
    papers = sheet.col_values(3)[1:]

    # Add all article names to a single list
    papers_unlabelled.extend(papers)

In [25]:
papers_unlabelled[:3]

['The psychological impact of quarantine and how to reduce it: rapid review of the evidence',
 'Global, regional, and national incidence, prevalence, and years lived with disability for 354 diseases and injuries for 195 countries and territories, 1990–2017: a systematic analysis for the Global Burden of Disease Study 2017',
 'A novel coronavirus outbreak of global health concern']

Labelled data

In [26]:
# Define the name of Google Sheet
google_sheet_name = 'Draft-dataset'

# Define the name of the specific sheet within the Google Sheet
specific_sheet_name = 'Copy of finaldataset'

# Initialize the Google Sheets client
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
creds = ServiceAccountCredentials.from_json_keyfile_name(creds_path, scope)
client = gspread.authorize(creds)

# Open the Google Sheet
sheet = client.open(google_sheet_name).worksheet(specific_sheet_name)

# Get list of article names & labels from the sheet
papers_labelled = sheet.col_values(3)[1:]
labels = sheet.col_values(15)[1:]

In [55]:
labels = [int(i) for i in labels]

In [56]:
papers_labelled[:3], labels[:3]

(['COVID-19: the gendered impacts of the outbreak',
  'COVID-19: towards controlling of a pandemic',
  'Prevention and treatment of low back pain: evidence, challenges, and promising directions'],
 [3, 1, 3])

### Preprocessing

In [57]:
# Download NLTK data for stopwords and lemmatisation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Define stopwords to remove
stop_words = stopwords.words('english')
stop_words.extend(['covid19', 'sarscov2'])

# Initialize lemmatiser
lemmatizer = WordNetLemmatizer()

# Map POS tags to WordNet tags for lemmatisation
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    else:
        return None

def preprocess_title(title):
    # Remove non-alphanumeric characters
    title_alpha = re.sub(r'[^a-zA-Z0-9\s]', '', title)

    # Convert to lowercase
    title_lower = title_alpha.lower()

    # Break down into individual words
    words = word_tokenize(title_lower)

    # Remove stopwords
    # words = [word for word in words if word not in stop_words]

    # Part-of-Speech (POS) tagging
    tagged_words = pos_tag(words)

    # Remove words that are nouns (NN, NNS, NNP, NNPS)
    non_noun_words = [word for word, pos in tagged_words if not pos.startswith('N')]

    # Lemmatisation of words & removal of words that aren't adjectives/verbs/adverbs
    lemmatised_words = [lemmatizer.lemmatize(word) for word in non_noun_words]

    return lemmatised_words

# Apply preprocessing to all titles
papers_labelled_processed = [preprocess_title(title) for title in papers_labelled]
papers_unlabelled_processed = [preprocess_title(title) for title in papers_unlabelled]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cathe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [58]:
papers_labelled_processed[:3]

[['covid19', 'the', 'gendered', 'of', 'the'],
 ['controlling', 'of', 'a', 'pandemic'],
 ['and', 'of', 'low', 'back', 'and', 'promising']]

In [59]:
papers_unlabelled_processed[:3]

[['the',
  'psychological',
  'of',
  'and',
  'how',
  'to',
  'reduce',
  'it',
  'rapid',
  'of',
  'the'],
 ['global',
  'regional',
  'and',
  'national',
  'and',
  'lived',
  'with',
  'for',
  '354',
  'and',
  'for',
  '195',
  'and',
  '19902017',
  'a',
  'systematic',
  'for',
  'the',
  'global',
  'of',
  '2017'],
 ['a', 'novel', 'of', 'global']]

### Bag of Words Representation

In [60]:
# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer on preprocessed titles and transform them into a BoW representation
labelled_bow = vectorizer.fit_transform([" ".join(title) for title in papers_labelled_processed])

In [61]:
labelled_bow

<100x288 sparse matrix of type '<class 'numpy.int64'>'
	with 623 stored elements in Compressed Sparse Row format>

In [62]:
# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer on preprocessed titles and transform them into a BoW representation
unlabelled_bow = vectorizer.fit_transform([" ".join(title) for title in papers_unlabelled_processed])

In [63]:
unlabelled_bow

<1242x1815 sparse matrix of type '<class 'numpy.int64'>'
	with 7843 stored elements in Compressed Sparse Row format>

### Random Forest

In [66]:
# Split the dataset into a training set and a validation set
X_train, X_val, y_train, y_val = train_test_split(labelled_bow, labels, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = rf_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {accuracy}")

# Predict labels for unlabelled data
Predicted_scores = rf_model.predict(unlabelled_bow)


Validation accuracy: 0.55
