In [2]:
import pandas as pd
import gensim
import fitz
import os
from pdfminer.high_level import extract_text
from tqdm import tqdm
import numpy as np
import string
import copy
import regex as re
import unstructured
from unstructured.cleaners.core import replace_unicode_quotes, clean_non_ascii_chars
import enchant
from TxtProcessing2 import TextPreprocessing as tp
from TxtProcessing2 import TokenProcessor as tkp

# Upload the papers & Cleaning steps

The scope of this notebook is to perform several pre-processing steps in order to remove noise from the text that could lead to undesirable results when training Word2Vec. `TextPreprocessing` is a class containing all pre-processing and cleaning functions.

**Cleaning steps:**
1. Maintain only the core text of each paper (i.e. remove all text before 'Abstract' and all text after 'References' or 'Resources').
2. Remove any 'formatting' character.
3. Remove punctuation.
4. Remove numbers (for the purpose of the project we don't need numbers).
5. Remove stopwords.
6. Tokenize and lemmatize text.

One final step is to check whether the tokens correspond to real words in the English language. Three cases are evaluated:
1. When the token is a real word.
2. When the union of two consecutive tokens represents a word.
3. When the split of two or more consecutive tokens represents two or more words.

In [2]:
# Folder handed to TextPreprocessing in the next cell. It is a relative path, so it is
# resolved against the kernel's current working directory.
path = 'Desktop/Università/TESI/training_data'

In [3]:
# Build the TextPreprocessing instance for the papers under `path`.
# The class is imported under its real name here because the top-of-notebook alias
# `tp` is rebound to the instance below; calling `tp(path)` a second time would then
# raise "TypeError: 'TextPreprocessing' object is not callable" when the cell is re-run.
from TxtProcessing2 import TextPreprocessing

tp = TextPreprocessing(path)

In [None]:
# Extract the text of the PDFs. The result is saved below and later reloaded as
# `papers_per_year`, which the rest of the notebook iterates as year -> list of paper texts.
papers = tp.extract_text_from_pdfs()

In [None]:
# Quick look at the container type returned by the extraction step.
type(papers)

In [None]:
# Save the extracted papers to 'papers_per_year.npy'; the load cell further down reads
# this file back, so the extraction does not have to be repeated in every session.
np.save('papers_per_year.npy', papers)

**Load the extracted file.**

In [4]:
# Load the object saved by the extraction step; `.item()` unwraps the single Python
# object stored in the array (used below as a dict keyed by year).
# `allow_pickle` is a boolean flag: the previous value 'FALSE' was a non-empty string,
# which is truthy and therefore enabled pickling while reading as if it disabled it.
# NOTE: unpickling can execute arbitrary code - only load files you created yourself.
papers_per_year = np.load('papers_per_year.npy', allow_pickle=True).item()

In [None]:
# Show the keys of the loaded dict (the loops further down call each key `year`).
papers_per_year.keys()

In [5]:
# Upper-case and letter-spaced spellings of the heading, and the single form
# 'References' passed as their replacement.
# NOTE(review): the matching/replacement rules live in TextPreprocessing.restructure_words,
# which is not visible here - presumably each variant is rewritten to `wordtoreplace`.
wordlist = ['REFERENCES', 'R E F E R E N C E S', 'r e f e r e n c e s']
wordtoreplace = 'References'
# Overwrites `papers_per_year` with the returned value.
papers_per_year = tp.restructure_words(papers_per_year,wordlist, wordtoreplace)

In [6]:
# Same call for the 'Abstract' heading variants.
# NOTE(review): presumably rewrites each variant to `wordtoreplace`; confirm in
# TextPreprocessing.restructure_words.
wordlist = ['ABSTRACT', 'A B S T R A C T', 'a b s t r a c t']
wordtoreplace = 'Abstract'
# From here on the working copy is called `new_papers`.
new_papers = tp.restructure_words(papers_per_year, wordlist, wordtoreplace)

In [7]:
# Same call for the 'Resources' heading variants; 'Resources' is used further down
# as a marker for remove_end_from_txt.
wordlist = ['RESOURCES', 'R E S O U R C E S', 'r e s o u r c e s']
wordtoreplace = 'Resources'
new_papers = tp.restructure_words(new_papers, wordlist, wordtoreplace)

In [None]:
# Same call for the 'Introduction' heading variants.
wordlist = ['INTRODUCTION', 'I N T R O D U C T I O N', 'i n t r o d u c t i o n']
wordtoreplace = 'Introduction'
new_papers = tp.restructure_words(new_papers, wordlist, wordtoreplace)

In [None]:
# Sanity check: number of paper texts in `papers_per_year` that still contain the
# upper-case heading 'REFERENCES'.
count = 0
for year, papers in papers_per_year.items():
    # Each membership test is a bool, so summing them counts the matching texts.
    count += sum('REFERENCES' in text for text in papers)

print(count)

In [None]:
# Keep only the core of each paper, with 'Abstract' and 'References' as the two markers
# (cleaning step 1 in the introduction: drop the text before the first and after the second).
new_papers = tp.extract_core(new_papers, 'Abstract', 'References')

In [None]:
# NOTE(review): presumably cuts each text at the 'Resources' marker and drops what
# follows - confirm in TextPreprocessing.remove_end_from_txt.
new_papers = tp.remove_end_from_txt(new_papers, 'Resources')

In [None]:
# Same helper with 'Copyright' as the marker.
# NOTE(review): presumably drops everything from that marker onwards - confirm in
# TextPreprocessing.remove_end_from_txt.
new_papers = tp.remove_end_from_txt(new_papers, 'Copyright')

In [None]:
# Pass the papers through remove_empty_text. The result is bound to `new_paperss`
# (double 's'), which is the name the following cells use.
new_paperss= tp.remove_empty_text(new_papers)

In [None]:
# NOTE(review): presumably re-joins words that were split during PDF extraction -
# confirm in TextPreprocessing.unite_segmented_words.
new_paperss = tp.unite_segmented_words(new_paperss)

In [None]:
# Regular expressions for publisher boilerplate, passed to tp.comprehensive_cleaning
# together with `replace_list` (how they are applied is defined in TextPreprocessing).
patterns = [
    # Optional form feed and whitespace, then either a "Page N ..." line or a
    # "Copyright © ..." block that runs across a form feed and a page number.
    r'\x0c?\s*(Page\s+\d+.*?\n+|Copyright © [^\x0c]+?\x0c\d+\n\n.+?\n\n)',
    # Whole lines starting with one of three licence/download notices.
    r'\n(?:See the Terms and Conditions[^\n]+|by University Of[^\n]+|OA articles are governed[^\n]+)\n',
    # Corresponding-author footnote ending in "(C. Gao).".
    r'(?:\* Corresponding author\..+?\(C\. Gao\)\.)',
    # "dddd-d.../$ ... ltd." identifier line, or an "E-mail address: ... (C. Gao)." line.
    r'((?:[0-9]{4}-[0-9]+\/\$.*?\sltd\.)|(E-mail address:.+?\(C\. Gao\)\.))',
    # Publisher/journal names followed by whitespace, digits or dashes.
    r'\b(Wiley|ERP Environment|Sust Dev)\b[\s\d–-]+',
    # "Downloaded from ..." up to (not including) the terms-and-conditions notice.
    r'Downloaded from .+?(?=See the Terms and Conditions)',
    # The terms-and-conditions notice itself.
    r'See the Terms and Conditions on .+? for rules of use',
    # "e chapter" / "i chapter". The class was `[e|i]`, which also matched a literal '|'
    # because '|' is not an alternation operator inside a character class.
    r'\b[ei] chapter\b',
    # "et al." in any case. The previous trailing `\b` required a word character right
    # after the full stop, so "et al. " and "et al.," were never matched.
    r'(?i)\bet\s+al\.',
]

In [None]:
# Literal strings passed to tp.comprehensive_cleaning: layout characters (newline, tab,
# form feed), typographic quotes, the copyright sign, an em dash, and the fragments
# 'et. al', 'cid', 'doi' and 'DOI'.
# NOTE(review): 'cid' and 'doi' also occur inside ordinary words (e.g. 'decide'); if the
# cleaner does plain substring replacement those words are altered too - confirm.
replace_list = ['\n', '\t', '\x0c','‘','’', '“', '”', '©', 'et. al', 'cid', 'doi', 'DOI', '—']

In [None]:
# Run the combined cleaning with the literal strings in `replace_list` and the regexes in
# `patterns`; the result is iterated below as year -> list of paper texts.
new_paper = tp.comprehensive_cleaning(new_paperss, replace_list, patterns)

In [None]:
# Remove any remaining 'cid' fragments from every paper text.
# The loop now iterates over items() so that `year` is the key of the list being edited;
# previously it iterated over values() and wrote to `new_paper[year]` with whatever
# `year` an earlier cell had left behind, i.e. into the wrong year's list.
# NOTE(review): this is a plain substring replacement, so ordinary words containing
# 'cid' (e.g. 'decide') are shortened as well - confirm this is acceptable.
for year, liste in new_paper.items():
    for i, lista in enumerate(liste):
        if 'cid' in lista:
            new_paper[year][i] = lista.replace('cid', '')

In [None]:
# Sanity check: number of paper texts that still contain the substring 'cid'.
count = sum(
    'cid' in lista
    for liste in new_paper.values()
    for lista in liste
)
print(count)

In [None]:
# Copy the cleaned papers into `pre_tokens`, splitting the fused word 'circulareconomy'
# back into 'circular economy'. Years without any paper get no entry.
pre_tokens = {}

for year, paper in new_paper.items():
    # str.replace returns the text unchanged when the fused word is absent.
    repaired = [txt.replace('circulareconomy', 'circular economy') for txt in paper]
    if repaired:
        pre_tokens[year] = repaired

In [None]:
# Re-join the word 'circular' where it was split into the two tokens 'cir' + 'cular'.
# The previous `word[j:j+1] = 'circular'` assigned a string to a slice, which spliced
# its individual characters into the list and left 'cular' in place. The token list is
# rebuilt instead, so merging never shifts the indices still to be visited.
for year, paper in pre_tokens.items():
    for i, txt in enumerate(paper):
        word = txt.split()
        merged = []
        j = 0
        while j < len(word):
            if word[j] == 'cir' and j + 1 < len(word) and word[j + 1] == 'cular':
                merged.append('circular')
                j += 2  # skip the fragment that was just absorbed
            else:
                merged.append(word[j])
                j += 1

        # As before, the text is written back with single spaces between tokens.
        pre_tokens[year][i] = ' '.join(merged)

In [None]:
# Re-join the word 'sustainable' where it was split into the tokens 'sus' + 'tainable'.
# The previous `word[j:j+1] = 'sustainable'` assigned a string to a slice, which spliced
# its individual characters into the list and left 'tainable' in place. The token list
# is rebuilt instead, so merging never shifts the indices still to be visited.
for year, paper in pre_tokens.items():
    for i, txt in enumerate(paper):
        word = txt.split()
        merged = []
        j = 0
        while j < len(word):
            if word[j] == 'sus' and j + 1 < len(word) and word[j + 1] == 'tainable':
                merged.append('sustainable')
                j += 2  # skip the fragment that was just absorbed
            else:
                merged.append(word[j])
                j += 1
        # As before, the text is written back with single spaces between tokens.
        pre_tokens[year][i] = ' '.join(merged)

In [None]:
# Tokenise and lemmatise the texts (mode 'lemma'); the result is iterated further down
# as year -> list of token lists.
token_text=tp.tokenize_and_lemmatize(pre_tokens, 'lemma')

In [None]:
# Lower-case English month names ('mesi' is Italian for 'months'), passed to
# TokenProcessor in the next cell.
mesi = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']

In [None]:
# Run TokenProcessor over the tokens with the month names as second argument.
# NOTE(review): presumably this is the real-word check described in the introduction,
# with the month names excluded - confirm in TxtProcessing2.TokenProcessor.
processor = tkp(token_text, mesi)
# 'nuovo' is Italian for 'new': the processed tokens, still keyed by year.
nuovo_token_text = processor.process_tokens()

In [None]:
# Keep only tokens of three or more characters in every paper's token list.
for year, token_lists in nuovo_token_text.items():
    trimmed = []
    for sub_list in token_lists:
        trimmed.append([word for word in sub_list if not len(word) < 3])
    # Rebinding an existing key does not change the dict's size, so this is safe
    # while iterating over items().
    nuovo_token_text[year] = trimmed

In [None]:
# Remove hyphens from every token.
# Stripping can shrink a token below the three-character minimum enforced in the
# previous cell (e.g. 'a-b' -> 'ab', '---' -> ''), so the minimum is re-applied here;
# otherwise empty and two-letter tokens would reach the training data.
for year, token_lists in nuovo_token_text.items():
    cleaned = []
    for sub_list in token_lists:
        dehyphenated = (word.replace('-', '') for word in sub_list)
        cleaned.append([word for word in dehyphenated if len(word) >= 3])
    nuovo_token_text[year] = cleaned

In [None]:
# Sanity check: number of papers that still have at least one token containing '-'.
# The previous test `'-' in lista` was a list-membership test, i.e. it only looked for a
# token that is exactly '-', so hyphens inside longer tokens were never detected.
count = 0
for year, liste in nuovo_token_text.items():
    for lista in liste:
        if any('-' in word for word in lista):
            count += 1
print(count)

In [None]:
# Sanity check: print (paper index, token) for every processed token shorter than
# three characters.
for year, papers in nuovo_token_text.items():
    for i, lista in enumerate(papers):
        for short_word in (word for word in lista if len(word) < 3):
            print(i, short_word)

In [None]:
# Sanity check: number of processed tokens that still contain the fused word
# 'circulareconomy'.
count = sum(
    'circulareconomy' in word
    for papers in nuovo_token_text.values()
    for lista in papers
    for word in lista
)
print(count)

In [None]:
# Same check on the tokens as they came out of tokenize_and_lemmatize: print
# (paper index, token) for every token shorter than three characters.
for year, papers in token_text.items():
    for i, lista in enumerate(papers):
        for short_word in (word for word in lista if len(word) < 3):
            print(i, short_word)

In [None]:
from nltk.corpus import stopwords

In [None]:
# NLTK's English stop-word list, used below to filter the tokens.
stop_words_list = stopwords.words("english")

In [None]:
# Remove English stop words from every paper's token list, keeping the
# year -> list of token lists layout.
# Fixes over the previous draft of this cell: `final_tokens:` was a syntax error,
# `liste` was unpacked as (index, list) pairs without enumerate(), and
# `final_tokens[year]` was indexed before the key existed.
stop_words = set(stop_words_list)  # set membership avoids scanning the list per token
final_tokens = {}
for year, liste in nuovo_token_text.items():
    final_tokens[year] = []
    for lista in liste:
        new_words = [word for word in lista if word not in stop_words]
        final_tokens[year].append(new_words)

In [None]:
# Build `final_tokens`: the same year -> list of token lists layout as
# `nuovo_token_text`, with every token found in `stop_words_list` removed.
final_tokens = {
    year: [
        [word for word in lista if word not in stop_words_list]
        for lista in liste
    ]
    for year, liste in nuovo_token_text.items()
}

In [None]:
# Save the final tokens (dict keyed by year) to 'tokens_per_year.npy'.
# NOTE(review): because this is a dict rather than an array, reading it back presumably
# needs np.load(..., allow_pickle=True).item(), as done for 'papers_per_year.npy' above.
np.save('tokens_per_year.npy', final_tokens)