# Data Preprocessing

## 1. Break the fields of interest into corpora
In this research, the fields we are interested in are Titles, Abstracts, Keywords and WOS Keywords. Titles and abstracts are turned into word lists: punctuation is replaced by spaces, the text is split on spaces, and every word is lower-cased. The two keyword fields are split on '; ' into keyword lists.

In [None]:
import pandas as pd
from string import *

def delpunc(kstring):
    """Return ``kstring`` with every punctuation character replaced by a space.

    The characters replaced are exactly those in ``string.punctuation``.
    Each one becomes a single space, so the length of the text is unchanged.
    """
    # One translation pass; same result as replacing each punctuation character in turn.
    blanks = ' ' * len(punctuation)
    return kstring.translate(str.maketrans(punctuation, blanks))

def BreakintoCorpus(data,minwordlen):
    """Turn one text string into a list of lower-cased words.

    Punctuation is blanked out with ``delpunc``, the text is split on single
    spaces, and only the pieces longer than ``minwordlen`` characters are kept
    (so ``minwordlen = 0`` drops the empty pieces left by repeated spaces).
    """
    pieces = delpunc(data).split(' ')
    long_enough = (piece for piece in pieces if len(piece) > minwordlen)
    return [piece.lower() for piece in long_enough]

# Load the Web of Science records and drop the 'Unnamed: 0' column
Records = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Complete_Records_WOS_20171111.csv', encoding = u'utf-8')
Records = Records.drop('Unnamed: 0', axis = 1)

# Missing fields are read as NaN. str(NaN) is the text 'nan', which would enter the corpora as a
# real word, and a NaN title would crash delpunc. Blank the missing values first: a record with
# a missing field then contributes an empty list, and every corpus stays aligned with Records.
Title_Corpus = [BreakintoCorpus(str(title),0) for title in Records.TI.fillna('')]
Abstract_Corpus = [BreakintoCorpus(str(abstract),0) for abstract in Records.AB.fillna('')]
# Keyword fields are '; '-separated lists and are kept as written (no lower-casing, no punctuation removal)
Keyword_Corpus = [str(keyword).split('; ') if keyword != '' else [] for keyword in Records.DE.fillna('')]
WOSword_Corpus = [str(keyword).split('; ') if keyword != '' else [] for keyword in Records.ID.fillna('')]

# 2. Lemmatization and Stopword deletion

First, delete all the stopwords in both corpora using NLTK's stopwords.words('english'), then lemmatize the corpora using nltk.WordNetLemmatizer. (Attention: lemmatization takes a long time, and deleting the stopwords first speeds it up.)

Attention: the special words 'up' and 'down' are kept (they are taken out of the stopword list), because they are useful in later searches ("Bottom-up" and "Top-down" control).

In [3]:
#Function Declaration
def DelStopwords(data, UniBi):
    """Remove English stopwords from tokenised text.

    Parameters:
        data:  for ``UniBi == 'Sent'`` a list of word lists, one per record;
               for ``UniBi == 'Uni'`` a single flat list of words.
        UniBi: ``'Sent'`` or ``'Uni'``. Any other value gives an empty list.

    Returns:
        The data with the stopwords removed, in the same nesting as the input.
    """
    # 'up' and 'down' are not treated as stopwords: they are needed for the exact terms
    # "Bottom-up" and "Top-down". A set makes each membership test a hash lookup instead
    # of a scan through the whole stopword list.
    stpwds = set(stopwords.words('english')) - {'up', 'down'}

    if UniBi == 'Sent':
        return [[word for word in record if word not in stpwds] for record in data]
    if UniBi == 'Uni':
        return [word for word in data if word not in stpwds]
    return []

def Lemm(data, ps):
    """Lemmatize every word of a corpus with ``nltk.WordNetLemmatizer``.

    ``data`` must be the whole corpus as a list of word lists
    (``[[x, x, x], [x, x, x], ..., [x, x, x]]``).  ``ps`` selects the part of
    speech: ``'n'`` (nouns) or ``'v'`` (verbs) runs one pass; ``'all'`` runs
    the noun pass and then the verb pass on its output.  Any other value of
    ``ps`` gives an empty list.
    """
    wnl = nltk.WordNetLemmatizer()

    # Part-of-speech passes to apply, in order
    if ps == 'all':
        passes = ['n', 'v']
    elif ps == 'v' or ps == 'n':
        passes = [ps]
    else:
        return []

    result = []
    for record in data:
        words = record
        for tag in passes:
            words = [str(wnl.lemmatize(word, pos = tag)) for word in words]
        result.append(words)
    return result

In [9]:
#Stopword deletion first, then lemmatization with both the noun and the verb pass
Title_Corpus = Lemm(DelStopwords(Title_Corpus, 'Sent'), 'all')
Abstract_Corpus = Lemm(DelStopwords(Abstract_Corpus, 'Sent'), 'all')

Save these intermediate files to local disk.

In [None]:
def OutputFiles(data, FileDirectory, UniBi):
    """Write ``data`` to the UTF-8 text file ``FileDirectory``, one record per line.

    ``UniBi`` selects how each record is written:
        ``'Uni'``  - the record itself (a single word);
        ``'Bi'``   - ``record[0]`` and ``record[1]`` separated by a space;
        ``'Sent'`` - the result of ``Connect(record)``.
    Any other value leaves the file empty.  An existing file is overwritten.
    """
    # The with-block closes the file even if writing a record raises
    with open(FileDirectory, 'w', encoding = u'utf-8') as Fileoutput:
        if UniBi == 'Uni':
            for record in data:
                print(record, file = Fileoutput)
        elif UniBi == 'Bi':
            for record in data:
                print(record[0], record[1], file = Fileoutput)
        elif UniBi == 'Sent':
            for record in data:
                print(Connect(record), file = Fileoutput)
    
#Output: each corpus goes to its own file, one sentence per line
for _corpus, _target in (
        (Title_Corpus, 'D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_Title_Lemmstp.txt'),
        (Abstract_Corpus, 'D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_Abstract_Lemmstp.txt')):
    OutputFiles(_corpus, _target, 'Sent')

# 3. Bigram,Trigram selection and corpus refining

In this step, we refine the corpus by selecting important bigrams and trigrams using TF-IDF, and then put them back into the original corpus.
The top 10000 grams are chosen and saved to local disk; later replacements use only the grams in these lists.
Attention: because the following work requires word indexing, larger n-grams are not calculated, as they are far too time-consuming.

In [None]:
#Here is the function declaration
def CombineBigrams(Bgrams):
    """Join every bigram into one ``'first-second'`` token.

    ``Bgrams`` is a list of sentences, each a list of word pairs.  The result
    has one token list per sentence; sentences without any bigram are left out.
    """
    joined = (['-'.join((pair[0], pair[1])) for pair in sentence] for sentence in Bgrams)
    return [tokens for tokens in joined if tokens]

def CombineTrigrams(Tgrams):
    """Join every trigram into one ``'first-second-third'`` token.

    ``Tgrams`` is a list of sentences, each a list of word triples.  The result
    has one token list per sentence; sentences without any trigram are left out.
    """
    joined = (['-'.join((triple[0], triple[1], triple[2])) for triple in sentence] for sentence in Tgrams)
    return [tokens for tokens in joined if tokens]

def ReturnTfidfRank(Sentences,Top):
    """Rank tokens by how often they are the highest-TF-IDF token of a document.

    Parameters:
        Sentences: list of token lists, one per document.
        Top:       maximum number of entries to return.

    Returns:
        A list of at most ``Top`` ``[token, count]`` pairs, where ``count`` is
        the number of documents in which ``token`` has the highest TF-IDF
        weight, sorted by ``count`` in descending order.
    """
    dictionary = gensim.corpora.Dictionary(Sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in Sentences]
    tfidf = gensim.models.TfidfModel(corpus)
    TokenList = []
    for document in corpus:
        weights = tfidf[document]
        # The TF-IDF vector can be empty even for a non-empty document, because the model
        # drops near-zero weights (e.g. tokens present in every document); skip those.
        if not weights:
            continue
        # Only the single best token is needed; on ties max keeps the first one
        best_id = max(weights, key = lambda record: record[1])[0]
        TokenList.append(str(dictionary.get(best_id)))
    return Sort_Rebuild(FreqDist(TokenList))[0:Top]

def Sort_Rebuild(Freq):
    """Turn a frequency mapping into ``[word, count]`` pairs, largest count first.

    ``Freq`` is iterated for its words and indexed for their counts.  Words
    with equal counts keep the order in which ``Freq`` yields them.
    """
    pairs = []
    for token in Freq:
        pairs.append([token, Freq[token]])
    pairs.sort(key = lambda pair: pair[1], reverse = True)
    return pairs

Load the corpus data for replacement:

In [None]:
#Reload the saved title and abstract sentences
Title_Corpus, Abstract_Corpus = [
    ReloadCorpus(corpus_file)
    for corpus_file in (
        'D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_Title_Lemmstp.txt',
        'D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_Abstract_Lemmstp.txt',
    )
]
#One combined corpus: all title sentences followed by all abstract sentences
Lemm_Sentences = Title_Corpus + Abstract_Corpus

### As a test, we'll try the trigrams first and save the top 10000 grams to local disk. THIS IS JUST A TEST.

In [None]:
#Trigrams of every sentence, each joined with '-' into a single token
Trigram = CombineTrigrams([list(nltk.trigrams(sentence)) for sentence in Lemm_Sentences])
Trigram_Top = ReturnTfidfRank(Trigram, 10000)

#Save the whole ranked list to disk, then keep only its first 1000 trigrams
Trigram_Top_List = [ranked[0] for ranked in Trigram_Top]
OutputFiles(Trigram_Top_List,'D:/_Research/Project_Ecological_Development/Data_Processing/Top10000_Trigrams_TFIDF.txt','Uni')
Trigram_Top_List = Trigram_Top_List[:1000]

Bigrams are what we actually process in our research; they are important and crucial for the later analysis.

In [None]:
#Make Bigrams
Bgrams = [list(nltk.bigrams(sentence)) for sentence in Lemm_Sentences]

#Choosing using TF-IDF
Bgrams = CombineBigrams(Bgrams)
#The ranked [bigram, count] pairs are stored as Bgram_Top, the name that the next line and
#the later replacement step read; storing them under another name raises a NameError there.
Bgram_Top = ReturnTfidfRank(Bgrams, 10000)
Bgram_Top_List = [word[0] for word in Bgram_Top]
OutputFiles(Bgram_Top_List,'D:/_Research/Project_Ecological_Development/Data_Processing/Top10000_Bgrams_TFIDF.txt','Uni')

Put the bigrams back into the corpus using a loop. Here, we changed the loop to a `temp in Bgram_Top_List` membership test, which made it nearly 10000 times faster than the original element-by-element comparison.
Attention: the time consumed by this step is mainly controlled by the size of Bgram_Top_List, with a complexity of O(n). Here we chose 2000 as the threshold after manual inspection of the list.
#### Here, to get better-defined ECOLOGICAL bigrams, we manually checked the input words and added some bigrams that are mentioned in the later concept word table.

In [1]:
#Manually added bigrams. They are written the way they appear in the lemmatized corpus
#(lower case, words joined with '-'), so a misspelled entry can never match.
Mannual_Bigrams = ['birth-rate','competitive-exclusion','energy-efficiency','founder-effect','intermediate-disturbance','intrinsic-growth','lotka-volterra',
                   'nutrient-cycle','red-queen','specie-evenness','secondary-productivity','doubling-time','nutrient-pool','nutrient-turnover','energy-flow',
                   'equilibrium-model','evolutionary-divergence','evolutionary-convergence','exponential-growth','exponential-increase','hardy-weinberg',
                   'intrinsic-increase','k-r','nutrient-spiraling','patch-matrix','selection-type','specie-area','specie-category',
                   'trophic-pyramid','density-independent','residence-time','island-biogeography','facilitation-model','tolerance-model','inhibition-model',
                   'competition-model','stable-state','ecosystem-production','biogeochemical-cycle','assemblage-richness','assemblage-evenness','specie-dispersion']

In [1]:
Bgram_Top_List = [word[0] for word in Bgram_Top][:2000] + Mannual_Bigrams
#Bigram replacement
#Membership is tested once per adjacent word pair, so look the candidates up in a set
#(hash lookup) instead of scanning the 2000+ element list every time.
Bgram_Lookup = set(Bgram_Top_List)
#NOTE: C_Lemm_Sentences is the same object as Lemm_Sentences (no copy is made), so this
#step rewrites Lemm_Sentences as well. The loop reads the already rewritten tokens: once a
#pair is merged, its second slot is '' and the next candidate starts with '-', which keeps
#merged bigrams from overlapping.
C_Lemm_Sentences = Lemm_Sentences
for tokens in C_Lemm_Sentences:
    for j in range(len(tokens) - 1):
        temp = tokens[j] + '-' + tokens[j+1]
        if temp in Bgram_Lookup:
            tokens[j] = temp
            tokens[j+1] = ''

NameError: name 'Bgram_Top' is not defined

In [None]:
#Save the bigram-replaced sentences in 'Sent' mode (each record is passed through Connect and written on its own line)
OutputFiles(C_Lemm_Sentences,'D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_TiAb_BiReplaced.txt','Sent')