# Word2Vec model training & ecological concept (category) clustering

Function definitions and library imports for this section. Please run this cell before proceeding.

In [None]:
#Function definition and library import
import gensim, logging
import numpy as np
import nltk
import pandas as pd

from nltk.probability import FreqDist #Frequency Library for Freqdist
from nltk.corpus import stopwords
from string import punctuation

#Function Definintion
def BreakintoCorpus(data,minwordlen):
    """Strip punctuation from *data* and split it into lower-cased tokens.

    Punctuation is replaced via Delpunc, the text is split on single
    spaces, and only tokens strictly longer than *minwordlen* characters
    are kept.
    """
    tokens = []
    for piece in Delpunc(data).split(' '):
        if len(piece) > minwordlen:
            tokens.append(piece.lower())
    return tokens

def CombineBigrams(Bgrams):
    """Join the two items of every bigram into a single 'x-x' token.

    *Bgrams* is a list of sentences, each a list of bigrams. Sentences
    without any bigram are left out of the result.
    """
    merged = ([pair[0] + '-' + pair[1] for pair in sentence] for sentence in Bgrams)
    return [tokens for tokens in merged if tokens]

def CombineTrigrams(Tgrams):
    """Join the three items of every trigram into a single 'x-x-x' token.

    *Tgrams* is a list of sentences, each a list of trigrams. Sentences
    without any trigram are left out of the result.
    """
    combined = []
    for sentence in Tgrams:
        tokens = ['-'.join((gram[0], gram[1], gram[2])) for gram in sentence]
        if tokens:
            combined.append(tokens)
    return combined

def Connect(Array):
    """Join the words of *Array* into one string for standard output.

    Every word is followed by a single space, so a non-empty result ends
    with a trailing space and an empty *Array* gives ''.
    """
    # A single join avoids re-copying the growing string on every iteration.
    return ''.join(word + ' ' for word in Array)

def Delpunc(kstring):
    """Return the string *kstring* with punctuation replaced by spaces.

    Every character listed in string.punctuation becomes one space, so
    the length of the text is unchanged.
    """
    # One translate pass handles all punctuation marks at once instead of
    # scanning the whole text once per mark.
    return kstring.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

def DelStopwords(data, UniBi):
    """Remove English stopwords from corpus data.

    'up' and 'down' are deliberately kept so that the exact terms
    "bottom-up" and "top-down" survive.

    data  -- for UniBi == 'Sent', an iterable of word lists;
             for UniBi == 'Uni', an iterable of words.
    UniBi -- 'Sent' or 'Uni'; any other value yields an empty list.

    Returns the filtered data in the same layout as the input.
    """
    # A set gives constant-time membership tests, and the set difference does
    # not fail if 'up' or 'down' is absent from the stopword list.
    stpwds = set(stopwords.words('english')) - {'up', 'down'}

    if UniBi == 'Sent':
        return [[word for word in record if word not in stpwds] for record in data]
    if UniBi == 'Uni':
        return [word for word in data if word not in stpwds]
    return []

def Lemm(data, ps):
    """Lemmatize every word of every record with the WordNet lemmatizer.

    data -- the whole corpus as a list of word lists,
            [[x,x,x],[x,x,x],...,[x,x,x]].
    ps   -- 'n' or 'v' lemmatizes with that part of speech; 'all' applies
            the noun pass first and then the verb pass (recommended).
            Any other value yields an empty list.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    if ps in ('v', 'n'):
        return [[str(lemmatizer.lemmatize(token, pos = ps)) for token in record]
                for record in data]
    if ps == 'all':
        lemmatized = []
        for record in data:
            as_nouns = [str(lemmatizer.lemmatize(token, pos = 'n')) for token in record]
            # Second pass treats the noun-lemmatized tokens as verbs
            lemmatized.append([str(lemmatizer.lemmatize(token, pos = 'v')) for token in as_nouns])
        return lemmatized
    return []

def MatrixCombine(matrix,cmatrix):
    """Return the element-wise logical OR of two boolean sequences.

    Both arguments are converted to NumPy arrays; the result is a new
    array that is True wherever either input is truthy.
    """
    # logical_or stays a boolean OR whatever the input dtype is; the former
    # `+` summed the values as soon as one input held integers instead of
    # booleans. For an AND combination use np.logical_and.
    return np.logical_or(np.asarray(matrix), np.asarray(cmatrix))

def OutputFiles(data, FileDirectory, UniBi):
    """Write *data* to the UTF-8 text file *FileDirectory*, one record per line.

    UniBi selects the record layout:
      'Uni'  -- each record is printed as it is;
      'Bi'   -- the first two items of each record, separated by a space;
      'Sent' -- each record is a list of words joined by Connect().
    Any other value leaves the file empty.
    """
    # The with-block closes the file even if writing a record fails.
    with open(FileDirectory, 'w', encoding = u'utf-8') as Fileoutput:
        if UniBi == 'Uni':
            for record in data:
                print(record, file = Fileoutput)
        elif UniBi == 'Bi':
            for record in data:
                print(record[0], record[1], file = Fileoutput)
        elif UniBi == 'Sent':
            for record in data:
                print(Connect(record), file = Fileoutput)

def PaperFreq(Word, Corpus):
    """Return one boolean per paper of *Corpus*: does the paper contain *Word*?

    An empty *Word* never matches, so the result is all False.
    """
    if Word == '':
        return [False] * len(Corpus)
    return [Word in paper for paper in Corpus]
    
def ReloadCorpus(FileDirectory):
    """Reload a corpus file (single words or sentences) saved one record per line.

    Returns a list with one entry per line of the UTF-8 file
    *FileDirectory*; each entry is the list of whitespace-separated tokens
    of that line (an empty list for a blank line).
    """
    # The with-block closes the file even if reading fails; split() already
    # discards the trailing newline.
    with open(FileDirectory, encoding = u'utf-8') as Fileinput:
        return [line.split() for line in Fileinput]

def ReturnTfidfRank(Sentences,Top):
    """Rank words by how often they are the highest TF-IDF term of a sentence.

    Sentences -- list of token lists.
    Top       -- maximum number of entries to return.

    Returns a list of [word, count] pairs, most frequent first, where
    count is the number of sentences in which the word has the highest
    TF-IDF weight.
    """
    dictionary = gensim.corpora.Dictionary(Sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in Sentences]
    tfidf = gensim.models.TfidfModel(corpus)
    TokenList = []
    for document in corpus:
        weights = tfidf[document]
        # The TF-IDF vector can be empty even for a non-empty sentence: gensim
        # leaves out terms whose weight is (near) zero, e.g. words occurring in
        # every sentence. Skip such sentences instead of indexing into [].
        if not weights:
            continue
        # max() returns the first highest-weighted term, like the former
        # descending stable sort followed by [0].
        best_id = max(weights, key = lambda record: record[1])[0]
        TokenList.append(str(dictionary.get(best_id)))
    return Sort_Rebuild(FreqDist(TokenList))[0:Top]

def Sort_Rebuild(Freq):
    """Rebuild a frequency mapping as [word, count] pairs, highest count first.

    Entries with equal counts keep the iteration order of *Freq*.
    """
    pairs = [[token, Freq[token]] for token in Freq]
    pairs.sort(key = lambda pair: pair[1], reverse = True)
    return pairs

## 1. Word2Vec training
Considering that our corpus is relatively small, we chose 100-dimensional vectors for word representation. Both CBOW and Skip-gram models are trained, with hierarchical softmax or negative sampling used to accelerate training. The training algorithm runs under the gensim framework. The minimum word-frequency threshold is set to 1: because our corpus is relatively small, dropping too many words would be wasteful.

In [None]:
Sentences = ReloadCorpus('D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_TiAb_BiReplaced.txt')
logging.basicConfig(format='%(asctime)s : %(levelname)s :%(message)s', level=logging.INFO)

# Train and save one model per (sg, hs) combination; after the loop `model`
# holds the last one trained.
# NOTE(review): hs = 1 presumably leaves gensim's default negative sampling
# switched on as well -- confirm against the gensim docs if a pure
# hierarchical-softmax model is intended.
for sg_flag, hs_flag, model_path in (
    (1, 1, 'D:/_Research/Project_Ecological_Development/Data_Processing/SkipGram_HierSoftmax.model'),
    #For our corpus, CBOW and HierSoftmax is better. This is the model used in the future processing
    (0, 1, 'D:/_Research/Project_Ecological_Development/Data_Processing/CBOW_HierSoftmax.model'),
    (0, 0, 'D:/_Research/Project_Ecological_Development/Data_Processing/CBOW_NegativeSampling.model'),
):
    model = gensim.models.Word2Vec(Sentences, size = 100, min_count = 1, sg = sg_flag, hs = hs_flag)
    model.save(model_path)

## 2. Cluster generation on ecological concepts & categories

### 2.1 Manual preparation

The list of concept words is prepared manually. Each candidate concept is fed into the trained Word2Vec model for similarity comparison, which generates a word cluster for that particular ecological concept.

However, the manual preparation of candidate words is critical but somewhat vague, so we state our candidate-selection procedure here: 1) For unigram concepts, we directly use the unigram itself as the candidate. 2) For bigram concepts, only the hyphen-joined form is used, e.g. edge-effect. We can guarantee that these concepts are present in the word list, because all of these candidates were already added to the word list in the previous processing step. 3) For n-grams (n>=3), concept terms are further split into bigrams or even unigrams, e.g. 'population growth mode' is divided into population-growth and growth-mode, which are then treated as two bigrams.

The similarity comparison for concept cluster generation is computed by model.wv.most_similar() in the gensim package. This method computes the cosine similarity between a simple mean of the projection weight vectors of the given words and the vectors for each word in the model. For quantification, we set the parameter topn to 20 for cluster formation, which means there are at least 20 candidate words for each concept from which to build the concept cluster. We also attach the word similarity value to the result as a reference for manual selection. Here we import our model and read the manually prepared input words for the clustering search.

### Attention: As of 16 Jan. 2018, there are still TWO manual operations above: ONE is Concept Candidate Generation and the OTHER is Concept Cluster Definition.

In [3]:
#####################Word Clustering###########################################
# Load the CBOW + hierarchical-softmax model from disk.
model = gensim.models.Word2Vec.load('D:/_Research/Project_Ecological_Development/Data_Processing/CBOW_HierSoftmax.model')
# Table of concept input words; the candidate search below reads its
# Input_Word column.
Waiting_List = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_InputWords_20171129.csv', encoding = u'utf-8')

Search for words similar to each input word in order to build appropriate candidate word groups for the concepts.

In [9]:
Candidates = []
for words in Waiting_List.Input_Word:
    result = []
    # One concept may list several input words separated by '|'
    for word in words.split('|'):
        result.extend(model.wv.most_similar(word, topn = 20))
    #Sort the candidates by similarity
    # Sort the (word, similarity) tuples themselves: wrapping them in np.array
    # turned the similarities into strings, so they were compared as text
    # instead of as numbers.
    result.sort(key = lambda record: record[1], reverse = True)
    Candidates.append([str(item[0]) for item in result])

Save these candidates to the local disk for the manual Concept Cluster Definition.

In [None]:
#Print candidates for manual filtering
# Add the candidate lists as a new column at position 3 and export the table
# as CSV.
Waiting_List.insert(3,'Candidates',Candidates)
Waiting_List.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Categlory_Candidates_20171129.csv', encoding = u'utf-8')

### 2.2 Data preparation and Concept(Category) Tagging
The corpus used for concept tagging changed as the project progressed. As of 16 Jan. 2018, the overall analysis of concepts from 1915 to 2015 is restricted to article titles (TI) only, while the more precise analysis from 1991 to 2015 uses titles and abstracts (TI + AB). At the code level, however, we tag all the data for both TI and TI + AB; the different research scopes only affect how the results are cut by time period.

In [None]:
######################Selected Frequency#######################################
# Reload the selected-concept table and drop its 'Unnamed: 0' column
# (presumably the row index written by an earlier to_csv -- TODO confirm).
Waiting_List = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Selected_20171129.csv', encoding = u'utf-8')
Waiting_List = Waiting_List.drop('Unnamed: 0',axis = 1)
# Corpus as one list of tokens per line of the file.
Sentences = ReloadCorpus('D:/_Research/Project_Ecological_Development/Data_Processing/Sentences_TiAb_MannualBiReplaced.txt')
# One list of comma-separated words per concept; a missing cell becomes ['']
# because NaN is first replaced by ''.
Selected_Frequency = [item.split(',') for item in Waiting_List.Selected_20171129.fillna('')]

The original Sentences file is divided into two equal parts: the first half holds the titles (always present) and the second half holds the abstracts (possibly absent).

In [None]:
#Split the corpus into its title half and its abstract half
PaperLen = len(Sentences) // 2

#The corpus of titles for 100 year statistics
Titles = Sentences[:PaperLen]

#Title + abstract tokens of every paper, for the 1991~2015 research on the total corpus
Title_Abstract = [title + abstract for title, abstract in zip(Titles, Sentences[PaperLen:])]

Use the functions defined above to record the presence of each concept in both datasets (TI and TI + AB).
#### Attention: as of 6 Jan. 2018, the counting function has been refined to its third version (much faster), which uses matrix-level logical operations. The result is therefore boolean (True and False); DO NOT CHANGE IT INTO 0 and 1.

In [None]:
def _TagConcepts(Concepts, Corpus):
    """Return, for each concept, a boolean array marking the papers that contain
    at least one of the concept's words."""
    # Membership tests against a set are constant-time, whereas a token list
    # is scanned from the start for every word of every concept.
    PaperSets = [set(paper) for paper in Corpus]
    ConceptMatrix = []
    for Concept in Concepts:
        Matrix = [False for i in range(len(PaperSets))]
        for Word in Concept:
            Matrix = MatrixCombine(Matrix, PaperFreq(Word, PaperSets))
        ConceptMatrix.append(Matrix)
    return ConceptMatrix

#Count the existence of agents in Titles + Abstracts
ConceptMatrix_TiAb = _TagConcepts(Selected_Frequency, Title_Abstract)

#Count the existence of agents in Titles
ConceptMatrix_Ti = _TagConcepts(Selected_Frequency, Titles)

#Statistics: number of papers tagged with each concept
Total_TiAb = [sum(concept) for concept in ConceptMatrix_TiAb]
Total_Ti = [sum(concept) for concept in ConceptMatrix_Ti]

Transpose the result matrix and convert it into a pd.DataFrame with the concept names as column names. Then export the result to the local disk.

In [None]:
#Transpose each concept matrix so that concepts become the columns, then
#export it: first the matrix on TI+AB, then the matrix on TI only
for ConceptMatrix, OutputPath in (
    (ConceptMatrix_TiAb, 'D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_TiAb_20171129.csv'),
    (ConceptMatrix_Ti, 'D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_Ti_20171216.csv'),
):
    ConInPaper_Matrix = pd.DataFrame(np.matrix(ConceptMatrix).T, columns = Waiting_List.Concepts)
    ConInPaper_Matrix.to_csv(OutputPath, encoding = u'utf-8')

### 2.3 Concept mapping and fundamental statistics

Map every concept and category based on the occurrence matrix produced in the last step. First, reload the data as required and insert the publication year into the matrix.

In [None]:
#Publication year reload
Records = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Complete_Records_WOS_20171111.csv', encoding = u'utf-8')
Records = Records.drop('Unnamed: 0', axis = 1)
PubYear = np.array(Records.PY)
del(Records)

# The concept-paper matrices were exported together with their row index,
# which read_csv brings back as an 'Unnamed: 0' column. Drop it so that it is
# not summed and normalised as if it were a concept.
#Concept-paper matrix for Ti + Ab
ConInPaper_Matrix_TiAb = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_TiAb_20171129.csv', false_values = ['0'], encoding = u'utf-8')
ConInPaper_Matrix_TiAb = ConInPaper_Matrix_TiAb.drop('Unnamed: 0', axis = 1, errors = 'ignore')
#Concept-paper matrix for Ti only
ConInPaper_Matrix_Ti = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_Ti_20171216.csv', false_values = ['0'], encoding = u'utf-8')
ConInPaper_Matrix_Ti = ConInPaper_Matrix_Ti.drop('Unnamed: 0', axis = 1, errors = 'ignore')

# Publication year of every paper as the first column
ConInPaper_Matrix_TiAb.insert(0,'Year',PubYear)
ConInPaper_Matrix_Ti.insert(0,'Year',PubYear)

Group the matrix by year using the built-in groupby of pandas.

In [None]:
#Per_Year_Total calculation -----New
Per_Year_Total_TiAb = ConInPaper_Matrix_TiAb.groupby('Year').sum()
Per_Year_Total_Ti = ConInPaper_Matrix_Ti.groupby('Year').sum()

#Concept Coverage Calculation
def _ConceptCoverage(ConInPaper):
    """Return a boolean array marking the papers tagged with at least one concept."""
    # Only concept columns may take part: leave out the year and, if it is
    # present, the 'Unnamed: 0' index column of the CSV export.
    Concepts = ConInPaper.drop(['Year', 'Unnamed: 0'], axis = 1, errors = 'ignore')
    Coverage = [False for i in range(len(Concepts))]
    for concept in Concepts:
        Coverage = MatrixCombine(Coverage, Concepts[concept])
    return Coverage

# The coverage is computed on the two frames that carry the 'Year' column;
# the former code dropped 'Year' from `ConInPaper_Matrix`, which this section
# never gives such a column.
Concept_Coverage_TiAb = _ConceptCoverage(ConInPaper_Matrix_TiAb)
Concept_Coverage_Ti = _ConceptCoverage(ConInPaper_Matrix_Ti)
print(sum(Concept_Coverage_TiAb))
print(sum(Concept_Coverage_Ti))

In [None]:
# Export the per-year concept totals of both datasets as CSV.
#For Ti + Ab output
Per_Year_Total_TiAb.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_PerYear_TiAb_20171204.csv', encoding = u'utf-8')
#For Ti output
Per_Year_Total_Ti.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_PerYear_Ti_20171216.csv', encoding = u'utf-8')

After tagging and counting the absolute occurrence of concepts in both datasets, we try to figure out the relative importance of each concept. For relative importance, we divide the count of each concept in a year by the total count of all concepts in that particular year.

Since the matrix itself is not oriented the right way, we transpose it for the calculation and transpose it back for output.

In [None]:
#PerYear_RI calculation --------- New
# Relative importance: every concept count of a year divided by the sum of
# all concept counts of that year. Both datasets get their own result (the
# former code read an undefined `Per_Year_Total` and exported undefined
# `Per_Year_RI_*` names). div() returns a new frame, so the Per_Year_Total_*
# frames are left unchanged.
Per_Year_RI_TiAb = Per_Year_Total_TiAb.div(Per_Year_Total_TiAb.sum(axis = 1), axis = 0)
Per_Year_RI_Ti = Per_Year_Total_Ti.div(Per_Year_Total_Ti.sum(axis = 1), axis = 0)

#For Ti+Ab output
Per_Year_RI_TiAb.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_RI_TiAb_PerYear_20171205.csv', encoding = u'utf-8')
#For Ti output
Per_Year_RI_Ti.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_RI_Ti_PerYear_20171205.csv', encoding = u'utf-8')

### 2.4 Category mapping and statistics

An ecological category is composed of different ecological concepts, so the concept matrices have to be combined per category for further statistics. The original data is prepared manually in the Waiting_List.

In [None]:
#Figure the distribution of selected Categories in the corpus
# Reload the selected-concept table; the next cell reads its Category and
# Concepts columns.
Waiting_List = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Selected_20171129.csv', encoding = u'utf-8')
# Reload both concept-paper matrices from disk.
# NOTE(review): the category cell that follows reads `ConInPaper_Matrix`, not
# these two suffixed frames -- confirm which frame it is meant to use.
#For Ti + Ab Please read this line
ConInPaper_Matrix_TiAb = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_TiAb_20171129.csv', false_values = ['0'], encoding = u'utf-8')
#For Ti only Please read this line
ConInPaper_Matrix_Ti = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Concept_Paper_Ti_20171216.csv', false_values = ['0'], encoding = u'utf-8')

In [None]:
#Prepare the category dictionary
# Every concept lists its categories separated by '|'; empty entries (e.g.
# from a trailing '|') are discarded.
Categories = [[cate for cate in item.split('|') if cate != ''] for item in Waiting_List.Category]
Waiting_List.insert(0,'Categories',Categories)

temp = []
for item in Categories:
    temp += item

# Map every category to the concepts that belong to it
Cate_dict = {}
for cate in pd.value_counts(temp).index:
    print(cate)
    Cate_dict.update({cate:[]})

for concept in Waiting_List.iterrows():
    for cate in concept[1].Categories:
        Cate_dict[cate].append(concept[1].Concepts)

#Generate Year_Category
def _CategoryMatrix(ConInPaper):
    """Return a frame with one boolean column per category: True where a paper
    is tagged with at least one concept of that category."""
    Year_Category = pd.DataFrame()
    for cate in Cate_dict:
        covered = [False for i in range(len(ConInPaper))]
        for concept in Cate_dict[cate]:
            covered = MatrixCombine(covered, ConInPaper[concept])
        Year_Category.insert(0,cate,covered)
    return Year_Category

# Each dataset gets its own category matrix; the former code wrote one and the
# same frame, built from `ConInPaper_Matrix`, to both output files.
Year_Category_TiAb = _CategoryMatrix(ConInPaper_Matrix_TiAb)
Year_Category_Ti = _CategoryMatrix(ConInPaper_Matrix_Ti)

#For Ti + Ab output
Year_Category_TiAb.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Category_Paper_TiAb_20171201.csv', encoding = u'utf-8')
#For Ti output
Year_Category_Ti.to_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Category_Paper_Ti_20171217.csv', encoding = u'utf-8')

Fundamental category-based statistics, including absolute counts and relative importance, computed in the same way as the concept statistics.

In [None]:
Records = pd.read_csv('D:/_Research/Project_Ecological_Development/Data_Processing/Complete_Records_WOS_20171111.csv', encoding = u'utf-8')
Records = Records.drop('Unnamed: 0', axis = 1)
PubYear = np.array(Records.PY)
del(Records)

# Each dataset (first Ti + Ab, then Ti only) is processed on its own: the
# former code used an undefined `Category_Matrix` and wrote identical results
# to the files of both datasets. The category-paper matrix is reloaded from
# the file exported by the category mapping step.
for PaperFile, TotalFile, RIFile in (
    ('D:/_Research/Project_Ecological_Development/Data_Processing/Category_Paper_TiAb_20171201.csv',
     'D:/_Research/Project_Ecological_Development/Data_Processing/Category_PerYear_TiAb_20171204.csv',
     'D:/_Research/Project_Ecological_Development/Data_Processing/Category_RI_PerYear_TiAb_20171205.csv'),
    ('D:/_Research/Project_Ecological_Development/Data_Processing/Category_Paper_Ti_20171217.csv',
     'D:/_Research/Project_Ecological_Development/Data_Processing/Category_PerYear_Ti_20171217.csv',
     'D:/_Research/Project_Ecological_Development/Data_Processing/Category_RI_PerYear_Ti_20171217.csv'),
):
    Category_Matrix = pd.read_csv(PaperFile, encoding = u'utf-8')
    # Leave out the row index column that to_csv stored with the matrix
    Category_Matrix = Category_Matrix.drop('Unnamed: 0', axis = 1, errors = 'ignore')

    #Per_Year_Total calculation -----New
    Category_Matrix.insert(0,'Year',PubYear)
    Per_Year_Total = Category_Matrix.groupby('Year').sum()
    Per_Year_Total.to_csv(TotalFile, encoding = u'utf-8')

    #PerYear_RI calculation --------- New
    # Every category count of a year divided by the sum of all category
    # counts of that year; div() leaves Per_Year_Total unchanged.
    Per_Year_RI = Per_Year_Total.div(Per_Year_Total.sum(axis = 1), axis = 0)
    Per_Year_RI.to_csv(RIFile, encoding = u'utf-8')