# Feature Extraction

### TODO

- Create the Tables for Authors, Venues
- Perform Feature Extractions
- Create the final table with the features

#### Adding the libraries

In [None]:
import pandas as pd
import matplotlib as plt
import ast
import numpy as np
import seaborn as sns
from statistics import mean

#### Adding the database

In [None]:
# Load the papers table from its pickle file and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
papers_pickle_path = '../Data/papers_clean.pkl'
data_papers = pd.read_pickle(papers_pickle_path)
data_papers.head()

In [None]:
# 'fos', 'authors' and 'venue' are stored as Python-literal text; parse every
# cell of those columns into the object it spells out.
for literal_column in ('fos', 'authors', 'venue'):
    data_papers[literal_column] = data_papers[literal_column].apply(ast.literal_eval)

#### The list of features:

* Topic Rank
* Diversity
* Productivity
* H-index
* Author Rank
* Venue Rank
* Maximum Past Influence of Authors (Past Influence of Authors)
* Total Past Influence of Authors (Past Influence of Authors)
* Maximum Past Influence of Venues (Past Influence of Venues)
* Total Past Influence of Venues (Past Influence of Venues)
* Versatility
* Novelty
* Sociality
* Authority
* Venue Centrality
* First two years performance 
* Yearly citations 

#### Features that need merging (e.g. for papers with multiple authors) will be calculated during feature file creation

#### Topic Rank

steps:

- take the fos of all rows
- take the unique values
- create a binary indicator matrix where the columns are the topics and each row is a paper: if the topic exists for the paper => 1, otherwise 0
- have a table with the topic as ID
- calculate the score for each ID (see the formulas below)
- third column should be the rank

INF(topic/d)= P(topic/d) * INF(paper)
score = sum(Weight of fos in Document * n_citation of that document) in all papers
rank = sort(score,ascending)

Table: topics (topic, score, rank)

##### forget for now

In [None]:
# Exploratory cell: list every field-of-study (FOS) name and how often it occurs.
from collections import Counter

# Separate one-column frame; reassigning its 'fos' column below does not
# modify data_papers.
topics_df = pd.DataFrame(data_papers['fos'])

# Show the first row before and after reducing each FOS entry to its name.
# .iloc[0] is positional, so it also works when label 0 is absent from the
# index (e.g. if rows were dropped upstream - TODO confirm).
print(topics_df['fos'].iloc[0])
topics_df['fos'] = topics_df['fos'].apply(
    lambda entries: [entry['name'] for entry in entries]
)
print(topics_df['fos'].iloc[0])

# topic_names: one list of names per paper; topics: all names flattened.
topic_names = topics_df['fos'].tolist()
topics = [name for names in topic_names for name in names]

# Count every name in a single pass instead of calling list.count() once per
# distinct topic (which rescans the whole list each time).
topic_counts = Counter(topics)
print(len(topics))
print(len(topic_counts))
for topic, count in topic_counts.items():
    print(count, topic)

#### Diversity

steps:

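- loop over the FOS entries of a paper and sum their Shannon-index terms
- Shannon index = -Σ w · ln(w), summed over the FOS weights w (entries with w = 0 are skipped)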

The higher the value the more diversity; The lower the value the more focused the paper is

https://en.wikipedia.org/wiki/Diversity_index

https://en.wikipedia.org/wiki/Entropy_(information_theory)

Table: paper

- min value is 0.0
- max value is 4.966 (but can be higher)
- to see the shape of a single term, plot the function -i*ln(i) on Desmos
- a single term peaks at 1/e ≈ 0.3679, which is reached at i = 1/e ≈ 0.3679

In [None]:
def _shannon_sum(fos_entries):
    """Sum -w * ln(w) over the FOS entries whose weight w is positive."""
    return sum(
        -entry['w'] * np.log(entry['w'])
        for entry in fos_entries
        if entry['w'] > 0.0
    )


# Diversity of a paper: the Shannon-style sum over its FOS weights.
data_papers['diversity'] = data_papers['fos'].apply(_shannon_sum)

analyze the feature

In [None]:
# Inspect the new feature: first values, summary statistics, and its
# correlation with the citation count.
diversity_column = data_papers['diversity']
print(diversity_column.head())
print(diversity_column.describe())
print(data_papers['n_citation'].corr(diversity_column))

#### Author table

##### Adding paper IDs and citation to the table

In [None]:
# adding author_id to paper table: the list of author ids of each paper
data_papers['author_id'] = data_papers['authors'].apply(
    lambda authors: [author['id'] for author in authors]
)

# creating author table: one record per (author, paper) pair, carrying the
# author's own fields plus the paper's id, citation count and FOS list
data_authors = [
    {**author, "paper_ids": paper_id, "n_citations": n_citation, "FOS": fos}
    for paper_id, authors, n_citation, fos
    in data_papers[['id', 'authors', 'n_citation', 'fos']].to_numpy()
    for author in authors
]

# Merging the duplicate authors (based on id) in a single aggregation:
# per-paper values are collected into lists, while name/org keep the first
# non-null value seen for the author. The dict order fixes the column order
# (id, paper_ids, n_citations, name, org, FOS).
data_authors_df = (
    pd.DataFrame(data_authors)
    .groupby('id')
    .agg({
        'paper_ids': list,
        'n_citations': list,
        'name': 'first',
        'org': 'first',
        'FOS': list,
    })
    .reset_index()
)

# FOS is now a list (one item per paper) of FOS lists; flatten it into a
# single list of FOS entries per author.
data_authors_df['FOS'] = data_authors_df['FOS'].apply(
    lambda per_paper: [entry for paper_fos in per_paper for entry in paper_fos]
)

data_authors_df.info()
data_authors_df.head()

#### Venue table

##### Adding paper IDs and citation to the table

In [None]:
# adding venue raw to paper table: the 'raw' field of each paper's venue
data_papers['venue_raw'] = data_papers['venue'].apply(lambda venue: venue['raw'])

# creating venue table: one record per paper, carrying the venue's own fields
# plus the paper's id and citation count
data_venues = [
    {**venue, "paper_ids": paper_id, "n_citations": n_citation}
    for paper_id, venue, n_citation
    in data_papers[['id', 'venue', 'n_citation']].to_numpy()
]

# Merging the duplicate venues (based on raw) in a single aggregation:
# per-paper values are collected into lists, while id/type keep the first
# non-null value seen for the venue. The dict order fixes the column order
# (raw, paper_ids, n_citations, id, type).
data_venues_df = (
    pd.DataFrame(data_venues)
    .groupby('raw')
    .agg({
        'paper_ids': list,
        'n_citations': list,
        'id': 'first',
        'type': 'first',
    })
    .reset_index()
)

data_venues_df.info()
data_venues_df.head()

#### Productivity 

steps (table):

- option 1:
    - each row: [{'name':"name1",'id':"id1", 'org':"org1"},{'name':"name2",'id':, 'org':""},...]
    - a = [['name1','id1','org1'],...]
    - a = list(set(a))
    - a = [['name1','id1','org1', [paper_id1,paper_id2,..]],...]

- option 2:
    - [ [['name1','id1','org1'],[paper_id1]], ...]

- option 3*:
    - loop the papers and make this:
    - {'id1':['name1','org1',[paper_id1,paper_id2,..]],..}

steps (feature):
- len(papers)
- calculate the final value during the feature file creation

Table: Author (id, name, org, papers, productivity)

In [None]:
# Productivity of an author = number of entries in the author's paper_ids list.
data_authors_df['productivity'] = data_authors_df['paper_ids'].apply(len)
data_authors_df.head()

#### H-index 

steps:
- use the code from before
- calculate the final value during the feature file creation

Table: Author (id, name, org, papers, productivity, h_index)

In [None]:
def _h_index(citations):
    """Count the 1-based ranks whose paper (sorted by citations, highest first) has at least that many citations."""
    ranked = sorted(citations, reverse=True)
    return sum(cited >= rank for rank, cited in enumerate(ranked, start=1))


# H-index of every author, computed from the author's list of citation counts.
data_authors_df['H_index'] = data_authors_df['n_citations'].apply(_h_index)
data_authors_df.head()

#### Author Rank

steps:
- get citation of the papers the author has
- take average
- give rank 

Table: Author (id, name, org, papers, productivity, h_index, citations, ave_cite, author_rank)

In [None]:
# Average citations per paper for every author, then rank the authors on that
# average in descending order (rank 1 = highest average). Ties share the
# averaged rank, which is the default method of Series.rank.
# NOTE(review): venue_rank is computed with method='max' and
# na_option='bottom' - confirm the different tie handling is intended.
data_authors_df['average_citations'] = data_authors_df['n_citations'].apply(mean)
data_authors_df['author_rank'] = data_authors_df['average_citations'].rank(
    ascending=False
)
data_authors_df.head(20)

#### Venue Rank

steps (table):
- same way as author
- loop the papers and make this:
- {'id1':['raw1',[paper_id1,paper_id2,..]],..}


steps (feature):
- get citation of the papers the venue has
- take average
- give rank


table: Venue (id, raw (or name), papers(list of ids), citations, ave_citation, rank)

In [None]:
# Average citations per paper for every venue, then rank the venues on that
# average in descending order: tied venues all receive the highest rank number
# of their group, and missing averages are ranked last.
data_venues_df['ave_citation'] = data_venues_df['n_citations'].apply(mean)
data_venues_df['venue_rank'] = data_venues_df['ave_citation'].rank(
    ascending=False,
    method='max',
    na_option='bottom',
)
data_venues_df.head(10)

#### Maximum Past Influence of Authors (Past Influence of Authors)

steps:
- max of citation

Table: Author (id, name, org, papers, productivity, h_index, citations, ave_cite, author_rank, author_MPI)

In [None]:
# Maximum past influence of an author: the largest citation count among the
# author's papers.
data_authors_df['author_MPI'] = data_authors_df['n_citations'].map(max)
data_authors_df.head(20)

#### Total Past Influence of Authors (Past Influence of Authors)

steps:
- total citation

Table: Author (id, name, org, papers, productivity, h_index, citations, ave_cite, author_rank, author_MPI, author_TPI)

In [None]:
# Total past influence of an author: the citation counts of all the author's
# papers added together.
data_authors_df['author_TPI'] = data_authors_df['n_citations'].map(sum)
data_authors_df.head(20)

#### Maximum Past Influence of Venue (Past Influence of Venue)

steps:
- max of citation

table: Venue (id, raw (or name), papers(list of ids), citations, ave_citation, rank, venue_MPI)

In [None]:
# Maximum past influence of a venue: the largest citation count among the
# venue's papers.
data_venues_df['venue_MPI'] = data_venues_df['n_citations'].map(max)
data_venues_df.head(10)

#### Total Past Influence of Venue (Past Influence of Venue)

steps:
- total citation

table: Venue (id, raw (or name), papers(list of ids), citations, ave_citation, rank, venue_MPI, venue_TPI)

In [None]:
# Total past influence of a venue: the citation counts of all the venue's
# papers added together.
data_venues_df['venue_TPI'] = data_venues_df['n_citations'].map(sum)
data_venues_df.head(10)

#### Versatility 

steps:
- add column topics to table author (format: {'FOS1':average w, 'FOS2':average w, ...})
- go over the FOS of papers of author
    - if FOS in dic, value = value + w/productivity
    - if not, create new key with FOS name and value FOS w/productivity
- what we get per row {'FOS1':average w, 'FOS2':average w, ...}
- loop over the averaged FOS weights and sum their Shannon-index terms
- Shannon index = -Σ w · ln(w), where w is the author's average weight for a topic


Table: Author (id, name, org, papers, productivity, h_index, citations, ave_cite, author_rank, author_MPI, author_TPI, versatility)

In [None]:
def _positive_topic_weights(fos_entries):
    """Return ``[{topic_name: weight}, ...]`` for the entries whose weight is > 0.

    The fields are read by key ('name', 'w') rather than by their position in
    the dict, so the result does not depend on the key order of the entries.
    """
    return [
        {entry['name']: entry['w']}
        for entry in fos_entries
        if entry['w'] > 0.0
    ]


def _versatility(topic_weights, productivity):
    """Return the sum of -p * ln(p) over an author's topics.

    ``topic_weights`` is the list produced by ``_positive_topic_weights``;
    ``p`` is the total weight of a topic divided by ``productivity``.
    """
    # Accumulate the total weight per topic in one pass, keeping the topics in
    # first-seen order so the final float sum is reproducible between runs.
    totals = {}
    for single_topic in topic_weights:
        for topic, weight in single_topic.items():
            totals[topic] = totals.get(topic, 0.0) + weight
    averages = [total / productivity for total in totals.values()]
    # diversity or versatility formula
    return sum(-p * np.log(p) for p in averages)


# Turns every author's FOS entries into single-key {topic: weight} dicts,
# dropping entries with weight 0.
# NOTE: this overwrites the 'FOS' column, so the cell expects the original
# {'name': ..., 'w': ...} entries and is not meant to be run twice in a row.
data_authors_df['FOS'] = data_authors_df['FOS'].apply(_positive_topic_weights)

# Versatility per author, from the author's topic weights and productivity.
data_authors_df['versatility'] = [
    _versatility(topic_weights, productivity)
    for topic_weights, productivity in zip(
        data_authors_df['FOS'], data_authors_df['productivity']
    )
]

data_authors_df.head(20)

### Making the feature set (model input)