# Feature Extraction

#### Adding the libraries

In [None]:
import pandas as pd
import matplotlib as plt
import ast
import numpy as np
from statistics import mean
import pickle

#### Adding the database

In [None]:
# Load the cleaned papers table from disk and preview its first rows.
# NOTE: read_pickle unpickles arbitrary objects, so only load trusted files.
papers_pickle_path = '../Data/papers0_2010_clean_referenced.pkl'
data_papers = pd.read_pickle(papers_pickle_path)
data_papers.head()

#### The list of features:

* Diversity
* Productivity
* H-index
* Author Rank
* Venue Rank
* Maximum Past Influence of Authors (Past Influence of Authors)
* Total Past Influence of Authors (Past Influence of Authors)
* Maximum Past Influence of Venues (Past Influence of Venues)
* Total Past Influence of Venues (Past Influence of Venues)
* Versatility
* Novelty
* Sociality
* Authority
* Venue Centrality
* First two years performance 
* Yearly citations 

#### Features that must be merged across several rows (e.g. per-author values for multi-author papers) are calculated during feature file creation

#### Diversity

In [None]:
# Diversity formula: entropy-style sum of -w * ln(w) over a paper's
# field-of-study ('fos') entries, skipping entries whose weight is not positive.
def _fos_entropy(fos_entries):
    """Return the sum of -w * log(w) over entries whose weight 'w' is > 0."""
    return sum([-entry['w'] * np.log(entry['w']) for entry in fos_entries if entry['w'] > 0.0])


data_papers['diversity'] = data_papers['fos'].apply(_fos_entropy)

Analyze the feature

In [None]:
# Preview and summarise the diversity column, then print its correlation
# with each of the citation-count columns in turn.
diversity_column = data_papers['diversity']
print(diversity_column.head())
print(diversity_column.describe())
for citation_column in ('n_citation', 'citation', 'citation_5yr', 'citation_10yr'):
    print(data_papers[citation_column].corr(diversity_column))

### Author table

##### Adding paper IDs and citation to the table

In [None]:
# Per-paper list of author ids, read from the 'id' of every entry in 'authors'.
data_papers['author_id'] = data_papers['authors'].apply(
    lambda paper_authors: [author['id'] for author in paper_authors]
)

# Build one record per (paper, author) pair: the author's own fields plus the
# paper's id, citation count, fields of study and year.
author_source = data_papers[['id', 'authors', 'citation', 'fos', 'year']].to_numpy()
data_authors = []
for paper_id, paper_authors, paper_citation, paper_fos, paper_year in author_source:
    for author in paper_authors:
        data_authors.append({
            **author,
            'paper_ids': paper_id,
            'n_citations': paper_citation,
            'FOS': paper_fos,
            'year': paper_year,
        })
data_authors_df = pd.DataFrame(data_authors)

# Collapse to one row per author id; every other column becomes a list with
# one element per paper of that author.
data_authors_df = data_authors_df.groupby(['id']).agg(list).reset_index()
data_authors_df['id'] = data_authors_df['id'].astype(np.int64)

# 'FOS' is now a list (per paper) of lists (fos entries); flatten it into a
# single list of fos entries per author.
data_authors_df['FOS'] = data_authors_df['FOS'].apply(
    lambda fos_per_paper: [entry for fos_list in fos_per_paper for entry in fos_list]
)

data_authors_df.info()
data_authors_df.head()

In [None]:
# Print the author table's column summary, then display its first rows.
data_authors_df.info()
data_authors_df.head()

### Venue table

##### Adding paper IDs and citation to the table

In [None]:
# Adding venue raw to paper table: the 'raw' field of each paper's 'venue' dict.
data_papers['venue_raw'] = data_papers['venue'].apply(lambda venue: venue['raw'])

# Creating venue table: one record per paper, holding the venue's own fields
# plus the paper's id and citation count.
data_venues = [
    {**venue, 'paper_ids': paper_id, 'n_citations': n_citation}
    for paper_id, venue, n_citation in data_papers[['id', 'venue', 'n_citation']].to_numpy()
]
data_venues_df = pd.DataFrame(data_venues)

# Merging the duplicate venues (based on the raw venue name); every other
# column becomes a list with one element per paper published in that venue.
data_venues_df = data_venues_df.groupby(['raw']).agg(list).reset_index()


def _first_valid(values):
    """Return the first non-null element of values, or NaN when there is none."""
    return next((value for value in values if pd.notna(value)), np.nan)


# Fix: the original overwrote 'id' with data_venues_df['raw'].astype(np.int64).
# 'raw' is the venue name used as the grouping key, so that cast raises
# ValueError for any non-numeric name and discards the grouped venue ids.
# Keep one representative venue id per venue instead.
# NOTE(review): assumes every paper of a venue carries the same venue id - confirm.
if 'id' in data_venues_df.columns:
    data_venues_df['id'] = data_venues_df['id'].apply(_first_valid)
else:
    # Later cells drop the 'id' column, so make sure it always exists.
    data_venues_df['id'] = np.nan

data_venues_df.info()
data_venues_df.head()

#### Productivity 

In [None]:
# Productivity: how many paper ids are listed for each author.
paper_id_lists = data_authors_df['paper_ids']
data_authors_df['productivity'] = paper_id_lists.apply(len)
data_authors_df.head()

#### H-index 

In [None]:
# H-index: with an author's citation counts sorted from highest to lowest,
# count the positions (1-based) whose citation count is at least the position.
def _h_index(citation_counts):
    """Return how many papers have a citation count >= their descending rank."""
    ranked = sorted(citation_counts, reverse=True)
    return sum(count >= rank for rank, count in enumerate(ranked, start=1))


data_authors_df['H_index'] = data_authors_df['n_citations'].apply(_h_index)
data_authors_df.head()

#### Author Rank

In [None]:
# Author rank: average each author's citation counts, then rank the averages
# in descending order so the highest average gets the smallest rank value.
author_average_citations = data_authors_df['n_citations'].apply(mean)
data_authors_df['average_citations'] = author_average_citations
data_authors_df['author_rank'] = author_average_citations.rank(ascending=False)
data_authors_df.head()

#### Venue Rank

In [None]:
# Venue rank: average each venue's citation counts, then rank the averages
# in descending order so the highest average gets the smallest rank value.
venue_average_citations = data_venues_df['n_citations'].apply(mean)
data_venues_df['ave_citation'] = venue_average_citations
data_venues_df['venue_rank'] = venue_average_citations.rank(ascending=False)
data_venues_df.head()

#### Maximum Past Influence of Authors (Past Influence of Authors)

In [None]:
# Maximum past influence of an author: the largest citation count among
# the author's papers.
author_citation_lists = data_authors_df['n_citations']
data_authors_df['author_MPI'] = author_citation_lists.apply(max)
data_authors_df.head()

#### Total Past Influence of Authors (Past Influence of Authors)

In [None]:
# Total past influence of an author: the citation counts of all the
# author's papers added together.
author_citation_lists = data_authors_df['n_citations']
data_authors_df['author_TPI'] = author_citation_lists.apply(sum)
data_authors_df.head()

#### Maximum Past Influence of Venue (Past Influence of Venue)

In [None]:
# Maximum past influence of a venue: the largest citation count among the
# papers listed for that venue.
venue_citation_lists = data_venues_df['n_citations']
data_venues_df['venue_MPI'] = venue_citation_lists.apply(max)
data_venues_df.head()

#### Total Past Influence of Venue (Past Influence of Venue)

In [None]:
# Total past influence of a venue: the citation counts of all papers listed
# for that venue added together.
venue_citation_lists = data_venues_df['n_citations']
data_venues_df['venue_TPI'] = venue_citation_lists.apply(sum)
data_venues_df.head()

#### Versatility 

In [None]:
# Versatility: for every author, sum each topic's positive weights over all of
# the author's fos entries, divide by the author's productivity, and combine
# the resulting per-topic values p with the same formula as diversity:
# sum(-p * ln(p)).
#
# Changes from the original cell:
#   * fos entries are read through their 'name' / 'w' keys instead of by the
#     position of the dict values, matching how the diversity cell reads 'w';
#   * per-topic totals are accumulated in one pass instead of rescanning the
#     whole FOS list once per topic.
# The set of terms is unchanged; only the order in which they are added can
# differ, which affects the result at floating-point rounding level at most.
def _positive_topic_weights(fos_entries):
    """Return one {topic_name: weight} dict per entry whose weight 'w' is > 0."""
    return [{entry['name']: entry['w']} for entry in fos_entries if entry['w'] > 0.0]


def _versatility(topic_weights, n_papers):
    """Return sum(-p * log(p)) where p is a topic's summed weight / n_papers.

    topic_weights is a list of single-entry {topic_name: weight} dicts with
    positive weights; an empty list yields 0.
    """
    totals = {}
    for pair in topic_weights:
        for topic, weight in pair.items():
            totals[topic] = totals.get(topic, 0) + weight
    shares = [total / n_papers for total in totals.values()]
    return sum([-share * np.log(share) for share in shares])


# As before, 'FOS' is replaced by the list of {topic: weight} dicts with w > 0.
data_authors_df['FOS'] = data_authors_df['FOS'].apply(_positive_topic_weights)

# Requires the 'productivity' column computed in the Productivity cell.
data_authors_df['versatility'] = [
    _versatility(topic_weights, n_papers)
    for topic_weights, n_papers in zip(data_authors_df['FOS'], data_authors_df['productivity'])
]

data_authors_df.head()

### Making the feature set (model input)

In [None]:
# Start the feature set from the paper-level columns; copy so that later
# edits to features_df do not touch data_papers.
paper_feature_columns = ['id', 'author_id', 'venue_raw', 'diversity', 'n_citation']
features_df = data_papers[paper_feature_columns].copy()

In [None]:
# Venue-level features keyed by venue name: rename the key to match
# features_df and drop the columns that are not model inputs.
features_df2 = (
    data_venues_df
    .rename(columns={"raw": "venue_raw"})
    .drop(columns=['id', 'type', 'ave_citation', 'paper_ids', 'n_citations'])
)

# Inner merge: only papers whose venue_raw appears in the venue table are kept.
features_df = features_df.merge(features_df2, on='venue_raw', how='inner')

features_df.info()
features_df.head()

In [None]:
# Re-key the author table by paper: drop the columns no longer needed, emit
# one row per (author, paper) pair, then regroup by paper so every remaining
# column holds a list with one element per author of that paper.
data_authors_df = (
    data_authors_df
    .drop(columns=['n_citations', 'FOS', 'name', 'org', 'average_citations'])
    .explode('paper_ids')
    .groupby('paper_ids')
    .agg(list)
    .reset_index()
)
data_authors_df['paper_ids'] = data_authors_df['paper_ids'].astype(np.int64)

data_authors_df.info()
data_authors_df.head()

In [None]:
# Align the key names with features_df: the list of author ids becomes 'idA'
# and the paper id becomes 'id', the column the merge below joins on.
data_authors_df.rename(columns={"id": "idA", "paper_ids": "id"}, inplace=True)

# Fix: the original merged an undefined name `df` (NameError); the table that
# was just renamed for this join is data_authors_df.
# Inner merge: only papers present in both tables are kept.
features_df = features_df.merge(data_authors_df, on='id', how='inner')

# Each author-level column holds a list with one value per author of the
# paper; reduce it to the mean over the paper's authors.
author_feature_columns = (
    'productivity', 'H_index', 'author_rank',
    'author_MPI', 'author_TPI', 'versatility',
)
for author_feature in author_feature_columns:
    features_df[author_feature] = features_df[author_feature].apply(mean)

# Keep only the model inputs, with the n_citation column last.
features_df = features_df[['id', 'diversity', 'venue_rank', 'venue_MPI', 'venue_TPI', 'productivity', 'H_index', 'author_rank', 'author_MPI', 'author_TPI', 'versatility', 'n_citation']]

features_df.info()
features_df.head()

In [None]:
# Save the feature table as a pickle; `path` is the shared file stem and
# the extension is appended to it.
path = '../Data/features_2000_2010.'
pickle_target = path + 'pkl'
features_df.to_pickle(pickle_target)

In [None]:
# Save the feature table as CSV; `path` is the shared file stem and the
# extension is appended to it.
path = '../Data/features_2000_2010.'
csv_target = path + 'csv'
features_df.to_csv(csv_target)