In [1]:
import pandas as pd

from IPython.display import display
from utils import read_paper_contents

from data_pipeline import PreprocessorPipeline, TimeseriesFeatureEngineerPipeline, CitedCountFeatureEngineerPipeline, SharedCountProbabilityFeatureEngineerPipeline

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Read the four competition CSVs in one pass. The unpacking order matches the
# path order below: train pairs, test pairs, paper metadata, sample submission.
train_df, test_df, metadata_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train.csv",
        "data/test.csv",
        "data/papers_metadata.csv",
        "data/sample_submission.csv",
    )
)

# Preview the first rows of each table; display() renders every argument in turn.
display(
    train_df.head(),
    test_df.head(),
    sample_submission_df.head(),
    metadata_df.head(),
)


Unnamed: 0,paper,referenced_paper,is_referenced
0,p2128,p3728,0
1,p0389,p3811,0
2,p1298,p3760,0
3,p0211,p1808,0
4,p0843,p2964,0


Unnamed: 0,id,paper,referenced_paper
0,0,p0913,p3488
1,1,p2971,p4337
2,2,p2237,p1610
3,3,p2876,p3212
4,4,p2939,p1901


Unnamed: 0,id,is_referenced
0,0,-1
1,1,-1
2,2,-1
3,3,-1
4,4,-1


Unnamed: 0,paper_id,doi,title,publication_year,publication_date,cited_by_count,type,authors,concepts
0,p0000,https://doi.org/10.1161/circulationaha.115.001593,Machine Learning in Medicine,2015,11/16/2015,2662,review,Rahul C. Deo,Medicine; Medical physics; Medical education; ...
1,p0001,https://doi.org/10.1504/ijmmno.2013.055204,A literature survey of benchmark functions for...,2013,1/1/2013,1138,article,Momin Jamil; Xin‐She Yang,Benchmark (surveying); Set (abstract data type...
2,p0002,https://doi.org/10.1109/icip.2017.8296547,Abnormal event detection in videos using gener...,2017,9/1/2017,486,article,Mahdyar Ravanbakhsh; Moin Nabi; Enver Sanginet...,Abnormality; Computer science; Artificial inte...
3,p0003,https://doi.org/10.3115/v1/p15-1001,On Using Very Large Target Vocabulary for Neur...,2015,1/1/2015,916,article,Sébastien Jean; Kyunghyun Cho; Roland Memisevi...,Machine translation; Computer science; Vocabul...
4,p0004,https://doi.org/10.1109/tpami.2007.1167,Gaussian Process Dynamical Models for Human Mo...,2007,12/20/2007,1016,article,Jonathan M. Wang; David J. Fleet; Aaron Hertzmann,Gaussian process; Artificial intelligence; Lat...


In [3]:
# Load the per-paper text files from the paper database directory.
# The helper prints one "Successfully read file <name>.txt" line per file.
# NOTE(review): the structure of the returned object is defined in utils.py — confirm there.
paper_database_dir = "data/Paper Database/Paper Database"
paper_contents = read_paper_contents(paper_database_dir)

Successfully read file p0000.txt
Successfully read file p0001.txt
Successfully read file p0002.txt
Successfully read file p0003.txt
Successfully read file p0004.txt
Successfully read file p0005.txt
Successfully read file p0006.txt
Successfully read file p0007.txt
Successfully read file p0008.txt
Successfully read file p0009.txt
Successfully read file p0010.txt
Successfully read file p0011.txt
Successfully read file p0012.txt
Successfully read file p0013.txt
Successfully read file p0014.txt
Successfully read file p0015.txt
Successfully read file p0016.txt
Successfully read file p0017.txt
Successfully read file p0018.txt
Successfully read file p0019.txt
Successfully read file p0020.txt
Successfully read file p0021.txt
Successfully read file p0022.txt
Successfully read file p0023.txt
Successfully read file p0024.txt
Successfully read file p0025.txt
Successfully read file p0026.txt
Successfully read file p0027.txt
Successfully read file p0028.txt
Successfully read file p0029.txt
Successful

In [4]:
# Input columns consumed by each feature-engineering step.
# NOTE(review): the *_original / *_referenced columns are not in train_df itself;
# presumably PreprocessorPipeline adds them from metadata_df — confirm in data_pipeline.
timeseries_columns = [
    "publication_year_original",
    "publication_year_referenced",
    "publication_date_original",
    "publication_date_referenced",
]
cited_count_columns = [
    "cited_by_count_original",
    "cited_by_count_referenced",
]

# Route each column group to its feature engineer; every other column is passed
# through unchanged (it shows up with a "remainder__" prefix in the output).
column_transformer = ColumnTransformer(
    transformers=[
        (
            "timeseries_feature_engineer",
            TimeseriesFeatureEngineerPipeline(),
            timeseries_columns,
        ),
        (
            "cited_count_feature_engineer",
            CitedCountFeatureEngineerPipeline(),
            cited_count_columns,
        ),
    ],
    remainder="passthrough",
)
# Ask for pandas DataFrames (not NumPy arrays) from transform / fit_transform.
FeatureEngineerTransformer = column_transformer.set_output(transform="pandas")

# Full preprocessing chain: metadata-aware preprocessing, then feature engineering.
preprocessor = Pipeline(
    steps=[
        ("preprocessor", PreprocessorPipeline(metadata_df)),
        ("feature_engineer", FeatureEngineerTransformer),
    ]
)

# Fit on the training pairs; as the cell's last expression the result is rendered.
preprocessor.fit_transform(train_df)


Unnamed: 0,timeseries_feature_engineer__year_difference,timeseries_feature_engineer__is_original_before_referenced,timeseries_feature_engineer__positive_year_difference,cited_count_feature_engineer__cited_by_count_difference,cited_count_feature_engineer__positive_cited_by_count_difference,remainder__paper,remainder__referenced_paper,remainder__is_referenced,remainder__doi_original,remainder__title_original,remainder__type_original,remainder__authors_original,remainder__concepts_original,remainder__doi_referenced,remainder__title_referenced,remainder__type_referenced,remainder__authors_referenced,remainder__concepts_referenced
0,3,1,1,-2135,0,p2128,p3728,0,https://doi.org/10.18653/v1/2021.findings-acl.84,A Survey of Data Augmentation Approaches for NLP,article,Steven Y. Feng; Varun Gangal; Jason Wei; Sarat...,Computer science; Popularity; Artificial intel...,https://doi.org/10.1137/16m1080173,Optimization Methods for Large-Scale Machine L...,article,Léon Bottou; Frank E. Curtis; Jorge Nocedal,Computer science; Machine learning; Artificial...
1,-24,0,0,-97,0,p0389,p3811,0,https://doi.org/10.1016/b978-1-55860-377-6.500...,Residual Algorithms: Reinforcement Learning wi...,book-chapter,Leemon C. Baird,Residual; Algorithm; Reinforcement learning; C...,https://doi.org/10.1109/cvpr.2019.00447,Filter Pruning via Geometric Median for Deep C...,article,Yang He; Ping Liu; Ziwei Wang; Zhilan Hu; Yi Yang,FLOPS; Computer science; Convolutional neural ...
2,2,1,1,16073,1,p1298,p3760,0,https://doi.org/10.1109/tpami.2016.2644615,SegNet: A Deep Convolutional Encoder-Decoder A...,article,Vijay Badrinarayanan; A. C. Kendall; Roberto C...,Computer science; Artificial intelligence; Ups...,https://doi.org/10.1002/pmic.201500396,Integrative methods for analyzing big data in ...,review,Vladimir Gligorijević; Noël Malod‐Dognin; Nata...,Big data; Data science; Precision medicine; Re...
3,7,1,1,17162,1,p0211,p1808,0,https://doi.org/10.1109/tpami.2017.2699184,DeepLab: Semantic Image Segmentation with Deep...,article,Liang-Chieh Chen; George Papandreou; Iasonas K...,Conditional random field; Artificial intellige...,https://doi.org/10.1609/aimag.v31i3.2303,Building Watson: An Overview of the DeepQA Pro...,article,David Ferrucci; Eric W. Brown; Jennifer Chu‐Ca...,Watson; Champion; IBM; Computer science; Archi...
4,26,1,1,154,1,p0843,p2964,0,https://doi.org/10.1007/s11831-021-09694-4,Particle Swarm Optimization Algorithm and Its ...,review,Ahmed G. Gad,Particle swarm optimization; Swarm intelligenc...,https://doi.org/10.1007/bf00114723,Linear Least-Squares algorithms for temporal d...,article,Steven J. Bradtke; Andrew G. Barto,Recursive least squares filter; Algorithm; Tem...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410686,-20,0,0,746,1,p3478,p2966,0,https://doi.org/10.1016/0031-3203(95)00169-7,A survey on evaluation methods for image segme...,article,Y.J. Zhang,Goodness of fit; Evaluation methods; Computer ...,https://doi.org/10.1609/aaai.v30i1.10465,Reading Scene Text in Deep Convolutional Seque...,article,Pan He; Weilin Huang; Yu Qiao; Chen Change Loy...,Computer science; Artificial intelligence; Con...
410687,5,1,1,-1292,0,p0719,p2382,1,https://doi.org/10.3758/s13423-017-1343-3,Bayesian inference for psychology. Part I: The...,article,Eric‐Jan Wagenmakers; Maarten Marsman; Tahira ...,Bayesian probability; Bayesian statistics; Bay...,https://doi.org/10.1038/483531a,Raise standards for preclinical cancer research,article,C. Glenn Begley; Lee M. Ellis,Cancer; Computational biology; Medicine; Biolo...
410688,-3,0,0,2052,1,p1805,p3209,0,https://doi.org/10.1186/s12880-015-0068-x,Metrics for evaluating 3D medical image segmen...,article,Abdel Aziz Taha; Allan Hanbury,Computer science; Segmentation; Metric (unit);...,https://doi.org/10.18653/v1/w18-6557,E2E NLG Challenge: Neural Models vs. Templates,article,Yevgeniy Puzikov; Iryna Gurevych,Computer science; Task (project management); T...
410689,2,1,1,587,1,p4213,p0457,0,https://doi.org/10.1109/access.2020.2994762,CovidGAN: Data Augmentation Using Auxiliary Cl...,article,Abdul Waheed; Muskan Goyal; Deepak Gupta; Ashi...,Coronavirus disease 2019 (COVID-19); Convoluti...,https://doi.org/10.18653/v1/w18-6478,Dual Conditional Cross-Entropy Filtering of No...,article,Marcin Junczys-Dowmunt,Computer science; Artificial intelligence; Mac...


In [5]:
# Render the pipeline object as this cell's output (bare last expression).
# The recorded output includes a scikit-learn FutureWarning: the 'remainder'
# columns in ColumnTransformer.transformers_ are currently stored as int indices
# and will be stored as column names (str) from version 1.7.
preprocessor

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).

