In [26]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults: seaborn's default theme with the "ticks" axes style.
# `sns.set()` is only an alias of `sns.set_theme()`, which seaborn documents as
# the preferred spelling; passing `style=` here replaces the separate
# `sns.set_style("ticks")` call.
sns.set_theme(style="ticks")

In [27]:
# Read the papers table from the working directory, then print its column
# summary (names, non-null counts, dtypes).
df_papers = pd.read_csv(filepath_or_buffer="papers_data.csv")
df_papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      2000 non-null   object
 1   summary    2000 non-null   object
 2   url        2000 non-null   object
 3   published  2000 non-null   object
 4   authors    2000 non-null   object
dtypes: object(5)
memory usage: 78.2+ KB


In [28]:
def _paper_description(row):
    """Return the one-line text blob for a paper row: title, `published` value and summary."""
    return f"PAPER: {row['title']} [{row['published']}] | {row['summary']}"


# axis=1 hands each row to the helper so several columns can be combined into one string.
df_papers['description'] = df_papers.apply(_paper_description, axis=1)

In [29]:
# Read the repository table from the working directory, then print its column
# summary (names, non-null counts, dtypes).
df_repos = pd.read_csv(filepath_or_buffer="repo_data.csv")
df_repos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     100 non-null    object 
 1   description              94 non-null     object 
 2   url                      100 non-null    object 
 3   createdAt                100 non-null    object 
 4   updatedAt                100 non-null    object 
 5   diskUsage                100 non-null    int64  
 6   forkCount                100 non-null    int64  
 7   isArchived               100 non-null    bool   
 8   isFork                   100 non-null    bool   
 9   isMirror                 100 non-null    bool   
 10  isPrivate                100 non-null    bool   
 11  stargazers.totalCount    100 non-null    int64  
 12  languages.nodes          100 non-null    object 
 13  licenseInfo.name         73 non-null     object 
 14  owner.login              10

In [30]:
# Align the repo columns with the papers frame: `name` -> `title` and
# `description` -> `summary`.
# The result is assigned back instead of mutating in place, and the rename is
# skipped once a `summary` column exists. A later cell creates a new
# `description` column on this frame, so re-running an unconditional rename
# would turn that derived column into a second `summary` column.
if 'summary' not in df_repos.columns:
    df_repos = df_repos.rename(columns={
        'name': 'title',
        'description': 'summary',
    })

In [31]:
def _repo_description(row):
    """Return the one-line text blob for a repo row.

    Combines the title, `createdAt` value, star count and summary. Some repos
    have no summary (the source column contains nulls); those are rendered
    with an empty summary instead of the literal text "nan".
    """
    summary = row['summary'] if pd.notna(row['summary']) else ''
    return f"REPO: {row['title']} [{row['createdAt']}, stars: {row['stargazers.totalCount']}] | {summary}"


# axis=1 hands each row to the helper so several columns can be combined into one string.
df_repos['description'] = df_repos.apply(_repo_description, axis=1)

In [32]:
# Stack papers and repos into a single frame; ignore_index=True renumbers the
# rows 0..n-1 instead of keeping each source frame's own index.
df_merged = pd.concat([df_papers, df_repos], ignore_index=True)

# Preview a random sample of the shared text columns. The fixed random_state
# keeps the displayed rows identical across re-runs of the notebook.
df_merged.sample(20, random_state=0)[['title', 'url', 'description']]

Unnamed: 0,title,url,description
1859,Bulk and film synthesis pathways to ternary ma...,http://arxiv.org/abs/2306.02233v1,PAPER: Bulk and film synthesis pathways to ter...
1086,Shear Viscosity expression for Graphene system...,http://arxiv.org/abs/2306.14747v2,PAPER: Shear Viscosity expression for Graphene...
1206,Global existence of 2D electron MHD near a ste...,http://arxiv.org/abs/2306.13036v1,PAPER: Global existence of 2D electron MHD nea...
622,Design and processing as ultrathin films of a ...,http://arxiv.org/abs/2307.04591v1,PAPER: Design and processing as ultrathin film...
839,"Strain, Young's modulus, and structural transi...",http://arxiv.org/abs/2307.01022v1,"PAPER: Strain, Young's modulus, and structural..."
213,Interaction-mitigated Landau damping,http://arxiv.org/abs/2307.11169v1,PAPER: Interaction-mitigated Landau damping [2...
469,Better sensing with variable-range interactions,http://arxiv.org/abs/2307.06901v1,PAPER: Better sensing with variable-range inte...
995,Efficient Reduction of Casimir Forces by Self-...,http://arxiv.org/abs/2306.16209v1,PAPER: Efficient Reduction of Casimir Forces b...
807,Electronic structure of the Ge/Si(105) hetero-...,http://arxiv.org/abs/2307.01604v1,PAPER: Electronic structure of the Ge/Si(105) ...
2046,geddit-app,https://github.com/kaangiray26/geddit-app,"REPO: geddit-app [2023-07-01T18:03:01Z, stars:..."


In [33]:
# (rows, columns) of the merged frame.
df_merged.shape

(2100, 24)

In [34]:
# Write the complete merged table first, then a slim copy that keeps only the
# title, url and generated description columns. Neither file includes the index.
df_merged.to_csv("merged_data_full.csv", index=False)

slim_columns = ['title', 'url', 'description']
df_merged.loc[:, slim_columns].to_csv("merged_data.csv", index=False)