In [211]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
import random
from datetime import timedelta

In [212]:
path = '/Users/danilakokin/Desktop/UPC/Semester2/SDM/SDM_Project/joined'

In [213]:
article_df = pd.read_csv('/Users/danilakokin/Desktop/UPC/Semester2/SDM/SDM_Project/joined/article.csv', low_memory=False)

In [214]:
article_df.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424247 entries, 0 to 3424246
Data columns (total 36 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Unnamed: 0             int64  
 1   article:ID             int64  
 2   author:string[]        object 
 3   author-aux:string      object 
 4   author-orcid:string[]  object 
 5   booktitle:string       object 
 6   cdate:date             object 
 7   cdrom:string           object 
 8   cite:string[]          object 
 9   cite-label:string[]    object 
 10  crossref:string        object 
 11  editor:string[]        object 
 12  editor-orcid:string[]  object 
 13  ee:string[]            object 
 14  ee-type:string[]       object 
 15  i:string[]             object 
 16  journal:string         object 
 17  key:string             object 
 18  mdate:date             object 
 19  month:string           object 
 20  note:string[]          object 
 21  note-label:string      object 
 22  note-type:string[]

In [215]:
def preprocess_articles(df):
    df['id'] = df['article:ID'].astype(str)
    df['authors'] = df['author:string[]'].apply(lambda x: x.split('|') if pd.notnull(x) else pd.NA)
    df['journal'] = df['journal:string'].astype(str).str.strip()
    df['date'] = df['mdate:date']
    df['title'] = df['title:string']
    df['volume'] = df['volume:string']
    df['year'] = df['year:int'].apply(lambda x: int(x) if pd.notnull(x) else pd.NA)
    
    df = df.dropna(subset=['author:string[]', 'journal:string', 'volume:string'])
    
    return df[['id', 'authors', 'journal', 'date', 'title', 'volume', 'year']].copy()

article_clean_df = preprocess_articles(article_df).sample(n=5000).copy()
article_clean_df

Unnamed: 0,id,authors,journal,date,title,volume,year
1300689,8300302,"[Hongye Su, Meng Zhang 0011, Pablo Borja 0001,...",IEEE Trans. Autom. Control.,2022-12-01,PID Passivity-Based Control of Port-Hamiltonia...,63,2018
323122,7320234,"[Aisha Khan, Faisal Khan, Hashim Khan, Sharif ...",Int. J. Cyber Behav. Psychol. Learn.,2022-06-09,Distress Tolerance in the Context of Emotional...,12,2022
928320,7925844,"[Bart Demoen, Phuong-Lan Nguyen]",Theory Pract. Log. Program.,2020-02-13,Representation sharing for Prolog.,13,2013
1788561,8789150,"[Hui Wei 0001, Luping Wang]",IEEE CAA J. Autom. Sinica,2020-10-26,Avoiding non-Manhattan obstacles based on proj...,7,2020
2977488,9979909,"[Derick Moreira Baum, Euclides C. Pinto Neto, ...",CoRR,2020-03-06,Trajectory-Based Urban Air Mobility (UAM) Oper...,abs/1908.08651,2019
...,...,...,...,...,...,...,...
1866599,8867188,"[Charlotte Rodriguez, Günter Leugering]",SIAM J. Control. Optim.,2021-01-09,Boundary Feedback Stabilization for the Intrin...,58,2020
954495,7954033,"[Diego Gutierrez, Fernando Navarro, Francisco ...",Vis. Comput.,2023-03-21,Interactive HDR lighting of dynamic participat...,25,2009
2578897,9579883,"[J. Jithish, Krishnashree Achuthan, Sriram San...",J. Ambient Intell. Humaniz. Comput.,2021-02-24,A Decision-centric approach for secure and ene...,12,2021
641851,7639245,[Wayne D. Gray],Top. Cogn. Sci.,2020-10-01,"Introduction to Volume 5, Issue 4 of",5,2013


In [216]:
def collect_values_to_set(df, column_name):
    unique_values_set = set()
    for row in df[column_name]:
        unique_values_set.update(row)
    return unique_values_set

In [217]:
article_authors_set = collect_values_to_set(article_clean_df, 'authors')

In [218]:
print(article_authors_set)

{'Nikos Athanasiou', 'Kimmo Kaski', 'Eike Neumann', 'Dingming Xie', 'Kourosh Khoshelham', 'Meng Wang 0017', 'Sunghyun Kim 0001', 'Claudio D. G. Linhares', 'Qingguo Li', 'Marcelo Wilson Furlan Matos Alves', 'Alan Blair', 'Edmund K. Burke', 'Sarah M. Keating', 'Daniele Cesarini', 'Neema Navai', 'Momen K. Yacoub', 'Xinyu Lin', 'Shi-Jim Yen', 'Nasir Ghani', 'Chun Yong Wang', 'Dong Xiao', 'John W. Sammon Jr.', 'Patricio Farrell', 'Ke Wang 0010', 'Yun Gu', 'T. Revathi', 'Gong Zhang 0002', 'Luc Lismont', 'Ruifeng Wu', 'Charles L. Epstein', 'David S. Wack', 'Kyungmi Chung', 'Louis B. Swartz', 'J. K. Aggarwal', 'Chia-Wei Hsiao', 'Sizhe Chen', 'Qingyu Yin', 'Lluís Godo', 'Ilan Rusnak', 'Wen-Hsing Kuo', 'A. Jefferson Offutt', 'Jia Di', 'Wenchao Gou', 'Siddhant Garg', 'Honglei Zhang', 'Beng Chin Ooi', 'Angela Colantonio', 'Yulan He 0001', 'Reinhard Brandstetter', 'Ronan Collobert', 'Hongyu Yu', 'Agneta H. Fischer', 'Ferdinando Cicalese', 'Panagiotis Botsinis', 'Hai Zhao', 'Hua Han', 'Christopher K

### AUTHOR ###

In [219]:
def read_custom_file(file_path, low_memory=False):
    df = pd.read_csv(file_path, sep=';', header=None, low_memory=low_memory)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

authors_df = read_custom_file(f'{path}/author.csv')
authors_df.head(5)

Unnamed: 0,:ID,author:string
1,10597993,Manish Singh 0001
2,10597994,Tien-Tsin Wong
3,10597995,Sylvia C. Pont
4,10597996,Daniel C. Alexander
5,10597997,Kazuhiro Fukui


In [220]:
def random_date(start, end):
    start_date = pd.to_datetime(start)
    end_date = pd.to_datetime(end)
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = random.randrange(days_between_dates)
    random_date = start_date + timedelta(days=random_number_of_days)
    return random_date.strftime('%Y-%m-%d')

In [221]:
def preprocess_authors(df):
    # Assign columns
    df['id'] = df[':ID'].astype(str)
    df['full_name'] = df['author:string']

    return df[['id', 'full_name']]

In [222]:
def filter_rows_by_values(df, column_name, values_set):
    if df[column_name].dtype != 'category' and df[column_name].nunique() / len(df) < 0.1:
        df[column_name] = pd.Categorical(df[column_name], categories=values_set, ordered=False)
    return df.loc[df[column_name].isin(values_set)]

In [223]:
authors_clean_df = preprocess_authors(authors_df)

In [224]:
authors_clean_df = filter_rows_by_values(authors_clean_df, 'full_name', article_authors_set)
authors_clean_df['birth_date'] = [random_date('1954-01-01', '1994-01-01') for _ in range(len(authors_clean_df))]
authors_clean_df.to_csv('authors_test.csv',index=False)

In [225]:
authors_clean_df.to_csv('authors_test.csv',index=False)

In [226]:
def make_mapping(df, key_column, value_column):
  return pd.Series(df[value_column].values, index=df[key_column]).to_dict()

author_to_id_map = make_mapping(authors_clean_df, 'full_name', 'id')
print(author_to_id_map)

{'J. K. Aggarwal': '10598002', 'Juergen Gall': '10598004', 'Jürgen Beyerer': '10598009', 'Szymon Rusinkiewicz': '10598010', 'Yu-Wing Tai': '10598021', 'Ying Nian Wu': '10598027', 'Tony Lindeberg': '10598036', 'Nick Barnes': '10598041', 'Srikumar Ramalingam': '10598065', 'Ashok Veeraraghavan': '10598096', 'Richard G. Baraniuk': '10598099', 'Gian Luca Foresti': '10598138', 'Ruzena Bajcsy': '10598188', 'Zhe Lin 0001': '10598194', 'Robert J. Woodham': '10598198', 'Reinoud Maex': '10598260', 'Gustavo Deco': '10598279', 'Barbara J. Knowlton': '10598370', 'Nicolas Brunel': '10598428', 'Joerg F. Hipp': '10598446', 'Edward W. Large': '10598542', 'Gaute T. Einevoll': '10598548', 'Ranu Jung': '10598553', 'Wei Wei': '10598566', 'Dagmar Waltemath': '10598574', 'Nicolas Le Novère': '10598575', 'Rosalyn J. Moran': '10598613', 'Nikos K. Logothetis': '10598617', 'Alex H. Williams': '10598688', 'Henning Hermjakob': '10598700', 'Hiroto Ogawa': '10598701', 'Michael Hucka': '10598710', 'Laurenz Wiskott': '

In [227]:
def create_columns_from_authors(df, author_col_name, reviewers_set):
    # Extract the core_author and co_authors
    df['core_author'] = df[author_col_name].apply(lambda authors: author_to_id_map[authors[0]] if authors else pd.NA)
    df['co_authors'] = df[author_col_name].apply(lambda authors: [author_to_id_map.get(a) for a in authors[1:]] if len(authors) > 1 else pd.NA)

    # Function to choose a reviewer not in the authors list
    def choose_reviewer(authors):
        return author_to_id_map[np.random.choice(list(reviewers_set.difference(authors)))]

    # Apply the function to each row
    df['reviewer'] = df[author_col_name].apply(choose_reviewer)

    return df

# Apply the function to your DataFrame
article_clean_df = create_columns_from_authors(article_clean_df, 'authors', article_authors_set)
article_clean_df.head()

Unnamed: 0,id,authors,journal,date,title,volume,year,core_author,co_authors,reviewer
1300689,8300302,"[Hongye Su, Meng Zhang 0011, Pablo Borja 0001,...",IEEE Trans. Autom. Control.,2022-12-01,PID Passivity-Based Control of Port-Hamiltonia...,63,2018,10810616,"[10822806, 10821362, 10810615, 10816698]",10609394
323122,7320234,"[Aisha Khan, Faisal Khan, Hashim Khan, Sharif ...",Int. J. Cyber Behav. Psychol. Learn.,2022-06-09,Distress Tolerance in the Context of Emotional...,12,2022,10966521,"[10615312, 11015514, 13785544]",12648433
928320,7925844,"[Bart Demoen, Phuong-Lan Nguyen]",Theory Pract. Log. Program.,2020-02-13,Representation sharing for Prolog.,13,2013,10640728,[11999078],10883687
1788561,8789150,"[Hui Wei 0001, Luping Wang]",IEEE CAA J. Autom. Sinica,2020-10-26,Avoiding non-Manhattan obstacles based on proj...,7,2020,10781875,[10747248],11669605
2977488,9979909,"[Derick Moreira Baum, Euclides C. Pinto Neto, ...",CoRR,2020-03-06,Trajectory-Based Urban Air Mobility (UAM) Oper...,abs/1908.08651,2019,12816082,"[11664447, 11019945, 11019944, 11019943]",13150223


In [228]:
article_clean_df = article_clean_df.drop('authors', axis=1)
article_clean_df.head()

Unnamed: 0,id,journal,date,title,volume,year,core_author,co_authors,reviewer
1300689,8300302,IEEE Trans. Autom. Control.,2022-12-01,PID Passivity-Based Control of Port-Hamiltonia...,63,2018,10810616,"[10822806, 10821362, 10810615, 10816698]",10609394
323122,7320234,Int. J. Cyber Behav. Psychol. Learn.,2022-06-09,Distress Tolerance in the Context of Emotional...,12,2022,10966521,"[10615312, 11015514, 13785544]",12648433
928320,7925844,Theory Pract. Log. Program.,2020-02-13,Representation sharing for Prolog.,13,2013,10640728,[11999078],10883687
1788561,8789150,IEEE CAA J. Autom. Sinica,2020-10-26,Avoiding non-Manhattan obstacles based on proj...,7,2020,10781875,[10747248],11669605
2977488,9979909,CoRR,2020-03-06,Trajectory-Based Urban Air Mobility (UAM) Oper...,abs/1908.08651,2019,12816082,"[11664447, 11019945, 11019944, 11019943]",13150223


In [229]:
article_set = set(article_clean_df['id'].tolist())

In [230]:
print(article_set)

{'7689171', '10308856', '9739504', '8557151', '7578455', '10086962', '10084747', '10091612', '9164643', '8937512', '7716068', '8429222', '7451365', '8424551', '8675917', '9339246', '8377402', '10427760', '10401399', '7061805', '8720515', '8088702', '9343843', '10403796', '8276329', '8289626', '8545371', '9911477', '9081754', '9612867', '7307145', '8903387', '7354175', '8946741', '9255708', '8676967', '8494934', '9064404', '8055261', '8720316', '7021903', '9961654', '8790762', '8001645', '10224772', '10283167', '8918355', '7856646', '9235922', '10381168', '8312297', '7742044', '8178812', '10374119', '7611713', '7264394', '8682030', '8231884', '7236296', '9634527', '8938210', '9516215', '9390728', '8730650', '9727966', '8808591', '9319979', '7622276', '8482850', '9002964', '8619458', '10045671', '8256615', '7228924', '8103518', '7135257', '7892572', '7842884', '7057966', '8196524', '9459993', '8274730', '7893895', '8338967', '8767563', '8424886', '8103899', '7875657', '7408096', '9947931

In [231]:
def create_citations_column(df, article_id_col, article_ids_set):
    def choose_citations(current_article_id, available_ids):
        possible_ids = available_ids.difference({current_article_id})
        num_citations = np.random.randint(1, 6)
        return list(np.random.choice(list(possible_ids), size=num_citations, replace=False))
    
    # Apply the function to each row to create the 'citations' column
    df['citations'] = df.apply(
        lambda row: choose_citations(row[article_id_col], article_ids_set),
        axis=1
    )
    return df

# Apply the function to your DataFrame
article_clean_df = create_citations_column(article_clean_df, 'id', article_set)

In [232]:
article_clean_df

Unnamed: 0,id,journal,date,title,volume,year,core_author,co_authors,reviewer,citations
1300689,8300302,IEEE Trans. Autom. Control.,2022-12-01,PID Passivity-Based Control of Port-Hamiltonia...,63,2018,10810616,"[10822806, 10821362, 10810615, 10816698]",10609394,"[8102316, 8338372, 7080557, 8395753]"
323122,7320234,Int. J. Cyber Behav. Psychol. Learn.,2022-06-09,Distress Tolerance in the Context of Emotional...,12,2022,10966521,"[10615312, 11015514, 13785544]",12648433,"[9612867, 9958867, 9241295]"
928320,7925844,Theory Pract. Log. Program.,2020-02-13,Representation sharing for Prolog.,13,2013,10640728,[11999078],10883687,"[10198249, 8399783]"
1788561,8789150,IEEE CAA J. Autom. Sinica,2020-10-26,Avoiding non-Manhattan obstacles based on proj...,7,2020,10781875,[10747248],11669605,[8580221]
2977488,9979909,CoRR,2020-03-06,Trajectory-Based Urban Air Mobility (UAM) Oper...,abs/1908.08651,2019,12816082,"[11664447, 11019945, 11019944, 11019943]",13150223,"[10212994, 9805794, 8476308, 9271225]"
...,...,...,...,...,...,...,...,...,...,...
1866599,8867188,SIAM J. Control. Optim.,2021-01-09,Boundary Feedback Stabilization for the Intrin...,58,2020,10828782,[10828784],12107781,"[8752480, 9044040, 9194553, 8479439, 9506369]"
954495,7954033,Vis. Comput.,2023-03-21,Interactive HDR lighting of dynamic participat...,25,2009,10934677,"[11427247, 10933485]",10900698,"[8115304, 8189366, 7236296, 9784019, 8891928]"
2578897,9579883,J. Ambient Intell. Humaniz. Comput.,2021-02-24,A Decision-centric approach for secure and ene...,12,2021,11583068,"[10608576, 10739091]",13587260,"[8107196, 8545371]"
641851,7639245,Top. Cogn. Sci.,2020-10-01,"Introduction to Volume 5, Issue 4 of",5,2013,11166396,,13698926,"[9533303, 8886009, 10034576, 9733753]"


In [233]:
keywords = [
    "artificial intelligence",
    "climate change",
    "sustainable development",
    "quantum computing",
    "genetic engineering",
    "machine learning",
    "cybersecurity",
    "public health",
    "renewable energy",
    "data analytics",
    "blockchain",
    "nanotechnology",
    "neural networks",
    "ecosystem services",
    "cancer research",
    "autonomous vehicles",
    "internet of things",
    "big data",
    "vaccine development",
    "bioinformatics",
    "virtual reality",
    "augmented reality",
    "deep learning",
    "biodiversity",
    "mental health",
    "smart cities",
    "robotics",
    "3D printing",
    "cloud computing",
    "CRISPR",
    "agritech",
    "clean technology",
    "material science",
    "photovoltaics",
    "drug discovery",
    "astrophysics",
    "oceanography",
    "glaciology",
    "sociology",
    "economics",
    "political science",
    "urban planning",
    "microbiology",
    "quantum mechanics",
    "biochemistry",
    "particle physics",
    "organic chemistry",
    "computational biology",
    "environmental justice",
    "sustainable agriculture",
    "water scarcity",
    "air pollution",
    "soil degradation",
    "renewable resources",
    "energy storage",
    "machine ethics",
    "digital humanities",
    "biomedical engineering",
    "forensic science",
    "epidemiology",
    "neuroscience",
    "cognitive science",
    "psychology",
    "anthropology",
    "linguistics",
    "education technology",
    "space exploration",
    "conservation biology",
    "green chemistry",
    "industrial automation",
    "wearable technology",
    "nutrition science",
    # mandatory keywords
    "data management", 
    "indexing", 
    "data modeling", 
    "big data", 
    "data processing", 
    "data storage",
    "data querying"
]

In [234]:
def select_random_keywords(keywords, min_keywords=4, max_keywords=8):
    num_to_select = random.randint(min_keywords, max_keywords)
    selected_keywords = random.sample(keywords, num_to_select)
    return selected_keywords

In [235]:
article_clean_df['keywords'] = article_clean_df.apply(lambda row: select_random_keywords(keywords), axis=1)
article_clean_df['abstract'] = "Abstract text"

In [236]:
article_clean_df

Unnamed: 0,id,journal,date,title,volume,year,core_author,co_authors,reviewer,citations,keywords,abstract
1300689,8300302,IEEE Trans. Autom. Control.,2022-12-01,PID Passivity-Based Control of Port-Hamiltonia...,63,2018,10810616,"[10822806, 10821362, 10810615, 10816698]",10609394,"[8102316, 8338372, 7080557, 8395753]","[data management, linguistics, soil degradatio...",Abstract text
323122,7320234,Int. J. Cyber Behav. Psychol. Learn.,2022-06-09,Distress Tolerance in the Context of Emotional...,12,2022,10966521,"[10615312, 11015514, 13785544]",12648433,"[9612867, 9958867, 9241295]","[nutrition science, data storage, 3D printing,...",Abstract text
928320,7925844,Theory Pract. Log. Program.,2020-02-13,Representation sharing for Prolog.,13,2013,10640728,[11999078],10883687,"[10198249, 8399783]","[clean technology, quantum computing, urban pl...",Abstract text
1788561,8789150,IEEE CAA J. Autom. Sinica,2020-10-26,Avoiding non-Manhattan obstacles based on proj...,7,2020,10781875,[10747248],11669605,[8580221],"[linguistics, astrophysics, neuroscience, econ...",Abstract text
2977488,9979909,CoRR,2020-03-06,Trajectory-Based Urban Air Mobility (UAM) Oper...,abs/1908.08651,2019,12816082,"[11664447, 11019945, 11019944, 11019943]",13150223,"[10212994, 9805794, 8476308, 9271225]","[blockchain, glaciology, virtual reality, indu...",Abstract text
...,...,...,...,...,...,...,...,...,...,...,...,...
1866599,8867188,SIAM J. Control. Optim.,2021-01-09,Boundary Feedback Stabilization for the Intrin...,58,2020,10828782,[10828784],12107781,"[8752480, 9044040, 9194553, 8479439, 9506369]","[data storage, drug discovery, economics, poli...",Abstract text
954495,7954033,Vis. Comput.,2023-03-21,Interactive HDR lighting of dynamic participat...,25,2009,10934677,"[11427247, 10933485]",10900698,"[8115304, 8189366, 7236296, 9784019, 8891928]","[industrial automation, data querying, urban p...",Abstract text
2578897,9579883,J. Ambient Intell. Humaniz. Comput.,2021-02-24,A Decision-centric approach for secure and ene...,12,2021,11583068,"[10608576, 10739091]",13587260,"[8107196, 8545371]","[epidemiology, machine ethics, economics, bioi...",Abstract text
641851,7639245,Top. Cogn. Sci.,2020-10-01,"Introduction to Volume 5, Issue 4 of",5,2013,11166396,,13698926,"[9533303, 8886009, 10034576, 9733753]","[climate change, oceanography, astrophysics, n...",Abstract text


### JOURNAL ###

In [237]:
def read_custom_file(file_path, low_memory=False):
    df = pd.read_csv(file_path, sep=';', header=None, low_memory=low_memory)
    df.columns = df.iloc[0]
    df = df.drop(df.index[0])
    return df

journal_df = read_custom_file(f'{path}/journal.csv')

journal_df['id'] = journal_df[':ID']
journal_df['name'] = journal_df['journal:string']

journal_df = journal_df.drop(columns=[':ID', 'journal:string'])

In [238]:
journal_to_id_map = make_mapping(journal_df,'name', 'id')
print(journal_to_id_map)

{'EAI Endorsed Trans. Ubiquitous Environ.': '14154889', 'SIGMOD Record': '14154890', 'SIGMOD Rec.': '14154891', 'World Wide Web': '14154892', 'Int. J. Learn. Technol.': '14154893', 'IEEE Trans. Educ.': '14154894', 'Int. J. Neural Syst.': '14154895', 'Comput. Optim. Appl.': '14154896', 'IEEE Ann. Hist. Comput.': '14154897', 'Inf. Syst. Frontiers': '14154898', 'SIGIR Forum': '14154899', 'Dagstuhl Manifestos': '14154900', 'ICT Express': '14154901', 'Int. J. Inf. Technol. Proj. Manag.': '14154902', 'Electron. Notes Theor. Comput. Sci.': '14154903', 'SIGHIT Rec.': '14154904', 'EURASIP J. Bioinform. Syst. Biol.': '14154905', 'J. Inform. and Commun. Convergence Engineering': '14154906', 'IEEE Trans. Neural Networks Learn. Syst.': '14154907', 'IEEE Trans. Neural Networks': '14154908', 'Comput. Networks': '14154909', 'Comput. Networks ISDN Syst.': '14154910', 'SIAM J. Appl. Dyn. Syst.': '14154911', 'Comput. Informatics': '14154912', 'Comput. Artif. Intell.': '14154913', 'Computers and Artificia

In [239]:
journal_df.head()

Unnamed: 0,id,name
1,14154889,EAI Endorsed Trans. Ubiquitous Environ.
2,14154890,SIGMOD Record
3,14154891,SIGMOD Rec.
4,14154892,World Wide Web
5,14154893,Int. J. Learn. Technol.


In [240]:
article_clean_df['journal'] = article_clean_df['journal'].apply(lambda x: journal_to_id_map[x])

In [241]:
article_clean_df.to_csv('article_test.csv', index=False)

In [242]:
journal_set = set(article_clean_df['journal'].tolist())
print(journal_set)

{'14156086', '14155795', '14155217', '14155851', '14156871', '14156268', '14156007', '14156724', '14156042', '14155082', '14154963', '14155163', '14156445', '14156260', '14156050', '14155054', '14155375', '14155876', '14156419', '14155437', '14155680', '14156761', '14156329', '14155539', '14155369', '14156199', '14155133', '14155140', '14155557', '14156399', '14155579', '14156868', '14156695', '14155592', '14156447', '14155120', '14156561', '14155757', '14155549', '14155058', '14155866', '14155363', '14156049', '14155238', '14155379', '14155711', '14156326', '14156526', '14155440', '14155193', '14155008', '14156428', '14156870', '14155130', '14156498', '14156666', '14156503', '14155891', '14156874', '14155612', '14156280', '14156773', '14156335', '14155366', '14156656', '14155179', '14156813', '14156273', '14155639', '14155051', '14155180', '14155922', '14156591', '14154946', '14156869', '14155906', '14155742', '14156421', '14155590', '14156527', '14155703', '14156156', '14155002', '14

In [243]:
journal_df = filter_rows_by_values(journal_df, 'id', journal_set)
journal_df.shape

(1079, 2)

In [244]:
journal_df.to_csv('journal_test.csv', index=False)