In [2]:
import pandas as pd
import os


In [3]:
def get_file_name(dataFrameName, path):
    '''
    This function builds the location of a parquet
    file kept in the project_data folder

    inputs:
    dataFrameName: name of the dataframe (file name without extension)
    path: directory that contains the project_data folder

    output:
    the string '<path>/project_data/<dataFrameName>.parquet'
    '''
    parquetName = str(dataFrameName) + '.parquet'
    return '/'.join((str(path), 'project_data', parquetName))

In [4]:
def read_df(dataFrameName):
    '''
    This function loads a parquet file from the project_data
    folder of the current working directory

    inputs:
    dataFrameName: name of the dataframe (file name without extension)

    output:
    pandas dataframe read from the parquet file
    '''
    parquetPath = get_file_name(dataFrameName, os.getcwd())
    return pd.read_parquet(parquetPath, engine='auto')

In [21]:
def get_subset(df,year: int, stopYear: int):
    '''
    This function subsets the dataframe df
    to the rows dated from 'year' up to, but
    not including, 'stopYear'

    inputs: 
    df: pandas dataframe with an 'update_date' column holding
        either 'Y-M-D' dates or plain years; df itself is not modified
    year: int, first year kept ('year-01-01' inclusive)
    stopYear: int, first year left out ('stopYear-01-01' exclusive)

    output:
    subset_df: pandas dataframe 
    with the selected rows plus a 'date' column
    holding the parsed 'update_date'
    '''
    try:
        dates = pd.to_datetime(df['update_date'], format='%Y-%m-%d')
    except (ValueError, TypeError):
        # The values are not full dates: retry as year-only values.
        # Only parsing failures are caught so that KeyboardInterrupt
        # and similar are never swallowed.
        dates = pd.to_datetime(df['update_date'], format='%Y')

    startDate = pd.Timestamp('{}-01-01'.format(year))

    endDate = pd.Timestamp('{}-01-01'.format(stopYear))

    inRange = (dates >= startDate) & (dates < endDate)

    # The parsed dates are attached to the returned frame only, so the
    # caller's dataframe does not silently gain a 'date' column.
    subset_df = df.loc[inRange].assign(date=dates.loc[inRange])

    if subset_df.empty:
        print('DataFrame is empty for year {}!'.format(str(year)))

    return subset_df

In [34]:
def get_subset_from_file_name(dataFrameName: str='arxiv_climate_change',year: int=2010,endYear:int = 2011):
    '''
    This function reads the dataframe named dataFrameName
    with read_df and hands it to get_subset

    inputs: 
    dataFrameName: name of the dataframe, string 
    year: int, passed to get_subset as its start year
    endYear: int, passed to get_subset as its stop year

    output:
    subset_df: pandas dataframe returned by get_subset
    '''
    return get_subset(read_df(dataFrameName), year, endYear)

In [30]:
def save_subset_dfs(subset_df,dataFrameName,year):

    '''
    This function saves a subset dataframe as
    subset_data/<dataFrameName>_<year>.parquet inside the
    current working directory, creating the subset_data
    folder first when it is missing

    inputs: 
    subset_df: dataframe
    dataFrameName: name of the dataframe, string 
    year: int
    '''

    subsetDir = os.path.join(os.getcwd(), 'subset_data')

    # Create only the subset_data folder: the working directory already
    # exists, so calling os.makedirs on it would raise FileExistsError.
    # exist_ok makes this a no-op when the folder is already there.
    os.makedirs(subsetDir, exist_ok=True)

    parquetFileName = os.path.join(
        subsetDir, '{}_{}.parquet'.format(dataFrameName, str(year)))

    subset_df.to_parquet(parquetFileName,engine='auto')

In [20]:

# Load the climate-change dataset; read_df looks for it in project_data/
# under the current working directory.
df = read_df('arxiv_climate_change')


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed,main_category,subcategory,top5_other,update_range
0,706.3621,Gerald Marsh,Gerald E. Marsh,Climate Change: The Sun's Role,"16 pages, 8 figures, 1 table",,,,physics.gen-ph,,The sun's role in the earth's recent warming...,"[{'created': 'Sat, 23 Jun 2007 18:43:00 GMT', ...",2010,"[[Marsh, Gerald E., ]]",physics,gen-ph,physics,2007-2012
1,706.372,James Hansen,J. Hansen (Columbia Univ. Earth Institute),How Can We Avert Dangerous Climate Change?,"18 pages, 7 figures; revised and expanded from...",,,,physics.ao-ph,,Recent analyses indicate that the amount of ...,"[{'created': 'Mon, 25 Jun 2007 23:59:51 GMT', ...",2007,"[[Hansen, J., , Columbia Univ. Earth Institute]]",physics,ao-ph,physics,2007-2012
2,706.4294,Terry Sloan,"T.Sloan, A W Wolfendale",Cosmic Rays and Global Warming,Submitted to ICRC 2007,,,,astro-ph physics.ao-ph physics.geo-ph physics....,,It has been claimed by others that observed ...,"[{'created': 'Thu, 28 Jun 2007 18:23:47 GMT', ...",2007,"[[Sloan, T., ], [Wolfendale, A W, ]]",astro-ph physics,ao-ph physics,other,2007-2012
3,803.1239,Serge Galam,Serge Galam,Global Warming: the Sacrificial Temptation,"14 pages, no figure",,,,physics.soc-ph physics.pop-ph,http://arxiv.org/licenses/nonexclusive-distrib...,The claimed unanimity of the scientific comm...,"[{'created': 'Mon, 10 Mar 2008 13:48:51 GMT', ...",2008,"[[Galam, Serge, ]]",physics,soc-ph physics,physics,2007-2012
4,804.3319,Ignacio Gallo,"Federico Gallo, Pierluigi Contucci, Adam Coutt...",Tackling climate change through energy efficie...,,,,,physics.soc-ph,http://arxiv.org/licenses/nonexclusive-distrib...,Promoting and increasing energy efficiency i...,"[{'created': 'Mon, 21 Apr 2008 17:11:46 GMT', ...",2009,"[[Gallo, Federico, ], [Contucci, Pierluigi, ],...",physics,soc-ph,physics,2007-2012


In [35]:
# Rebind df to the NLP dataset; the climate-change frame loaded earlier
# is no longer reachable through this name.
df = read_df('arxiv_nlp')

In [38]:
dataFrameName = 'nlp'
year = 2007
stopYear = 2013
# get_subset is annotated to take ints, so the years are passed as ints.
# The selection covers year <= date < stopYear.
subset_df = get_subset(df, year, stopYear)
# The saved file is named after the start year only: subset_data/nlp_2007.parquet
save_subset_dfs(subset_df,dataFrameName,year)

In [39]:
# NOTE(review): no cell shown here writes climate_change_2013.parquet (the save
# cell above writes subset_data/nlp_2007.parquet), so this presumably relies on
# a file produced in an earlier run -- confirm it exists before a Run All.
df = pd.read_parquet('subset_data/climate_change_2013.parquet',engine='auto')

In [40]:
# Summarise the reloaded subset; the output below covers the
# update_date and date columns.
df.describe()

Unnamed: 0,update_date,date
count,277.0,277
mean,2015.306859,2015-04-23 06:03:53.935017984
min,2013.0,2013-01-01 00:00:00
25%,2014.0,2014-01-01 00:00:00
50%,2015.0,2015-01-01 00:00:00
75%,2016.0,2016-01-01 00:00:00
max,2017.0,2017-01-01 00:00:00
std,1.317272,
