In [8]:
import sys, os, time

In [9]:
import pandas as pd

In [10]:
# Lookup table read from pID_year.txt (no header row): columns 'pid' and
# 'year', re-indexed by 'pid' so it can be joined onto edge lists by paper id.
pID_year = pd.read_csv(
    'pID_year.txt', header=None, names=['pid', 'year']
).set_index('pid')

In [11]:
# from util.PajekFactory import PajekFactory

In [12]:
# start = time.time()
# pjk = PajekFactory()
# with open('jstor.net', 'r') as f:
#     vertices = {}
#     arcs = []
#     mode = ""
#     remove_idx = []
#     line_no = 0
#     for line in f:
#         line_no += 1
#         if line_no in [5e4, 1e5, 5e5] or (line_no % 1e6 == 0):
#             print("{} lines so far".format(line_no))
#         line = line.strip()
#         if line[0] == '*':
#             if line.lower().startswith('*vert'):
#                 mode = 'vertices'
#             if line.lower().startswith('*arc') or line.lower().startswith('*edg'):
#                 mode = 'arcs'
#                 print('vertices complete ({:.2f} seconds so far)'.format(time.time()-start))
#             continue
#         line = line.split(' ')
        
#         if mode == 'vertices':
#             idx = int(line[0])
#             pid = int(line[1].strip('"'))
#             if pid in remove.values:
#                 remove_idx.append(idx)
#             else:
#                 vertices[idx] = pid
                
#         elif mode == 'arcs':
#             remove_idx = pd.Series(remove_idx)
#             idx_out = int(line[0])
#             idx_in = int(line[1])
#             if (idx_out in remove_idx.values) or (idx_in in remove_idx.values):
#                 continue
#             else:
#                 pid_out = vertices[idx_out]
#                 pid_in = vertices[idx_in]
#                 pjk.add_edge(pid_out, pid_in)

# end = time.time()
# print("done {:.2f} seconds".format(end-start))

In [113]:
def parse_pajek(fname):
    """Read a Pajek ``.net`` file and return its arcs as pairs of vertex labels.

    The file is scanned once. Lines starting with ``*`` are section headers:
    ``*vert...`` switches to the vertex section and ``*arc...`` / ``*edg...``
    switches to the arc section (case-insensitive). Any other ``*`` line is
    skipped without changing the current section.

    Parameters
    ----------
    fname : str
        Path of the Pajek file to read.

    Returns
    -------
    list of (int, int)
        One ``(label_out, label_in)`` tuple per arc line, in file order, where
        each label is the integer label of the vertex the arc line refers to.

    Raises
    ------
    ValueError
        If a vertex index, vertex label or arc endpoint is not an integer.
    KeyError
        If an arc refers to a vertex index that was not declared.

    Notes
    -----
    Fields are split on any run of whitespace, so tab-separated files and
    repeated spaces are accepted. Only the first two fields of a line are
    used; extra columns (e.g. weights) are ignored. Vertex labels may be
    wrapped in double quotes but must not contain whitespace.
    """
    vertices = {}
    arcs = []
    mode = ""
    with open(fname, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank / whitespace-only lines carry no data; indexing
                # line[0] on them would raise IndexError.
                continue
            if line[0] == '*':
                header = line.lower()
                if header.startswith('*vert'):
                    mode = 'vertices'
                elif header.startswith('*arc') or header.startswith('*edg'):
                    mode = 'arcs'
                continue

            fields = line.split()

            if mode == 'vertices':
                # "<index> <label>": remember which label each index stands for.
                vertices[int(fields[0])] = int(fields[1].strip('"'))
            elif mode == 'arcs':
                # "<index_out> <index_in> ...": translate indices back to labels.
                arcs.append((vertices[int(fields[0])], vertices[int(fields[1])]))
    return arcs

In [114]:
def join_years(df, colname_citing='citing', colname_cited='cited'):
    """Attach ``citing_year`` and ``cited_year`` columns to an edge list.

    The years are looked up in the module-level ``pID_year`` frame (its
    ``year`` column, matched on its index) using the ids found in the
    ``colname_citing`` and ``colname_cited`` columns of ``df``. Ids with no
    match get a missing value. A new frame is returned; ``df`` is not modified.
    """
    year_lookup = pID_year.year

    # First pass: year of the citing paper.
    with_citing = df.join(year_lookup, on=colname_citing)
    with_citing = with_citing.rename(columns={'year': 'citing_year'})

    # Second pass: year of the cited paper.
    with_both = with_citing.join(year_lookup, on=colname_cited)
    with_both = with_both.rename(columns={'year': 'cited_year'})
    return with_both

In [115]:
def trim_years(df, year_cutoff):
    """Keep only the rows of ``df`` that pass the year cutoff.

    A row is kept when its ``citing_year`` is at most ``year_cutoff`` and its
    ``cited_year`` is strictly less than ``year_cutoff``. Rows with a missing
    year fail the comparison and are dropped. The original index labels are
    preserved.
    """
    citing_ok = df.citing_year <= year_cutoff
    cited_ok = df.cited_year < year_cutoff
    return df[citing_ok & cited_ok]

In [116]:
def get_vertices(edgelist_df):
    """Return the distinct ids of an edge list as a one-based Series.

    Ids are taken from the ``citing`` column followed by the ``cited`` column
    and de-duplicated keeping first occurrences, so the order is first
    appearance in that concatenation. The returned Series is indexed
    1, 2, 3, ... (Pajek vertex numbering is one-based).
    """
    all_ids = edgelist_df.citing.tolist() + edgelist_df.cited.tolist()
    unique_ids = pd.Series(all_ids).drop_duplicates().reset_index(drop=True)
    # Shift the default 0-based positions to Pajek's 1-based vertex numbers.
    unique_ids.index = unique_ids.index + 1
    return unique_ids

def write_pajek(edgelist_df, fname):
    """Write an edge list to ``fname`` in Pajek ``.net`` format.

    Parameters
    ----------
    edgelist_df : pandas.DataFrame
        Must have ``citing`` and ``cited`` columns holding vertex ids; each
        row becomes one arc from ``citing`` to ``cited``.
    fname : str
        Path of the file to create (overwritten if it exists).

    The file consists of a ``*vertices N`` header, one ``<index> "<id>"``
    line per distinct id (indices are one-based, as produced by
    ``get_vertices``), an ``*arcs M`` header, and one ``<index_out> <index_in>``
    line per row of ``edgelist_df``.
    """
    vertices = get_vertices(edgelist_df)
    # Series.items() is used instead of Series.iteritems(), which was removed
    # in pandas 2.0; both yield (index, value) pairs.
    pid_to_idx = {pid: idx for idx, pid in vertices.items()}
    citing = edgelist_df.citing.map(pid_to_idx)
    cited = edgelist_df.cited.map(pid_to_idx)

    with open(fname, 'w') as f:
        f.write('*vertices {}\n'.format(len(vertices)))
        for idx, pid in vertices.items():
            f.write('{} "{}"\n'.format(idx, pid))

        f.write('*arcs {}\n'.format(edgelist_df.shape[0]))
        for idx_out, idx_in in zip(citing, cited):
            f.write('{} {}\n'.format(idx_out, idx_in))

In [96]:
# Build the citation edge list from jstor.net, attach citing/cited years,
# and keep only the edges that pass the 1965 cutoff.
arcs = parse_pajek('jstor.net')
df = trim_years(
    join_years(pd.DataFrame(arcs, columns=['citing', 'cited'])),
    year_cutoff=1965
)
# Only the two id columns are handed to the Pajek writer; rows are renumbered from 0.
trimmed_out = df.loc[:, ['citing', 'cited']].reset_index(drop=True)

vertices complete (13969.42 seconds so far)


In [97]:
# Sanity check: (rows, columns) of the edge list after joining years and trimming to 1965.
df.shape

(571935, 4)

In [98]:
# Write the 1965-trimmed edge list (citing/cited id columns only) as a Pajek .net file.
write_pajek(trimmed_out, 'test_jstor_trimmed_1965.net')

In [124]:
def trimmed_range_write_pajeks(infilename,
                               start_year,
                               end_year=None,
                               step=5,
                               outdir=os.path.abspath('.'),
                               outf_base='trimmed'):
    """Write one year-trimmed Pajek file per cutoff year.

    The network in ``infilename`` is parsed once and joined with publication
    years. For each cutoff year the edge list is trimmed with ``trim_years``
    and written to ``<outdir>/<outf_base>-<year>.net``.

    Parameters
    ----------
    infilename : str
        Path of the source Pajek file.
    start_year : int
        Smallest cutoff year.
    end_year : int, optional
        Largest cutoff year; it is always included as a cutoff. Defaults to
        the largest ``citing_year`` found in the data.
    step : int
        Spacing between cutoffs, counted upward from ``start_year``.
    outdir : str
        Output directory; created if it does not exist. The default is
        evaluated once, when the function is defined.
    outf_base : str
        Filename prefix for the output files.

    Raises
    ------
    ValueError
        If ``start_year`` is greater than ``end_year``.
    """
    arcs = parse_pajek(infilename)
    df = pd.DataFrame(arcs, columns=['citing', 'cited'])
    df = join_years(df)

    if end_year is None:
        end_year = int(df.citing_year.max())
    if start_year > end_year:
        raise ValueError(
            "start_year ({}) is after end_year ({})".format(start_year, end_year))

    # list(...) is required: on Python 3 a range object has no append()/sort().
    year_range = list(range(start_year, end_year, step))
    # range() excludes its stop value, so add end_year explicitly; the
    # emptiness check covers start_year == end_year.
    if not year_range or year_range[-1] != end_year:
        year_range.append(end_year)
    # Largest cutoff first: each smaller cutoff selects a subset of the
    # previous result, so the frame can be trimmed progressively in the loop.
    year_range.sort(reverse=True)

    # Fail-safe before the write loop: make sure the target directory exists.
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    for year_cutoff in year_range:
        df = trim_years(df, year_cutoff)
        df_out = df[['citing', 'cited']].reset_index(drop=True)
        outfilename = "{}-{}.net".format(outf_base, year_cutoff)
        outpath = os.path.join(outdir, outfilename)
        write_pajek(df_out, outpath)

In [125]:
# Trial run on a short range: cutoffs 1972, 1970 and 1965, written as
# test_jstor_trimmed-<year>.net into the function's default outdir.
trimmed_range_write_pajeks('jstor.net', start_year=1965, end_year=1972, outf_base='test_jstor_trimmed')

KeyboardInterrupt: 

In [126]:
# Full run: cutoffs every 5 years starting at 1930; end_year is omitted, so the
# largest citing year in the data is used as the final cutoff. Files are
# written as trimmed_years/jstor_trimmed-<year>.net.
trimmed_range_write_pajeks('jstor.net', 
                           start_year=1930, 
                           outdir='trimmed_years', 
                           outf_base='jstor_trimmed')