In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import re
import os
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from itertools import combinations
import matplotlib.cm as cm
from collections import Counter
import ast


In [2]:
df = pd.read_csv('merged_new.csv')

num_parts = 10
rows_per_part = len(df) // num_parts

# Split the DataFrame into 10 parts
for i in range(num_parts):
    start_idx = i * rows_per_part
    if i == num_parts - 1:  # Ensure the last part includes any remaining rows
        end_idx = len(df)
    else:
        end_idx = (i + 1) * rows_per_part
    
    part_df = df.iloc[start_idx:end_idx]
    part_df.to_csv(f'part_{i + 1}.csv', index=False)

print("CSV file has been split into 10 parts.")

CSV file has been split into 10 parts.


In [10]:
def explode(csv):
  df = pd.read_csv(f'{csv}.csv')
  df['AuthorID'] = df['AuthorID'].apply(lambda x: ast.literal_eval(x) if x else [])
  exploded_df = df.explode('AuthorID')
  author_df = exploded_df[['AuthorID', 'PaperID', 'FieldID']]
  author_df.to_csv(f'exploded_{csv}.csv', index=False)


In [11]:
csvlist = ['part_1', 'part_2', 'part_3', 'part_4', 'part_5', 'part_6','part_7', 'part_8', 'part_9', 'part_10']

In [12]:
for csv in csvlist:
  explode(csv)
  print(f'exploded_{csv} is completed')


exploded_part_1 is completed
exploded_part_2 is completed
exploded_part_3 is completed
exploded_part_4 is completed
exploded_part_5 is completed
exploded_part_6 is completed
exploded_part_7 is completed
exploded_part_8 is completed
exploded_part_9 is completed
exploded_part_10 is completed


In [13]:
exploded_df1 = pd.read_csv('exploded_part_1.csv')
print(exploded_df1.head)

<bound method NDFrame.head of             AuthorID     PaperID                FieldID
0         1243978490          23  [71924100, 177713679]
1         2582258949          23  [71924100, 177713679]
2         2582736345          23  [71924100, 177713679]
3         2662843304          79  [205649164, 45355965]
4         2683440697          79  [205649164, 45355965]
...              ...         ...                    ...
28832158  2578344502  1636162662  [41008148, 107457646]
28832159  2681880875  1636162662  [41008148, 107457646]
28832160  2104107278  1636162709  [71924100, 159047783]
28832161  2171156164  1636162745   [86803240, 90856448]
28832162  3168820795  1636162745   [86803240, 90856448]

[28832163 rows x 3 columns]>


In [14]:
exploded_df2 = pd.read_csv('exploded_part_2.csv')
print(exploded_df2.head)

<bound method NDFrame.head of             AuthorID     PaperID                           FieldID
0           26757109  1636162927              [41008148, 11413529]
1         2009271732  1636162927              [41008148, 11413529]
2         2140267349  1636162927              [41008148, 11413529]
3         2023216962  1636162975             [121332964, 62520636]
4         2143010845  1636162975             [121332964, 62520636]
...              ...         ...                               ...
39571701  2494006817  2002986680             [205649164, 91375879]
39571702  2659150923  2002986680             [205649164, 91375879]
39571703  2676403624  2002986685             [95457728, 107993555]
39571704  2165009231  2002986686  [205649164, 505870484, 18903297]
39571705  2649510438  2002986686  [205649164, 505870484, 18903297]

[39571706 rows x 3 columns]>


In [15]:
print(type(exploded_df1['FieldID'][0]))

<class 'str'>


In [16]:
exploded_df1['FieldID'] = exploded_df1['FieldID'].apply(lambda x: ast.literal_eval(x) if x else [])

In [17]:
grouped_df1 = exploded_df1.groupby('AuthorID').agg({
    'PaperID': list,
    'FieldID': lambda x: list(item for sublist in x for item in sublist)
}).reset_index()
print(grouped_df1.head)

<bound method NDFrame.head of             AuthorID                  PaperID  \
0               1968             [1527357417]   
1               3809              [658362731]   
2               4242              [647400699]   
3               7762              [623108090]   
4               7829  [328324571, 1521076585]   
...              ...                      ...   
15917377  3217809992              [573048486]   
15917378  3217810022             [1504204380]   
15917379  3217810146             [1146972988]   
15917380  3217810245             [1595719892]   
15917381  3217810830             [1507910090]   

                                             FieldID  
0                   [144024400, 15708023, 199539241]  
1                              [127413603, 42475967]  
2                              [142362112, 52119013]  
3                                [17744445, 3116431]  
4         [205649164, 166957645, 95457728, 15708023]  
...                                              ..

In [18]:
grouped_df1.to_csv('grouped_exploded_part_1.csv', index=False)

In [23]:
def group(csv):
  df = pd.read_csv(f'{csv}.csv')
  df['FieldID'] = df['FieldID'].apply(lambda x: ast.literal_eval(x) if x else [])
  grouped_df = df.groupby('AuthorID').agg({
    'PaperID': list,
    'FieldID': lambda x: list(item for sublist in x for item in sublist)
  }).reset_index()
  grouped_df.to_csv(f'grouped_{csv}.csv', index=False)

In [24]:
exploded_list = ['exploded_part_2', 'exploded_part_3', 'exploded_part_4', 'exploded_part_5', 
                 'exploded_part_6','exploded_part_7', 'exploded_part_8', 
                 'exploded_part_9', 'exploded_part_10']

In [25]:
for csv in exploded_list:
  group(csv)
  print(f'grouped_{csv} is completed')

grouped_exploded_part_2 is completed
grouped_exploded_part_3 is completed
grouped_exploded_part_4 is completed
grouped_exploded_part_5 is completed
grouped_exploded_part_6 is completed
grouped_exploded_part_7 is completed
grouped_exploded_part_8 is completed
grouped_exploded_part_9 is completed
grouped_exploded_part_10 is completed


: 