In [1]:
import os
import csv
import pandas as pd
import numpy as np

import datetime
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import squarify

from sklearn import model_selection, preprocessing, metrics
plt.style.use('fivethirtyeight')

print(os.getcwd())
print(os.listdir("../input/"))

/kaggle/working
['acm.csv']


In [2]:
#load dataframe from csv
df = pd.read_csv("../input/acm.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2385066 entries, 0 to 2385065
Data columns (total 7 columns):
PaperTitle           object
Authors              object
Year                 float64
Publication_Venue    object
Index_Id             int64
References_Id        object
Abstract             object
dtypes: float64(1), int64(1), object(5)
memory usage: 127.4+ MB


In [None]:
df

In [None]:
df[df["Index_Id"] == 289023]

***count_reference*** represents the number of times particular paper got referred in our dataset

In [5]:
df["count_reference"] = 0

In [6]:
df.isna().sum()

PaperTitle                46
Authors               151075
Year                    8428
Publication_Venue        139
Index_Id                   0
References_Id        1344739
Abstract              713788
count_reference            0
dtype: int64

####YEAR

In [None]:
df["Year"] = df["Year"].fillna(0)
df["Year"] = df["Year"].astype(int)

**Rows with NaN year**

In [None]:
df[df["Year"] == 0]

***Year of publication distribution in the data***

In [None]:
ydf = df[df["Year"]!=0]
plt.figure(figsize=(8,6))
plt.scatter(range(ydf.shape[0]), np.sort(ydf["Year"].values))
plt.xlabel('index', fontsize=12)
plt.ylabel('Year', fontsize=12)
plt.title("Year Distribution", fontsize=20) 
plt.show()

In [None]:
df.isna().sum()

In [None]:
print(df.dtypes)

In [None]:
df["Index_Id"].unique()
print(any(df['Index_Id'].duplicated())) 
print(len(df['Index_Id'].unique()))
print(len(df.index))

> Index_Id = 2381731 is duplicated 9 times , It points to 7 different papers and has 82080 references.

In [None]:
df[df.duplicated(subset="Index_Id")]

In [7]:
reference_count = {}
for references in df[df["References_Id"].notnull()]["References_Id"]:
    ref_list = map(int,references.strip(":").split(":"))
    for ref in ref_list:
        if ref in reference_count:
            reference_count[ref] += 1
        else:
            reference_count[ref] = 1
print(reference_count[2381731])

82080


In [8]:
def update_ref_count(row):
    if row['Index_Id'] in reference_count:
        return reference_count[row['Index_Id']]
    else:
        return 0
df["count_reference"] = df.apply(lambda row: update_ref_count(row),axis=1)

In [None]:
print(df.head(5))

In [None]:
sorted_by_value = sorted(reference_count.items(), key=lambda kv: kv[1],reverse=True)
print(sorted_by_value[:10])
print([row[0] for row in sorted_by_value[:10]])
print([row[1] for row in sorted_by_value[:10]])

In [None]:
x_values = [row[0] for row in sorted_by_value[:10]]
y_values = [row[1] for row in sorted_by_value[:10]]
x_pos = np.arange(len(x_values))
plt.figure(figsize=(13,7))
plt.bar(x_pos,y_values, align='center', alpha=1)
plt.xticks(x_pos,x_values)
plt.ylabel('reference count')
plt.title('Top 10 paper by reference')
 
plt.show()

In [None]:
print(df.shape)
df.isna().sum()

In [None]:
df["Authors"] = df["Authors"].fillna('')
print(any(df["Authors"].isna()))
df[df['Authors'].str.contains('-')]

In [None]:
print(df["Authors"].shape)
print(df[df['Authors'].str.isalpha()].shape)

In [None]:
df[df['Authors'].str.isalpha()]

In [9]:
del reference_count
outlink_map = {}
inlink_map = {}
df_new = df[df["References_Id"].notnull()]
for index, row in df_new.iterrows():
    ref = row["References_Id"]
    ref_list = list(map(int,ref.strip(":").split(":")))
    outlink_map[row["Index_Id"]] = ref_list
    for ref in ref_list:
        listi = inlink_map.get(ref, [])
        listi.append(row["Index_Id"])
        inlink_map[ref] = listi

In [11]:
def calculate_page_rank():
    count = 0
    page_rank = {}
    updated_page_rank = {}
    for index in df["Index_Id"]:
        page_rank[index] = 1
    while True:
        count += 1
        flag = True
        for key in page_rank:
            cs = page_rank[key]
            if key in inlink_map:
                inlink_list = inlink_map[key]
                ns = 0
                for link in inlink_list:
                    if link in page_rank and link in outlink_map:
                        ns += page_rank[link]/len(outlink_map[link])
                ns = 0.15 + (0.85 * ns)
                if cs != ns:
                    flag = False
                updated_page_rank[key] = ns
        if flag == True:
            print(count)
            break
        page_rank = updated_page_rank
        updated_page_rank = {}
        return page_rank

In [12]:
page_rank = calculate_page_rank()
del outlink_map
del inlink_map

In [13]:
page_rank

{3: 0.23541666666666666,
 4: 0.4313095238095238,
 5: 2.0422619047619044,
 8: 0.4172596153846153,
 9: 2.614018759018759,
 10: 0.3625,
 12: 0.3585648148148148,
 13: 0.7200396825396825,
 15: 1.5747097070437572,
 16: 0.3303030303030303,
 17: 2.0427805756027833,
 19: 0.20666666666666667,
 22: 0.4492171717171717,
 24: 0.3625,
 25: 1.3128558627752174,
 27: 0.25024570024570025,
 30: 0.29166666666666663,
 34: 0.1688888888888889,
 39: 0.23348214285714286,
 43: 3.310351741032292,
 44: 0.9291666666666666,
 45: 0.203125,
 46: 1.1197915339491427,
 47: 0.15643939393939393,
 48: 0.40471732215153267,
 49: 0.43623106060606054,
 50: 0.3388888888888889,
 51: 0.7306281218781218,
 52: 16.47767654167661,
 53: 1.5575982619086888,
 55: 0.8383898770311815,
 56: 1.0455529655529654,
 59: 3.2945478644162853,
 60: 0.9872132034632035,
 63: 0.17656249999999998,
 64: 0.3625,
 65: 1.0753679330726744,
 72: 1.0417735042735041,
 75: 3.7153294659220792,
 76: 0.575,
 77: 0.35502747252747247,
 78: 0.7554440089043748,
 79: 4.

In [14]:
df["page_rank"] = 0

In [16]:
def update_rank(row):
    if row["Index_Id"] in page_rank:
        return page_rank[row["Index_Id"]]
    else:
        return 0

In [17]:
df["page_rank"] = df.apply(lambda row: update_rank(row),axis = 1)

In [18]:
df["page_rank"][0:10]

0    0.000000
1    0.000000
2    0.235417
3    0.431310
4    2.042262
5    0.000000
6    0.000000
7    0.417260
8    2.614019
9    0.362500
Name: page_rank, dtype: float64

In [None]:
df.to_csv("ranked_acm_1.csv")