In [None]:
import os
import csv
import pandas as pd
import numpy as np

import datetime
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import squarify

from sklearn import model_selection, preprocessing, metrics
plt.style.use('fivethirtyeight')

print(os.getcwd())
print(os.listdir("../"))

In [None]:
def clean_data(df):
    """Drop redundant columns, coerce the numeric columns and remove rows without a venue.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns ``Row_num``, ``kesEntityId``,
        ``Popularity``, ``Year``, ``Month`` and ``Venue``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.

    Raises
    ------
    KeyError
        If one of the columns listed above is missing.
    ValueError
        If a value in a numeric column cannot be parsed as a number.
    """
    # Row_num only repeats the row number and kesEntityId duplicates
    # information that is already present in other fields.
    df = df.drop(columns=["Row_num", "kesEntityId"])
    # Convert whole columns in one call: pd.to_numeric picks a single compact
    # dtype per column, whereas Series.apply(pd.to_numeric, ...) converted and
    # downcast every value on its own.
    df["Popularity"] = pd.to_numeric(df["Popularity"], downcast="float")
    df["Year"] = pd.to_numeric(df["Year"], downcast="unsigned")
    df["Month"] = pd.to_numeric(df["Month"], downcast="unsigned")
    return df.dropna(subset=["Venue"])

In [None]:
# df_pub = pd.read_csv("../input/scopus-data/ProcessedScopusData.csv")
# If this doesn't work, please change it to the path where the file is located on your PC
# df_pub = pd.read_csv("Data/ProcessedScopusData.csv")

In [None]:
def append_pub_score(df, pub_df=None):
    """Keep only the papers whose venue is a Scopus title and attach that title's SJR score.

    Parameters
    ----------
    df : pandas.DataFrame
        Papers; must contain a ``Venue`` column.
    pub_df : pandas.DataFrame, optional
        Scopus table with ``Title`` and ``SJR`` columns. When omitted, the
        notebook-level ``df_pub`` is used, as the original function did.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows with an added
        ``Publication_Rank`` column. ``df`` itself is not modified.
    """
    if pub_df is None:
        # Backward-compatible default: the Scopus table defined at notebook level.
        pub_df = df_pub

    scopus_list = pub_df["Title"].unique().tolist()
    print("Unique Publications from Scopus:", len(scopus_list))

    # Analysing common publications: a set intersection replaces scanning the
    # venue list once per Scopus title.
    common_titles = set(scopus_list) & set(df["Venue"].unique().tolist())
    print("Publications present in Dataset:", len(common_titles))

    print("Shape Before", df.shape)
    # .copy() so the column added below is written to a frame of its own
    # instead of to a slice of the caller's frame.
    df = df[df["Venue"].isin(scopus_list)].copy()
    print("Shape After", df.shape)

    # Score appending: look the venue up in a Title -> SJR mapping. Every
    # remaining venue is a key of the mapping because of the filter above.
    score_dict = pd.Series(pub_df["SJR"].values, index=pub_df["Title"]).to_dict()
    df["Publication_Rank"] = df["Venue"].map(score_dict)
    return df

def extract_field(row):
    """Return the text that follows the last ``FN`` marker in ``row["Domain"]``.

    The domain value is converted to a string, everything up to and including
    the last ``FN`` plus one further character is discarded, and the
    characters ``:``, ``}``, ``]``, space and ``'`` are stripped from both
    ends of what remains.
    """
    domain_text = str(row["Domain"])
    # rfind returns -1 when there is no "FN", in which case the slice starts at offset 2.
    start = domain_text.rfind("FN") + 3
    return domain_text[start:].strip(":}] '")

def get_num_authors(row):
    """Count the author entries in ``row["Authors"]``.

    The entries are separated by ``"},"``, so the count is the number of
    pieces produced by splitting on that separator.

    Returns
    -------
    int
        Number of pieces, or 0 when the value is missing (``None``/NaN);
        the original raised ``AttributeError`` in that case.
    """
    authors = row["Authors"]
    if pd.isnull(authors):
        return 0
    return len(str(authors).split("},"))

In [None]:
label_dict = {"C":0,"J":1,"CJ":2,"O":3}
def find_conference_type(row):
    """Return the ``label_dict`` code describing which venue fields of ``row`` are filled.

    ``"O"`` when both ``Conference`` and ``Journal`` are null, ``"J"`` when
    only ``Conference`` is null, ``"C"`` when only ``Journal`` is null and
    ``"CJ"`` when both are present.
    """
    no_conference = pd.isnull(row["Conference"])
    no_journal = pd.isnull(row["Journal"])
    if no_conference:
        key = "O" if no_journal else "J"
    else:
        key = "C" if no_journal else "CJ"
    return label_dict[key]

In [None]:
def tidy_split(df, column, sep='|', keep=False):
    """Split ``column`` on ``sep`` and return one row per piece with parsed id and name.

    Rows whose ``column`` value is null are dropped first. Every remaining
    value is split on ``sep``; for each piece the source row is repeated and
    two columns are added:

    * ``<column>_Id``   - the text after ``AuId``, cut at ``AfN`` when the
      ``'S'`` key appears before ``AuId`` (or is absent) and at ``'S'``
      otherwise, with quotes, commas, spaces and colons stripped.
    * ``<column>_Name`` - everything after ``AuN`` up to the end of the
      piece, with quotes, colons, spaces and closing brackets stripped.

    Parameters
    ----------
    df : pandas.DataFrame
    column : str
        Name of the column to split.
    sep : str, default '|'
        Separator between the pieces.
    keep : bool, default False
        When True, a value that splits into more than one piece also yields
        one extra row carrying the unsplit text in both new columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified.
    """
    indexes = []
    id_values = []
    name_values = []
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            id_values.append(presplit)
            # Keep name_values in step with indexes/id_values; without this
            # entry the "_Name" assignment below fails on a length mismatch.
            name_values.append(presplit)
        for value in values:
            indexes.append(i)

            id_start = value.find("AuId")
            affiliation_pos = value.find("AfN")
            s_pos = value.find("'S'")
            # The id ends at whichever key follows it in this piece.
            id_end = affiliation_pos if s_pos < id_start else s_pos
            id_values.append(value[id_start + 4:id_end].strip("', :"))

            name_start = value.find("AuN")
            name_values.append(value[name_start + 3:].strip("': }]"))

    # Positional selection repeats each source row once per collected entry.
    new_df = df.iloc[indexes, :].copy()
    new_df[column + "_Id"] = id_values
    new_df[column + "_Name"] = name_values
    return new_df

In [None]:
def process_year(df):
    """Turn one raw yearly export into the processed per-paper table.

    Steps: rename the short source column names, drop rows with a duplicated
    title, derive ``Month`` from ``Date``, run ``clean_data`` and
    ``append_pub_score``, then add the ``Publication Type`` and
    ``Number_Of_authors`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table using the short column names listed in ``rename_dict``.

    Returns
    -------
    pandas.DataFrame
        The processed table. The frame passed in is not modified (the
        original renamed and de-duplicated it in place).
    """
    # Make more sensible column names
    rename_dict = {"AA": "Authors",
                   "C": "Conference",
                   "CC": "Citation_count",
                   "ECC": "Expected_count",
                   "D": "Date",
                   "F": "Domain",
                   "J": "Journal",
                   "RId": "References",
                   "Ti": "Title",
                   "VFN": "Venue",
                   "W": "Words",
                   "Y": "Year",
                   "logprob": "Popularity",
                   "Unnamed: 0": "Row_num"}
    # Each step returns a new frame, so the caller's data stays untouched and
    # re-running the cell gives the same result. index=str converts the row
    # labels to strings, as before.
    df = (
        df.rename(index=str, columns=rename_dict)
          .drop_duplicates(subset=['Title'])
    )
    df = df.assign(Month=pd.to_datetime(df["Date"]).dt.month)
    df = clean_data(df)
    # append_pub_score returns a filtered frame; copy it before adding columns
    # so the writes below never land on a slice.
    df = append_pub_score(df).copy()
    df["Publication Type"] = df.apply(find_conference_type, axis=1)
    df["Number_Of_authors"] = df.apply(get_num_authors, axis=1)
    return df

In [None]:
# df = pd.read_csv("../input/mag-2007/mag_2007.csv")

In [None]:
# df = process_year(df)

In [None]:
# df.to_csv("mag_2007_processed.csv")

In [None]:
print(os.listdir("../input/processedmagdata"))

In [None]:
# Column names used to create the combined frame before the processed files are loaded into it.
columns = ['Unnamed: 0','Authors', 'Conference','Citation_count','Date','Expected_count','Domain','Id','Journal','References','Title','Venue','Words','Year','Popularity','Month','Publication_Rank','Publication Type','Number_Of_authors']

In [None]:
# Please give a path that works on your machine. Keep all the processed files in this path.
path = "../input/processedmagdata"
# Read every file once and concatenate in a single call. Appending inside the
# loop copied the growing frame on each pass, and DataFrame.append is no longer
# available in pandas 2.x. sorted() makes the row order independent of the
# order in which the file system lists the directory.
frames = [pd.read_csv(os.path.join(path, file)) for file in sorted(os.listdir(path))]
if frames:
    df = pd.concat(frames)
    # Keep the expected columns first and any additional file columns after
    # them, which is the layout the empty seed frame used to give.
    extra_columns = [name for name in df.columns if name not in columns]
    df = df.reindex(columns=columns + extra_columns)
else:
    # No files found: fall back to an empty frame with the expected columns.
    df = pd.DataFrame(columns=columns)

In [None]:
df.head()

In [None]:
# errors="ignore" keeps this cell re-runnable and tolerant of input files that
# lack one of the two columns; without it a second run raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'kesEntityId'], errors="ignore")

In [None]:
df.shape

In [None]:
df["Publication Type"].value_counts()

In [None]:
max(df.Number_Of_authors)

In [None]:
# Bar chart of the number of rows for each "Publication Type" value.
import seaborn as sns
sns.set(style="darkgrid")
ax = sns.countplot(x="Publication Type", data=df)

In [None]:
# import seaborn as sns
# sns.set(style="darkgrid")
# ax = sns.countplot(x="Venue", data=df)

In [None]:
# Bar chart of the number of rows for each "Year" value.
import seaborn as sns
sns.set(style="darkgrid")
ax = sns.countplot(x="Year", data=df)

In [None]:
# Bar chart of the number of rows for each "Month" value.
import seaborn as sns
sns.set(style="darkgrid")
ax = sns.countplot(x="Month", data=df)

In [None]:
# Derive the topic text from each row's Domain value; extract_field already
# takes a row, so it is passed to apply directly.
df["Topic"] = df.apply(extract_field, axis=1)

In [None]:
# Map every topic to an integer: the position of its last occurrence in the
# Topic column (later occurrences overwrite earlier ones).
topic_list = df["Topic"].tolist()
topic_dict = {topic: position for position, topic in enumerate(topic_list)}

In [None]:
len(topic_dict)

In [None]:
# Map every venue to an integer: the position of its last occurrence in the
# Venue column (later occurrences overwrite earlier ones).
venue_list = df["Venue"].tolist()
venue_dict = {venue: position for position, venue in enumerate(venue_list)}

In [None]:
# Topic_Label is listed in the model's feature list further down, so it has to
# exist; it is the integer code of the row's topic from topic_dict.
df["Topic_Label"] = df["Topic"].map(topic_dict)
# Column-wise arithmetic and lookups replace the row-by-row apply calls.
df["Year Since Publication"] = 2018 - df["Year"]
df["Venue_label"] = df["Venue"].map(venue_dict)

In [None]:
del venue_list
del topic_list

In [None]:
# outlink_map: paper Id -> list of the ids it references.
# inlink_map:  referenced id -> list of the paper Ids that reference it.
outlink_map = {}
inlink_map = {}
df_new = df[df["References"].notnull()]
for paper_id, raw_refs in zip(df_new["Id"], df_new["References"]):
    # The reference list is parsed from text: surrounding brackets/quotes are
    # stripped and the rest is split on commas. Empty tokens are skipped so an
    # empty list such as "[]" gives no links instead of failing in int('').
    ref_list = [
        int(token)
        for token in str(raw_refs).strip("[]'").split(",")
        if token.strip()
    ]
    outlink_map[paper_id] = ref_list
    for cited_id in ref_list:
        inlink_map.setdefault(cited_id, []).append(paper_id)

In [None]:
def calculate_page_rank(papers=None, outlinks=None, inlinks=None, tol=1e-6, max_iter=100):
    """Iteratively score papers from the citation graph, normalised so the best score is 1.

    Each pass recomputes, for every paper,

        0.15 + 0.85 * (sum(score[c] / len(outlinks[c]) for citing papers c) / avg_refs(year))

    where ``avg_refs(year)`` is the average reference-list length of the
    papers of the scored paper's year that have at least one reference. The
    scores are then divided by their maximum, and the loop stops once no
    score moved by more than ``tol`` or after ``max_iter`` passes.

    Changes from the previous implementation:

    * the ``return`` was inside the ``while`` loop, so exactly one pass ran,
      and ``None`` was returned when that first pass counted as converged;
    * convergence used exact float equality; it now uses ``tol`` and the
      loop is bounded by ``max_iter``;
    * papers without inbound links were dropped after the first pass; they
      now keep the base score of 0.15 so their citations still count, and
      they are part of the returned mapping;
    * a year with no citing papers raised ``KeyError``; the overall average
      (or 1.0 when nothing cites anything) is used instead;
    * an empty input returns ``{}`` instead of failing in ``max()``.

    Parameters
    ----------
    papers : pandas.DataFrame, optional
        Frame with ``Id`` and ``Year`` columns. Defaults to the notebook-level ``df``.
    outlinks : dict, optional
        Paper id -> list of referenced ids. Defaults to ``outlink_map``.
    inlinks : dict, optional
        Paper id -> list of citing ids. Defaults to ``inlink_map``.
    tol : float, default 1e-6
        Largest per-paper change that still counts as converged.
    max_iter : int, default 100
        Upper bound on the number of passes.

    Returns
    -------
    dict
        Paper id -> score in (0, 1]; empty when ``papers`` has no rows.
    """
    # Fall back to the notebook-level objects so the zero-argument call keeps working.
    if papers is None:
        papers = df
    if outlinks is None:
        outlinks = outlink_map
    if inlinks is None:
        inlinks = inlink_map

    page_year = dict(zip(papers["Id"], papers["Year"]))
    if not page_year:
        return {}

    # Per-year totals over the papers that reference at least one other paper.
    year_citation_count = {}
    year_paper_count = {}
    for paper_id, year in page_year.items():
        len_outlink = len(outlinks.get(paper_id, []))
        if len_outlink > 0:
            year_citation_count[year] = year_citation_count.get(year, 0) + len_outlink
            year_paper_count[year] = year_paper_count.get(year, 0) + 1
    avg_year_citation_count = {
        year: year_citation_count[year] / year_paper_count[year]
        for year in year_citation_count
    }
    citing_papers = sum(year_paper_count.values())
    # Used for years in which no paper has a reference list.
    default_avg = sum(year_citation_count.values()) / citing_papers if citing_papers else 1.0

    page_rank = {paper_id: 1.0 for paper_id in page_year}
    count = 0
    while count < max_iter:
        count += 1
        updated_page_rank = {}
        for key in page_rank:
            ns = 0.0
            for link in inlinks.get(key, ()):
                # Only citing papers that are scored and have a non-empty
                # reference list can pass score on.
                if link in page_rank and outlinks.get(link):
                    ns += page_rank[link] / len(outlinks[link])
            avg = avg_year_citation_count.get(page_year[key], default_avg)
            updated_page_rank[key] = 0.15 + 0.85 * (ns / avg)

        max_score = max(updated_page_rank.values())
        print(f"max score is {max_score}")
        updated_page_rank = {
            index: score / max_score for index, score in updated_page_rank.items()
        }

        # Compare like with like: both mappings are normalised at this point.
        largest_change = max(
            abs(updated_page_rank[index] - page_rank[index]) for index in updated_page_rank
        )
        page_rank = updated_page_rank
        if largest_change <= tol:
            print(count)
            break
    return page_rank

In [None]:
# Compute the scores, then print the largest score and the sum of all scores as a quick check.
page_rank = calculate_page_rank()
print(max(page_rank.values()))
print(sum(page_rank.values()))

In [None]:
df["page_rank"] = 0
def update_rank(row):
    """Return the score stored in ``page_rank`` for this row's paper, or 0 when it has none."""
    # page_rank is keyed by the values of the "Id" column; the "Index_Id"
    # column read previously is not created anywhere in this notebook.
    return page_rank.get(row["Id"], 0)
df["page_rank"] = df.apply(update_rank, axis=1)
df["page_rank"].head(10)

In [None]:
df.to_csv("ranked_processed_data.csv")

In [None]:
# Expand the frame to one row per author entry: the Authors text is split on "},"
# and tidy_split adds the Authors_Id and Authors_Name columns.
df = tidy_split(df,"Authors","},")

In [None]:
# Columns passed to the model as predictors.
# NOTE(review): 'Authors_Id' is built from string slices in tidy_split, and 'Venue' presumably
# holds venue names as text - confirm the model accepts text columns or encode them first.
features = ['Authors_Id','Citation_count','Id','Year','Popularity','Month','Publication_Rank','Publication Type','Number_Of_authors','Topic_Label','Year Since Publication','Venue']

In [None]:
# Year-based split. Each split is copied so that columns added to it later
# (for example df_eval['prob']) are written to an independent frame rather
# than to a slice of df.
df_train = df[df["Year"] <= 2010].copy()
df_dev = df[df["Year"] == 2011].copy()
# NOTE(review): the evaluation split starts at 2011 and therefore contains the
# df_dev rows as well - confirm the overlap is intended.
df_eval = df[df["Year"] >= 2011].copy()

In [None]:
import lightgbm as lgb

# Parameters in the form used by LightGBM's native training API. They are not
# passed to the scikit-learn style regressor created below.
# NOTE(review): LightGBM documents this setting as "bagging_freq"; confirm
# that "bagging_frequency" is recognised before relying on lgb_params.
lgb_params = {"objective" : "regression", "metric" : "rmse",
              "num_leaves" : 70, "learning_rate" : 0.01,
              "bagging_fraction" : 0.75, "feature_fraction" : 0.8, "bagging_frequency" : 9}

# Native datasets built from the year-based splits. The names they were built
# from before (lgb_train_x, lgb_train_y, lgb_valid_x, lgb_valid_y) are not
# defined anywhere in the notebook.
lgb_train = lgb.Dataset(df_train[features], label=df_train["page_rank"])
lgb_val = lgb.Dataset(df_dev[features], label=df_dev["page_rank"])

model = lgb.LGBMRegressor(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.01,
    n_estimators=1000,
    max_bin=255,
    subsample_for_bin=50000,
    objective=None,
    min_split_gain=0,
    min_child_weight=3,
    min_child_samples=10,
    subsample=1,
    subsample_freq=1,
    colsample_bytree=1,
    reg_alpha=0.1,
    reg_lambda=0,
    seed=17,
    silent=False,
    nthread=-1,
)
# Validation features and labels must come from the same split; the labels
# previously came from df_test, which is never defined.
model.fit(
    df_train[features],
    df_train["page_rank"],
    eval_set=[(df_dev[features], df_dev["page_rank"])],
)

In [None]:
# Predict on the splits that actually exist: df_dev for the dev predictions
# and df_eval for the held-out predictions (df_test is never defined).
pred_dev = model.predict(df_dev[features])
pred_test = model.predict(df_eval[features])

In [None]:
# The predicted rank is read from df_eval in the cells below, so attach it
# there. Predicting directly on df_eval guarantees one value per row of the
# frame it is attached to; assigning evaluation-sized predictions to the full
# df fails on a length mismatch.
df_eval = df_eval.assign(pred_rank=model.predict(df_eval[features]))

In [None]:
import math

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``.

    Evaluated in a form that never exponentiates a large positive number, so
    strongly negative inputs give a value close to 0 instead of raising
    ``OverflowError``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x use the equivalent exp(x) / (1 + exp(x)); exp(x) <= 1 here.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [None]:
# Squash each predicted rank through the sigmoid, order the evaluation rows
# from highest to lowest value and print the first 100 of them.
df_eval['prob'] = df_eval["pred_rank"].map(sigmoid)
df_eval = df_eval.sort_values(by=['prob'], ascending=False)
print(df_eval.iloc[:100])

In [None]:
imp_list = model.feature_importances_

In [None]:
num = sum(imp_list)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Pair every importance with its feature name. The model was fitted on
# df_train[features], so the names come from `features`; `train_x` is not
# defined anywhere in this notebook.
feature_imp = pd.DataFrame(sorted(zip(model.feature_importances_, features)), columns=['Value','Feature'])

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
# Save before show(): once the figure has been shown it is released, and a
# savefig call made afterwards writes an empty image.
plt.savefig('lgbm_importances-01.png')
plt.show()