# Table of Contents
* [Loading data](#Loading-data)
	* [derived columns](#derived-columns)
	* [creating target_labels](#creating-target_labels)
* [Venue/Journal info](#Venue/Journal-info)
* [missing values](#missing-values)
	* [dropping meaningless columns](#dropping-meaningless-columns)
* [Exploring features](#Exploring-features)


In [141]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import json
import glob

In [2]:
%%capture
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light default
# Default size (width, height) for every figure drawn in this notebook.
plt.rcParams.update({'figure.figsize': (16.0, 10.0)})

# Loading data

In [13]:
# Directory containing the corpus sample part files (trailing slash is
# required: paths below are built by plain string concatenation).
sample_data_dir = './data/corpus_1_pct_sample/'
# One individual part file inside that directory.
test_file = '{}{}'.format(sample_data_dir, 'part-00000')

In [18]:
# Empty placeholder frame; it is populated by the file-loading loop that follows.
corpus_sample_raw_df = pd.DataFrame()

In [20]:
# Load every file under sample_data_dir (one JSON document per line) into a
# single DataFrame with one row per document.
#
# Why it is written this way:
#   * all records are gathered first and the frame is built once, so rows get
#     a single continuous 0..N-1 index (a per-file counter would restart at 0
#     for each file and produce colliding index labels);
#   * building once avoids per-file DataFrame.append, which re-copies the
#     accumulated frame on every call and no longer exists in pandas >= 2.0;
#   * files are visited in sorted order so the row order is reproducible;
#   * the frame is rebuilt from scratch, so re-running this cell does not
#     stack a second copy of the data on top of the first.
corpus_records = []
for part_path in sorted(glob.glob(sample_data_dir + '*')):
    with open(part_path, 'r') as part_file:
        # Stream line by line; blank lines are skipped because json.loads
        # would reject them.
        corpus_records.extend(
            json.loads(line) for line in part_file if line.strip()
        )
corpus_sample_raw_df = pd.DataFrame(corpus_records)

In [22]:
# Sanity check: (number of rows, number of columns) of the loaded sample.
corpus_sample_raw_df.shape

In [24]:
# Alphabetical listing of the columns present in the raw sample.
sorted(corpus_sample_raw_df.columns)

## derived columns

In [131]:
# Derived feature: length of each row's 'authors' value.
corpus_sample_raw_df['number_authors'] = corpus_sample_raw_df['authors'].apply(len)

## creating target_labels

In [140]:
# Venue names used to build the target labels.
# NOTE(review): "NS" / "CS" presumably stand for neuroscience and computer
# science -- confirm against the labelling code.
NS_venues = [
    'The Journal of neuroscience : the official journal of the Society for Neuroscience',
    'NEUROIMAGE',
    'Frontiers in Neuroscience',
]
CS_venues = [
    'HICSS',
    'CVPR',
    'SIGGRAPH',
]

# Venue/Journal info

In [134]:
# Number of distinct values in the 'venue' column.
len(corpus_sample_raw_df['venue'].unique())

In [142]:
# The 15 most frequent venues (value_counts sorts by descending frequency).
corpus_sample_raw_df['venue'].value_counts().head(15)

In [53]:
# Venue/journal pairs, keeping only rows where neither value is missing.
vj_df = corpus_sample_raw_df.loc[:, ['venue', 'journal']].dropna(axis=0, how='any')

In [59]:
# Restrict to rows whose 'journal' mapping actually carries a 'name' key.
journal_has_name = vj_df['journal'].apply(lambda journal: 'name' in journal.keys())
vj_df_hasname = vj_df[journal_has_name]

In [66]:
%%capture
# Pull the journal's name out of the nested 'journal' mapping into its own
# column. assign() returns a new frame that is rebound to the same name, so the
# column is never written onto a filtered slice of another frame (which is
# what raises pandas' SettingWithCopyWarning).
vj_df_hasname = vj_df_hasname.assign(
    journal_name=vj_df_hasname['journal'].apply(lambda journal: journal['name'])
)

In [81]:
# Keep only rows where both the journal name and the venue are non-empty, and
# project down to those two columns. Both conditions are evaluated on the same
# frame and combined into a single mask; chaining two boolean selections would
# apply a mask built on the full frame to an already-filtered one, forcing
# pandas to reindex the mask (a UserWarning, and an error if index labels
# repeat).
journal_name_present = vj_df_hasname['journal_name'] != ''
venue_present = vj_df_hasname['venue'] != ''
vj_df_name_nn = vj_df_hasname.loc[
    journal_name_present & venue_present, ['venue', 'journal_name']
]

In [109]:
# Map each venue to the flat list of distinct journal names seen with it.
# extend() (not append()) is used so each value is ['a', 'b', ...] rather than
# a one-element list wrapping another list ([['a', 'b', ...]]).
venue_lookup = defaultdict(list)
for venue, venue_rows in vj_df_name_nn.groupby('venue'):
    venue_lookup[venue].extend(venue_rows['journal_name'].unique().tolist())

In [105]:
# Persist the venue -> journal-names lookup to disk, one venue per row.
venue_lookup_series = pd.Series(venue_lookup)
venue_lookup_series.to_csv('journal_lookup.csv')

In [33]:
# How many rows have an empty-string venue.
corpus_sample_raw_df['venue'].eq("").sum()

In [106]:
# Fraction of null entries per column: null count divided by the row count.
null_counts_per_column = corpus_sample_raw_df.isnull().sum()
null_counts_per_column / len(corpus_sample_raw_df)

# missing values

## dropping meaningless columns

In [129]:
# Keep a handle on the full column index and display it for reference.
all_cols = corpus_sample_raw_df.columns
all_cols

In [130]:
# Raw columns to retain. Each name appears once: a repeated name would yield a
# duplicated column when this list is used to select from the frame.
# NOTE(review): 'paperAbstract' was previously listed twice -- possibly a
# different third column (e.g. a title field) was intended; confirm.
keep_cols = ['bodyText', 'paperAbstract']
# Columns derived in this notebook rather than read from the corpus.
synth_cols = ['number_authors']

In [120]:
# Preview of the frame with "empty" placeholders turned into NaN. The result is
# only displayed, not assigned, so corpus_sample_raw_df itself is unchanged.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): whether the [] entry actually matches empty-list cells depends
# on the pandas version -- verify on real data before relying on it.
corpus_sample_raw_df.replace(['', []], np.nan)

# Exploring features

In [138]:
# Histogram of the per-row author count in 30 bins, with log=True passed to the
# plotting call. The result is bound to `_` so the cell shows only the figure.
_ = corpus_sample_raw_df['number_authors'].hist(bins=30, log=True)