# Table of Contents
* [Loading data](#Loading-data)
* [derived columns](#derived-columns)
	* [creating target_labels](#creating-target_labels)
* [Venue/Journal info](#Venue/Journal-info)
* [missing values](#missing-values)
	* [dropping bad columns](#dropping-bad-columns)
* [Exploring features](#Exploring-features)


In [141]:
%%capture
from __future__ import division
import numpy as np
import pandas as pd
import scipy.stats as st
import itertools
import math
from collections import Counter, defaultdict
%load_ext autoreload
%autoreload 2

import json
import glob

In [196]:
%%capture
import matplotlib as mpl
mpl.use("Agg")
import matplotlib.pylab as plt
#%matplotlib notebook
%matplotlib inline
%load_ext base16_mplrc
%base16_mplrc light default
# Default figure size (width, height in inches) for every plot in this notebook.
plt.rcParams['figure.figsize'] = (16.0, 10.0)
import seaborn as sns
# Switch matplotlib to seaborn's default styling.
# NOTE(review): some seaborn releases also touch figure-size rcParams inside
# set(); confirm the size configured above is still in effect afterwards.
sns.set()

# Loading data

In [13]:
# Directory that holds the sample files; the loading cell globs everything in it.
sample_data_dir = './data/corpus_1_pct_sample/'
# Path of one individual file inside that directory.
test_file = sample_data_dir + 'part-00000'

In [18]:
# Start from an empty frame; the loading cell below fills it with the parsed records.
corpus_sample_raw_df = pd.DataFrame()

In [20]:
# Load every JSON-lines file of the sample into one DataFrame.
#
# The row offset lives OUTSIDE the file loop so that row labels keep increasing
# across files instead of restarting at 0 for each one (which produced
# duplicate index labels).
running_idx = 0
shard_frames = []
# sorted() makes the file order, and therefore the row labels, reproducible;
# glob returns paths in arbitrary order.
for shard_path in sorted(glob.glob(sample_data_dir + '*')):
    with open(shard_path, 'r') as f:
        json_lines = f.readlines()
    if not json_lines:
        # Nothing to parse; an empty file contributes no rows.
        continue
    # One JSON document per line, keyed by its global row number.
    record_by_index = {running_idx + idx: json.loads(record) for idx, record in enumerate(json_lines)}
    running_idx += len(json_lines)
    shard_frames.append(pd.DataFrame.from_dict(record_by_index, orient='index'))

# Concatenate once at the end: DataFrame.append re-copied the accumulated frame
# on every call and no longer exists in pandas >= 2.0.
# When no file was found the previously initialised (empty) frame is kept.
if shard_frames:
    corpus_sample_raw_df = pd.concat(shard_frames)

In [22]:
# (number of rows, number of columns) of the loaded sample.
corpus_sample_raw_df.shape

In [24]:
# Alphabetical listing of the available column names.
sorted(corpus_sample_raw_df.columns)

# derived columns

In [205]:
# Number of entries in each row's 'authors' value.
corpus_sample_raw_df['author_len'] = corpus_sample_raw_df['authors'].apply(len)

## creating target_labels

In [161]:
def assign_fos(venue):
    """Map a venue name to the label 'NS' or 'CS'.

    Parameters
    ----------
    venue : object
        Venue value of one record. It is compared by exact, case-sensitive
        equality against the two hard-coded venue lists below.

    Returns
    -------
    str or float
        'NS' if the venue is in the NS list, 'CS' if it is in the CS list,
        otherwise NaN (so unlabeled rows can later be removed with dropna).
    """
    NS_venues = (
        'The Journal of neuroscience : the official journal of the Society for Neuroscience',
        'NEUROIMAGE',
        'Frontiers in Neuroscience',
    )
    CS_venues = ('HICSS', 'CVPR', 'SIGGRAPH')
    if venue in NS_venues:
        return 'NS'
    if venue in CS_venues:
        return 'CS'
    # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
    return np.nan

In [206]:
# Label every row from its venue; venues outside assign_fos's lists become NaN.
corpus_sample_raw_df['FOS'] = corpus_sample_raw_df['venue'].apply(assign_fos)

In [207]:
# Rows per label (value_counts leaves NaN out by default, so unlabeled rows are not counted).
corpus_sample_raw_df['FOS'].value_counts()

# Venue/Journal info

In [134]:
# Number of distinct values in the 'venue' column.
len(corpus_sample_raw_df['venue'].unique())

In [142]:
# The 15 most frequent venues with their row counts.
corpus_sample_raw_df['venue'].value_counts().head(15)

In [53]:
# Venue/journal pairs, keeping only rows where neither value is missing.
vj_df = corpus_sample_raw_df[['venue', 'journal']].dropna()

In [59]:
# Boolean mask: True where the row's 'journal' value has a 'name' key.
journal_has_name_key = vj_df['journal'].apply(lambda journal: 'name' in journal.keys())
vj_df_hasname = vj_df[journal_has_name_key]

In [66]:
%%capture
# Pull the 'name' entry out of each journal value into its own column.
# assign() builds a new frame instead of writing into vj_df_hasname in place;
# vj_df_hasname is a boolean-filtered selection of vj_df, and in-place column
# assignment on such a selection raises pandas' SettingWithCopyWarning.
vj_df_hasname = vj_df_hasname.assign(
    journal_name=vj_df_hasname['journal'].apply(lambda journal: journal['name'])
)

In [81]:
# Keep only rows where both the journal name and the venue are non-empty.
# Both conditions are combined into ONE mask evaluated on the same frame.
# Chaining df[mask_a][mask_b] applies a mask built on the unfiltered frame to
# an already-filtered one, which forces pandas to realign the mask (with a
# UserWarning) and can fail outright when row labels are duplicated.
nonempty_journal = vj_df_hasname['journal_name'] != ''
nonempty_venue = vj_df_hasname['venue'] != ''
vj_df_name_nn = vj_df_hasname.loc[nonempty_journal & nonempty_venue, ['venue', 'journal_name']]

In [109]:
# Map each venue to the distinct journal names that occur with it.
# NOTE(review): groupby yields each venue once and the whole list of names is
# append()-ed as a single element, so every value is a one-element list that
# wraps the list of names ([[...]]). Confirm that this nesting is intended.
venue_lookup = defaultdict(list)
for venue, journal_df in vj_df_name_nn.groupby('venue'):
    distinct_names = journal_df['journal_name'].unique().tolist()
    venue_lookup[venue].append(distinct_names)

In [105]:
# Write the venue -> journal-name lookup to 'journal_lookup.csv' (relative to the working directory).
pd.Series(venue_lookup).to_csv('journal_lookup.csv')

# missing values

In [33]:
# Number of rows whose venue is the empty string (these are not NaN, so isnull() misses them).
corpus_sample_raw_df['venue'].eq("").sum()

In [106]:
# Fraction of missing (null) values per column: the mean of a boolean mask
# is the share of True entries.
corpus_sample_raw_df.isnull().mean()

## dropping bad columns

In [226]:
# Snapshot of the raw frame's column names at the time this cell runs.
all_cols = list(corpus_sample_raw_df.columns)
# Raw text columns to keep.
keep_cols = ['bodyText', 'paperAbstract', 'title']
# Columns that were derived earlier in this notebook.
synth_cols = ['author_len', 'FOS']

In [209]:
# Build the processed frame: everything in all_cols except the text columns
# (keep_cols) and the derived columns (synth_cols) is removed.
wanted_cols = keep_cols + synth_cols
unwanted_cols = [col for col in all_cols if col not in wanted_cols]
# drop() returns a new frame, so the raw frame stays untouched and no explicit
# copy is needed. axis is passed by keyword because pandas >= 2.0 rejects the
# positional form drop(col, 1).
corpus_sample_proc_df = corpus_sample_raw_df.drop(unwanted_cols, axis=1)

In [210]:
# Keep only rows with no missing value in any remaining column. Because 'FOS'
# is NaN for venues outside assign_fos's lists, rows without a label are
# removed here as well (as long as 'FOS' is among the retained columns).
corpus_sample_labeled_df = corpus_sample_proc_df.dropna()
corpus_sample_labeled_df.shape

In [212]:
# Preview the labeled rows. labeled_df is only created in a later cell, so
# referring to it here raises NameError when the notebook runs top to bottom;
# the frame built in the previous cell is previewed instead.
corpus_sample_labeled_df.head(2)

In [None]:
# NOTE(review): this cell has no execution count, and labeled_df is not yet
# defined at this point when the notebook runs top to bottom (it is assigned
# in a later cell). That later section also recomputes 'title_len' as a
# log-length, so this cell looks like a leftover; confirm before relying on it.
labeled_df['title_len'] = labeled_df['title'].apply(lambda x: len(x))

# Exploring features

In [238]:
# Work on a new frame in which every empty-string cell is replaced by '0'
# (presumably so the length-based features computed next never see a
# zero-length string; TODO confirm).
labeled_df = corpus_sample_labeled_df.replace(to_replace='', value='0')
# Boolean target: True for rows labeled 'CS', False otherwise.
labeled_df['target'] = labeled_df['FOS'].eq('CS')
# _ = labeled_df['author_len'].hist(log = False, bins = 30)

In [239]:
# Log-length feature for each text column. len() of an empty string is 0 and
# np.log(0) is -inf; the cell that builds labeled_df replaces '' beforehand.
for feature_col, text_col in (('title_len', 'title'),
                              ('abs_len', 'paperAbstract'),
                              ('body_len', 'bodyText')):
    labeled_df[feature_col] = labeled_df[text_col].apply(lambda text: np.log(len(text)))

# Single text field: title, abstract and body joined by one space each.
labeled_df['all_text'] = (
    labeled_df['title']
    .str.cat(labeled_df['paperAbstract'], sep=' ')
    .str.cat(labeled_df['bodyText'], sep=' ')
)

# The raw text columns are no longer needed once the features exist.
# axis is passed by keyword: pandas >= 2.0 rejects the positional form
# drop(col, 1). All columns are removed in one call instead of one per loop turn.
labeled_df.drop(keep_cols, axis=1, inplace=True)

In [240]:
# Two xkcd colour names used to build the palette for the 'target' hue.
colors = ["windows blue", "amber"]
my_pal = sns.xkcd_palette(colors)
# Pair plot of labeled_df coloured by 'target'; the result is bound to _ so the
# returned object is not echoed as the cell's output.
_ = sns.pairplot(labeled_df, hue = 'target', palette = my_pal)

# End 

In [None]:
# corpus_sample_raw_df.replace(['',[]], np.NaN)