# Tuesday Code Challenge

Below is a code snippet that reads in a JSON file containing data extracted from academic papers as a Pandas DataFrame 

In [59]:
import pandas as pd
import json


with open('./documents.json') as f:
    df = pd.DataFrame(json.load(f)).T.drop(columns = 'emails')

df.head()

Unnamed: 0,contents,filename,institutions,people,places
Navigation to Small Bodies,"See discussions, stats, and author profiles fo...",txt_files/Navigation to Small Bodies.txt,"[IEEE Aerospace Conference, Arizona State Univ...","[Raviteja Nallapu, Jekan Thangavelautham, gdek...","[Arizona, Tucson, AZ, Arizona, Tucson, AZ, AZ ..."
ASTRONOMICAL ENGINEERING,ASTRONOMICAL ENGINEERING: A STRATEGY FOR MODIF...,txt_files/ASTRONOMICAL ENGINEERING.txt,[ORBITSD.G. KORYCANSKY CODEP Dept Earth Scienc...,"[CA 95064, GREGORY LAUGHLIN, gpl@acetylene.arc...","[Santa Cruz, CA 94035, U.S.A., Ann Arbor, MI 4..."
Phase II of the Main Belt Asteroid Spectrosopic Survey,"Icarus 158, 146�177 (2002) doi:10.1006/icar.20...",txt_files/Phase II of the Main Belt Asteroid S...,"[Planetary Sciences, Massachusetts Institute o...","[Icarus 158, Richard P. BinzelDepartment, doi,...","[Cambridge, Massachusetts, Hilo, albedos, albe..."
Devlopment of Xenon Hall Thrusters,NASA/CR--2004-213099https://ntrs.nasa.gov/sear...,txt_files/Devlopment of Xenon Hall Thrusters.txt,"[NASA, Characterization of High-Efficiency, Mi...","[Richard R., Thermalized, Gaussmeter.............","[Ann Arbor, Michigan, Ann Arbor, MD, VA, Wien,..."
Mine planning for Asteroid Ore Bodies,Space Resources Roundtable II (2000)7030.pdfMI...,txt_files/Mine planning for Asteroid Ore Bodie...,"[Michigan Technological University, Mining Eng...","[L. S. Gertsch1, R. E. Gertsch2, L. S. Gertsch...","[Houghton, MI, Friable Rock, Hard Rock, Univ, ..."


## 1. Create a new column containing the tokenized contents of each paper

### Tokens should
- Be all lower case characters
- Contain only alpha numeric characters
- Be stored as a list

In [60]:
import spacy
from spacy.tokenizer import Tokenizer
import re

In [61]:
nlp=spacy.load('en_core_web_lg')
tokenize=Tokenizer(nlp.vocab)

In [63]:
tokens=[]
for doc in tokenize.pipe(df['contents']):
    doctokens=[]
    STOP_WORDS=nlp.Defaults.stop_words
    for token in doc:
        if (re.sub(r'[^a-zA-Z0-9]', '', token.text) not in STOP_WORDS):
            if (token.is_punct==False) & (token.text != ''):
                doctokens.append(re.sub(r'[^a-zA-Z0-9]', '', token.text.lower()))
    tokens.append(doctokens)
df['tokens']=tokens  

In [64]:
re.sub(r'[^a-zA-z0-9]', '', tokens[0][6])

'httpswwwresearchgatenetpublication323600217navigating'

In [65]:
df['tokens'][0]

['see',
 'discussions',
 'stats',
 'author',
 'profiles',
 'publication',
 'httpswwwresearchgatenetpublication323600217navigating',
 'smallbodies',
 'using',
 'small',
 'satellitesarticle',
 'ieee',
 'aerospace',
 'conference',
 'proceedings',
 '',
 'march',
 '2018citations07',
 'authors',
 'includingraviteja',
 'nallapu',
 'arizona',
 'state',
 'university',
 '15',
 'publications',
 '34',
 'citationssee',
 'profilee',
 'asphaug',
 'arizona',
 'state',
 'university',
 '384',
 'publications',
 '6733',
 'citationssee',
 'profilereads55pranayreddy',
 'gankidi',
 'arizona',
 'state',
 'university',
 '6',
 'publications',
 '5',
 'citationssee',
 'profilesome',
 'authors',
 'publication',
 'working',
 'related',
 'projects',
 'restoring',
 'dawn',
 'framing',
 'camera',
 'multiband',
 'data',
 'vesta',
 'full',
 'spatial',
 'photometric',
 'accuracy',
 'view',
 'projectcubesat',
 'technology',
 'view',
 'projectall',
 'content',
 'following',
 'page',
 'uploaded',
 'jekan',
 'thangavelautham

## 2. Using regular expressions, create a new column containing the emails from each paper

In [66]:
for doc in tokenize.pipe(df['people']):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-66-eb28e49e8bf1>, line 2)

## 3. Using Spacy, create a new column containing the lemmas from each paper (this might take a second)

In [67]:
lemmas=[]
for doc in tokenize.pipe(df['contents']):
    doctokens=[]
    STOP_WORDS=nlp.Defaults.stop_words
    for token in doc:
        if (re.sub(r'[^a-zA-Z0-9]', '', token.text) not in STOP_WORDS):
            if (token.is_punct==False) & (token.text != ''):
                doctokens.append(re.sub(r'[^a-zA-Z0-9]', '', token.lemma_.lower()))
    lemmas.append(doctokens)
df['lemmas']=lemmas

## 4. Using Spacy, create a column containing the 10 most common words in each paper after stopwords are removed

### Stretch: Remove more common stopwords

In [70]:
from collections import Counter 

def count(x):
    counter=Counter() 
    counter.update(x)
    return counter.most_common(10)

df['wordCounts']=df['lemmas'].apply(count)