# scrape data
For the fine-tuning step below to work, the data must be saved to a csv file (`titles.csv`) with one piece of text (e.g. one paper title) on each line.

**scrape arxiv for titles**

In [None]:
import arxivscraper as ax
import numpy as np

# Alternative scraper configurations (swap one in for the active scraper below):
#
# # scraper for arxiv stat.ml
# scraper = ax.Scraper(category='stat', date_from='2017-08-01',
#                      date_until='2019-07-01', t=10,
#                      filters={'categories':['stat.ml'],'abstract':['learning']})
#
# # scraper for arxiv q-bio
# scraper = ax.Scraper(category='q-bio', date_from='2016-08-01',
#                      date_until='2019-07-01', t=10,
#                      filters={'categories':['q-bio.GN', 'q-bio.NC']})

# scraper for arxiv physics
scraper = ax.Scraper(category='physics', date_from='2019-05-01',
                     date_until='2019-07-03', t=10,
                     filters={'categories':['quant-ph']})

output = scraper.scrape()


# cols = ('id', 'title', 'categories', 'abstract', 'doi', 'created', 'updated', 'authors')
# collapse runs of whitespace (including newlines) so each title sits on exactly one line
titles = [' '.join(o['title'].split()) for o in output]

# one title per line; utf-8 is set explicitly because np.savetxt defaults to latin1,
# which raises UnicodeEncodeError on any title containing a non-latin1 character
np.savetxt('titles.csv', np.array(titles), fmt='%s', encoding='utf-8')

**alternatively, scrape something else**

In [3]:
import urllib
import numpy as np

# `import urllib` alone does not guarantee that the `request` submodule is loaded
import urllib.request

# scrape some interesting quotes
url = 'https://raw.githubusercontent.com/akhiltak/inspirational-quotes/master/Quotes.csv'

# read the whole body, then release the connection
with urllib.request.urlopen(url) as resp:
    response = resp.read().decode()

# characters removed from every quote
strip_chars = str.maketrans('', '', "'*#%&")

# The quote is the first ';'-separated field of each row.
# The first row is skipped (it was dropped by the original code as well), and
# blank lines are ignored so a missing trailing newline no longer loses the last quote.
lines = response.split('\n')
quotes = [
    line.split(';')[0].translate(strip_chars)
    for line in lines[1:]
    if line.strip()
]

# utf-8 is set explicitly because np.savetxt defaults to latin1
np.savetxt('titles.csv', np.array(quotes), fmt='%s', encoding='utf-8')

**e.g. you could scrape tweets (requires Twitter API credentials)**

In [25]:
import tweepy #https://github.com/tweepy/tweepy
import csv
import pandas as pd
import re

import os

#Twitter API credentials
# Read from environment variables so real secrets are never saved in the notebook.
# Each value falls back to "" (the previous placeholder) when its variable is unset.
consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")


def get_all_tweets(screen_name):
    """Download a user's timeline and write it to ``new_{screen_name}_tweets.csv``.

    Twitter only allows access to a user's most recent 3240 tweets with this method.
    Tweets are fetched in pages of 200, walking backwards with ``max_id`` until an
    empty page is returned. The csv has the columns ``id``, ``created_at``, ``text``.
    If the timeline is empty, a header-only csv is written.

    Args:
        screen_name: Twitter handle (without the leading '@') to download.

    Returns:
        None. The result is the csv file written to the current directory.
    """
    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    #initialize a list to hold all the tweepy Tweets
    alltweets = []

    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name = screen_name,count=200)

    #save most recent tweets
    alltweets.extend(new_tweets)

    #keep grabbing tweets until there are no tweets left to grab
    #(an empty first page skips the loop instead of raising IndexError on alltweets[-1])
    while len(new_tweets) > 0:
        #id of the oldest tweet fetched so far, less one
        oldest = alltweets[-1].id - 1
        print(f"getting tweets before {oldest}")

        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)

        #save most recent tweets
        alltweets.extend(new_tweets)

        print(f"...{len(alltweets)} tweets downloaded so far")

    #transform the tweepy tweets into a 2D array that will populate the csv
    outtweets = [[tweet.id_str, tweet.created_at, tweet.text] for tweet in alltweets]

    #write the csv
    #newline='' is what the csv module expects (avoids blank rows on Windows);
    #utf-8 so emoji / non-ascii tweet text does not depend on the platform default encoding
    with open(f'new_{screen_name}_tweets.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["id","created_at","text"])
        writer.writerows(outtweets)

def clean_csv(fname='new_SICKOFWOLVES_tweets.csv', out_fname='data/wolves_tweets.csv'):
    """Filter a tweets csv down to clean text and write it to ``out_fname``.

    Reads the ``text`` column of ``fname`` and then:
      * drops rows with no text,
      * drops tweets containing 'STORE',
      * drops tweets starting with '@',
      * cuts each remaining tweet at its first 'https://' url.

    Args:
        fname: path of the input csv; must contain a ``text`` column.
        out_fname: path of the output csv. Its parent directory is created
            if it does not exist.

    Returns:
        None. The cleaned text column is written to ``out_fname``.
    """
    import os  # local import: only needed to create the output directory

    df = pd.read_csv(fname)

    # work on the text column alone; missing values would otherwise turn the
    # boolean masks below into NaN and make `~` fail
    text = df['text'].dropna().astype(str)

    keep = ~text.str.contains('STORE') & ~text.str.startswith('@')

    # remove urls: keep only what precedes the first 'https://'
    # (raw string: the old '\/' escapes are invalid in a normal string literal)
    text = text[keep].apply(lambda x: re.split(r'https://.*', x)[0])

    out_dir = os.path.dirname(out_fname)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    text.to_csv(out_fname, index=False)
# Uncomment to (re)download the timeline first (needs Twitter API credentials):
# get_all_tweets("SICKOFWOLVES")

# clean the previously downloaded csv
clean_csv(fname='new_SICKOFWOLVES_tweets.csv')

# finetune gpt2
this code will download gpt2 and finetune it on the file titles.csv, generating samples at intermediate steps

In [None]:
import gpt_2_simple as gpt2

import os

model_name = "117M"
# NOTE(review): newer gpt-2-simple releases appear to name this model "124M" — confirm against the installed version

# only download when the model is not already on disk, so re-running the cell
# does not fetch the weights again
if not os.path.isdir(os.path.join("models", model_name)):
    gpt2.download_gpt2(model_name=model_name)   # model is saved into current directory under /models/117M/

sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              'titles.csv',
              model_name=model_name,
              steps=1000,
              save_every=200,
              sample_every=25)   # steps is max number of training steps

gpt2.generate(sess)

# look at some samples
the samples are saved to the 'samples' folder by default

In [None]:
sample_file = 'samples/samples-901'

# read the whole sample file; `with` closes the handle right away
with open(sample_file, 'r') as f:
    t = f.read()

# strip the delimiter tokens and their brackets
for s in ['endoftext', 'startoftext', '<|', '|>']:
    t = t.replace(s, '')

# title-case the text, skip its first line, and print every non-empty line as a bullet
for title in t.title().split('\n')[1:]:
    if title:
        print('- ' + title)

# generating new samples from the finetuned model

In [None]:
import gpt_2_simple as gpt2
# start a TensorFlow session and load a saved gpt2 checkpoint into it
# (presumably the run written by the finetuning cell — confirm the default checkpoint location)
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess)

**generate one sample**

In [23]:
prefix = 'neural' # None is default
text = gpt2.generate(sess,
              length=40,
              temperature=0.7,
              prefix=prefix,   # pass the variable above (the bare name `neural` was undefined)
              nsamples=1,
              batch_size=1,
              return_as_list=True
             )


t = text[0].title()
t = t.replace('<|Startoftext|>', '').replace('\n', '') # remove extraneous stuff
# only get one title; split() keeps the whole text when no end token was generated,
# whereas str.index() would raise ValueError
t = t.split('<|Endoftext|>')[0]
print(t)

Neural Source Separation Via Non-Negative Eigenvector Field Variate Operator


**generate a bunch of samples**

In [None]:
text = gpt2.generate(sess,
#               length=40,
              temperature=0.7,
              prefix=None,
              nsamples=100,
              batch_size=1,
              return_as_list=True
             )


# print one title per generated sample (previously only text[0] was shown)
for sample in text:
    t = sample.title()
    t = t.replace('<|Startoftext|>', '').replace('\n', '') # remove extraneous stuff
    # only get one title per sample; split() tolerates a missing end token
    t = t.split('<|Endoftext|>')[0]
    if t:
        print(t)